In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [None]:
# Read the admissions dataset from the CSV that sits next to the notebook
# and show the first rows as a quick sanity check.
csv_path = 'Admission_predict.csv'
admission_df = pd.read_csv(csv_path)

preview_rows = admission_df.head()
print(preview_rows)

In [None]:
# Remove the 'Serial No.' column (presumably just a row identifier).
# Rebinding the name instead of using inplace=True, together with
# errors='ignore', keeps this cell idempotent: running it a second time no
# longer raises KeyError because the column has already been removed.
admission_df = admission_df.drop(columns='Serial No.', errors='ignore')

# Number of missing values per remaining column.
print(admission_df.isnull().sum())

In [None]:
# DataFrame.info() writes its report directly to stdout and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
admission_df.info()

In [None]:
# Summary statistics for every column of the dataset.
summary_stats = admission_df.describe()
print(summary_stats)

Grouping based on university ratings

In [None]:
# Mean of every column within each 'University Rating' group.
rating_groups = admission_df.groupby('University Rating')
univ_df = rating_groups.mean()
print(univ_df)

Truncating CGPA to its integer part and grouping by it

In [None]:
# Work on a separate frame so admission_df keeps the exact CGPA values.
# Casting to int truncates the fractional part, which buckets CGPA by its
# whole-number value.
cgparange_df = admission_df.assign(CGPA=admission_df['CGPA'].astype(int))

# Column means inside each integer CGPA bucket.
cgpa_bucket_means = cgparange_df.groupby('CGPA').mean()
print(cgpa_bucket_means)

Histograms

In [None]:
# (column, number of bins, figure size) for each histogram to draw.
hist_specs = (
    ('CGPA', 5, (5, 5)),
    ('GRE Score', 10, (7, 7)),
)
for col_name, n_bins, fig_size in hist_specs:
    admission_df.hist(column=col_name, bins=n_bins, figsize=fig_size, color='red')

# Pairwise scatter plots of selected predictors against TOEFL score and SOP.
sns.pairplot(
    admission_df,
    x_vars=('GRE Score', 'University Rating', 'CGPA'),
    y_vars=('TOEFL Score', 'SOP'),
)

Correlation matrix

In [None]:
# Open a large figure first so the annotated heatmap cells stay readable.
plt.figure(figsize=(15, 15))

# Pairwise correlation between all columns, drawn with the coefficient
# written inside every cell.
corr_matrix = admission_df.corr()
sns.heatmap(corr_matrix, annot=True)
plt.show()

In [10]:
# Features: every column except the last one, as a NumPy array.
X = np.array(admission_df.iloc[:, :-1])

# Target: the last column, reshaped into a column vector (n_samples, 1)
# because the scaler applied later expects 2-D input.
y = np.array(admission_df.iloc[:, -1]).reshape(-1, 1)

Scaling data

In [11]:
from sklearn.preprocessing import StandardScaler
# One scaler per array so that each can be inverted independently later on.
scalerx, scalery = StandardScaler(), StandardScaler()

# NOTE(review): both scalers are fitted on the full data set, i.e. before the
# train/test split. The test rows therefore influence the scaling statistics
# (data leakage). Consider splitting first and fitting on the training part.
X = scalerx.fit_transform(X)
y = scalery.fit_transform(y)

Splitting data into train and test set

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
split_parts = train_test_split(X, y, test_size=0.15, random_state=1)
X_train, X_test, y_train, y_test = split_parts

Training and evaluating a linear regression model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Fit the linear baseline on the training split (fit() returns the
# estimator itself, so construction and fitting can be chained).
LinerRegression_model = LinearRegression().fit(X_train, y_train)

# For regressors, score() reports R^2 on the given data.
accuracy_LinearRegression = LinerRegression_model.score(X_test, y_test)
print(accuracy_LinearRegression)

Training and evaluating an ANN model

In [None]:
import tensorflow as tf


# Fully connected regression network: 50 -> 150 -> 150 -> 50 -> 1.
ANN_model = tf.keras.models.Sequential()

# Input width is taken from the training data instead of being hard-coded.
ANN_model.add(tf.keras.layers.Dense(50, input_dim=X_train.shape[1], activation='relu'))

# The Dropout layers were previously constructed and immediately discarded,
# so the network was trained without any dropout. They have to be added to
# the model to take effect.
ANN_model.add(tf.keras.layers.Dense(150, activation='relu'))
ANN_model.add(tf.keras.layers.Dropout(0.5))

ANN_model.add(tf.keras.layers.Dense(150, activation='relu'))
ANN_model.add(tf.keras.layers.Dropout(0.5))

ANN_model.add(tf.keras.layers.Dense(50, activation='linear'))
# Single linear output unit for the (scaled) regression target.
ANN_model.add(tf.keras.layers.Dense(1))

ANN_model.compile(loss='mse', optimizer='adam')
ANN_model.summary()



from sklearn.metrics import r2_score

# Train the network; the returned History object holds the per-epoch loss.
epochs_hist = ANN_model.fit(X_train, y_train, epochs=100, batch_size=20)

# Predictions on the held-out set (in scaled target units). Only the first
# few rows are shown instead of dumping the whole array.
y_pred_ann = ANN_model.predict(X_test)
print(y_pred_ann[:5])

# Test loss (MSE, as configured in compile()).
result = ANN_model.evaluate(X_test, y_test)

# Report R^2 so the number is comparable with the .score() values printed
# for the scikit-learn models. The previous "1 - MSE" only approximates R^2
# when the variance of y_test happens to be exactly 1.
accuracy_ANN = r2_score(y_test, y_pred_ann)

print("Accuracy :{}".format(accuracy_ANN))

# Training loss recorded by Keras after every epoch.
loss_per_epoch = epochs_hist.history['loss']

plt.plot(loss_per_epoch)
plt.title('model loss during training')
plt.xlabel('epochs')
plt.ylabel('training loss')

Training and evaluating decision trees and random forest regressors

In [None]:
from sklearn.tree import DecisionTreeRegressor

# A fixed random_state makes the fitted tree (and therefore the score below)
# reproducible across Restart & Run All.
DecisionTree_model = DecisionTreeRegressor(random_state=1)
DecisionTree_model.fit(X_train, y_train)

# For regressors, score() reports R^2 on the given data.
accuracy_DecisionTree = DecisionTree_model.score(X_test, y_test)
print(accuracy_DecisionTree)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# A fixed random_state makes the bootstrap samples, and therefore the score,
# reproducible between runs.
RandomForest_model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=1)

# y_train is a column vector (n_samples, 1); the forest expects a 1-D target
# and emits a DataConversionWarning otherwise, so flatten it for fitting.
RandomForest_model.fit(X_train, y_train.ravel())

# For regressors, score() reports R^2 on the given data.
accuracy_RandomForest = RandomForest_model.score(X_test, y_test)
print(accuracy_RandomForest)

Calculating regression model KPIs

In [None]:
# Predictions of the linear model on the held-out set, in scaled target units.
y_pred = LinerRegression_model.predict(X_test)

# We want to plot in the original units, not in the scaled units.
# The inverse transform must use `scalery`, the scaler that was fitted on the
# target. Fitting a brand-new scaler on the predictions (as was done before)
# learns the mean/std of the *scaled predictions* and maps values to an
# unrelated scale instead of back to the original one.
y_pred_orig = scalery.inverse_transform(y_pred)
y_test_orig = scalery.inverse_transform(y_test)

# Separate axes for the two plots so scaled and original units are not mixed
# on a single figure.
fig, (ax_scaled, ax_orig) = plt.subplots(1, 2, figsize=(12, 5))

ax_scaled.plot(y_test, y_pred, '^', color='r')
ax_scaled.set(title='Scaled units', xlabel='actual', ylabel='predicted')

# In original units the target is the chance of admission (values between 0 and 1).
ax_orig.plot(y_test_orig, y_pred_orig, '^', color='r')
ax_orig.set(title='Original units', xlabel='actual', ylabel='predicted')

plt.show()

In [None]:
# n = number of test samples (rows), k = number of features (columns);
# both are needed for the adjusted R^2 computed further down.
n, k = X_test.shape
print(k)
print(n)

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt

# Error metrics between actual and predicted values (original-unit arrays).
MSE = mean_squared_error(y_test_orig, y_pred_orig)
MAE = mean_absolute_error(y_test_orig, y_pred_orig)

# RMSE is reported rounded to three decimals.
RMSE = float(f"{np.sqrt(MSE):.3f}")

# Coefficient of determination and its adjusted variant, which penalises
# the number of predictors k relative to the sample size n.
r2 = r2_score(y_test_orig, y_pred_orig)
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - k - 1)

print(
    'RMSE =', RMSE,
    '\nMSE =', MSE,
    '\nMAE =', MAE,
    '\nR2 =', r2,
    '\nAdjusted R2 =', adj_r2,
)