# Class 2 - Main Book

In [1]:
# Imports..
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso, LassoCV, RidgeCV, LarsCV, ElasticNet, ElasticNetCV, LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
import pickle 
import os

from pandas_profiling import ProfileReport

In [2]:
# PARAMS..
models_folder = '../MLModels'

data_path = '../raw_data/Admission_Prediction.csv'

In [3]:
# Read Data..
df = pd.read_csv(data_path)
df

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337.0,118.0,4.0,4.5,4.5,9.65,1,0.92
1,2,324.0,107.0,4.0,4.0,4.5,8.87,1,0.76
2,3,,104.0,3.0,3.0,3.5,8.00,1,0.72
3,4,322.0,110.0,3.0,3.5,2.5,8.67,1,0.80
4,5,314.0,103.0,2.0,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...,...
495,496,332.0,108.0,5.0,4.5,4.0,9.02,1,0.87
496,497,337.0,117.0,5.0,5.0,5.0,9.87,1,0.96
497,498,330.0,120.0,5.0,4.5,5.0,9.56,1,0.93
498,499,312.0,103.0,4.0,4.0,5.0,8.43,0,0.73


In [5]:
# Build a Model to calculate chance of Admit..
ProfileReport(df)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



# Steps we are taking:
  * Handle Missing Values 
  * Make Features and Labels.
      * Do Standardization Scaling
  * Check Multi-Colinearity (Calculate VIF)

## Handling Missing Values

In [6]:
# Handle Missing Values..

# Just Fill Mean now (Ideally we have to use Imputing, but as now, it's not available..)
df['GRE Score'] = df['GRE Score'].fillna(df['GRE Score'].mean())
df['TOEFL Score'] = df['TOEFL Score'].fillna(df['TOEFL Score'].mean())
df['University Rating'] = df['University Rating'].fillna(df['University Rating'].mean())

In [7]:
# Missing values are gone..
df.isnull().sum()

Serial No.           0
GRE Score            0
TOEFL Score          0
University Rating    0
SOP                  0
LOR                  0
CGPA                 0
Research             0
Chance of Admit      0
dtype: int64

In [8]:
# Drop Serial No.
df = df.drop(columns=['Serial No.'])
df

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,337.000000,118.0,4.0,4.5,4.5,9.65,1,0.92
1,324.000000,107.0,4.0,4.0,4.5,8.87,1,0.76
2,316.558763,104.0,3.0,3.0,3.5,8.00,1,0.72
3,322.000000,110.0,3.0,3.5,2.5,8.67,1,0.80
4,314.000000,103.0,2.0,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...
495,332.000000,108.0,5.0,4.5,4.0,9.02,1,0.87
496,337.000000,117.0,5.0,5.0,5.0,9.87,1,0.96
497,330.000000,120.0,5.0,4.5,5.0,9.56,1,0.93
498,312.000000,103.0,4.0,4.0,5.0,8.43,0,0.73


## Make Features and Labels

In [9]:
y = df['Chance of Admit']
X = df.drop(columns=['Chance of Admit'])

### Do Normalization / Standardization..

* If this is not done, the dataset will have a lot of variances between features.. it will be tough for the model to understand concrete relations.
* The Model may behave badly..

In [11]:
## TEST..
# Model Without Standard Scaling..
# Make X, Y..

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,random_state=100)
lr = LinearRegression()

lr_model = lr.fit(X_train, y_train)

print(df.iloc[0])
lr_model.predict([[337.00,118.0,4.0,4.5,4.5,9.65,1]])

GRE Score            337.00
TOEFL Score          118.00
University Rating      4.00
SOP                    4.50
LOR                    4.50
CGPA                   9.65
Research               1.00
Chance of Admit        0.92
Name: 0, dtype: float64


array([0.95359739])

### Standard Scale

In [12]:
# Model with Standard Scaling..
def standard_scale_data(X):
    scaler = StandardScaler()
    arr = scaler.fit_transform(X)
    X_st = pd.DataFrame(arr)
    # Scaler is also a model, I can save it..
    pickle.dump(scaler,open(models_folder+'/lr_model_scaler.scl','wb'))
    return X_st, scaler

X_st, st_scaler = standard_scale_data(X)
X_st.set_axis(X.columns,axis=1,inplace=True)
X_st

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
0,1.842741e+00,1.788542,0.778906,1.137360,1.098944,1.776806,0.886405
1,6.708143e-01,-0.031058,0.778906,0.632315,1.098944,0.485859,0.886405
2,5.124333e-15,-0.527313,-0.107877,-0.377773,0.017306,-0.954043,0.886405
3,4.905178e-01,0.465197,-0.107877,0.127271,-1.064332,0.154847,0.886405
4,-2.306679e-01,-0.692731,-0.994659,-1.387862,-0.523513,-0.606480,-1.128152
...,...,...,...,...,...,...,...
495,1.392000e+00,0.134360,1.665688,1.137360,0.558125,0.734118,0.886405
496,1.842741e+00,1.623124,1.665688,1.642404,1.639763,2.140919,0.886405
497,1.211704e+00,2.119379,1.665688,1.137360,1.639763,1.627851,0.886405
498,-4.109644e-01,-0.692731,0.778906,0.632315,1.639763,-0.242367,-1.128152


In [13]:
# Post-Scaling Profiling Report
X_st.profile_report()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [14]:
X_st

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
0,1.842741e+00,1.788542,0.778906,1.137360,1.098944,1.776806,0.886405
1,6.708143e-01,-0.031058,0.778906,0.632315,1.098944,0.485859,0.886405
2,5.124333e-15,-0.527313,-0.107877,-0.377773,0.017306,-0.954043,0.886405
3,4.905178e-01,0.465197,-0.107877,0.127271,-1.064332,0.154847,0.886405
4,-2.306679e-01,-0.692731,-0.994659,-1.387862,-0.523513,-0.606480,-1.128152
...,...,...,...,...,...,...,...
495,1.392000e+00,0.134360,1.665688,1.137360,0.558125,0.734118,0.886405
496,1.842741e+00,1.623124,1.665688,1.642404,1.639763,2.140919,0.886405
497,1.211704e+00,2.119379,1.665688,1.137360,1.639763,1.627851,0.886405
498,-4.109644e-01,-0.692731,0.778906,0.632315,1.639763,-0.242367,-1.128152


In [15]:
# After Scaling, 
# Mean is almost 0,
# Variance is Approximately 1.
X_st.describe()

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,4.35052e-15,9.419132e-16,5.608847e-16,2.926548e-16,-1.3322680000000001e-17,3.091971e-15,-2.202682e-16
std,1.001002,1.001002,1.001002,1.001002,1.001002,1.001002,1.001002
min,-2.394225,-2.512331,-1.881441,-2.39795,-2.686789,-2.940115,-1.128152
25%,-0.681409,-0.692731,-0.9946589,-0.8828175,-0.5235128,-0.7430227,-1.128152
50%,5.124333e-15,-0.03105811,-0.1078766,0.1272712,0.01730621,-0.02720919,0.8864053
75%,0.6708143,0.796033,0.7789057,0.6323155,0.5581253,0.7672196,0.8864053
max,2.113186,2.119379,1.665688,1.642404,1.639763,2.223672,0.8864053


In [16]:
np.array(X_st)

array([[ 1.84274116e+00,  1.78854223e+00,  7.78905651e-01, ...,
         1.09894429e+00,  1.77680627e+00,  8.86405260e-01],
       [ 6.70814288e-01, -3.10581135e-02,  7.78905651e-01, ...,
         1.09894429e+00,  4.85859428e-01,  8.86405260e-01],
       [ 5.12433309e-15, -5.27312752e-01, -1.07876604e-01, ...,
         1.73062093e-02, -9.54042814e-01,  8.86405260e-01],
       ...,
       [ 1.21170361e+00,  2.11937866e+00,  1.66568791e+00, ...,
         1.63976333e+00,  1.62785086e+00,  8.86405260e-01],
       [-4.10964364e-01, -6.92730965e-01,  7.78905651e-01, ...,
         1.63976333e+00, -2.42366993e-01, -1.12815215e+00],
       [ 9.41258951e-01,  9.61451165e-01,  7.78905651e-01, ...,
         1.09894429e+00,  7.67219636e-01, -1.12815215e+00]])

# Check Multi-Colinearity
* Check VIF (Variance Inflation Factor)

In [17]:
# Do VIF..
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [18]:
vif_df_st = pd.DataFrame()

vif_df_st['Vif'] = [variance_inflation_factor(np.array(X_st),i) for i in range(X_st.shape[1])]
vif_df_st['Feature'] = X.columns

In [19]:
# Vif for Each Column..

vif_df_st

# Vif is Less than 10, dont drop any columns.. or do Dimensionality Reduction (PCA, t-SNE, etc)

Unnamed: 0,Vif,Feature
0,4.153268,GRE Score
1,3.792866,TOEFL Score
2,2.508768,University Rating
3,2.77575,SOP
4,2.037308,LOR
5,4.65167,CGPA
6,1.459311,Research


In [20]:
X_st

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
0,1.842741e+00,1.788542,0.778906,1.137360,1.098944,1.776806,0.886405
1,6.708143e-01,-0.031058,0.778906,0.632315,1.098944,0.485859,0.886405
2,5.124333e-15,-0.527313,-0.107877,-0.377773,0.017306,-0.954043,0.886405
3,4.905178e-01,0.465197,-0.107877,0.127271,-1.064332,0.154847,0.886405
4,-2.306679e-01,-0.692731,-0.994659,-1.387862,-0.523513,-0.606480,-1.128152
...,...,...,...,...,...,...,...
495,1.392000e+00,0.134360,1.665688,1.137360,0.558125,0.734118,0.886405
496,1.842741e+00,1.623124,1.665688,1.642404,1.639763,2.140919,0.886405
497,1.211704e+00,2.119379,1.665688,1.137360,1.639763,1.627851,0.886405
498,-4.109644e-01,-0.692731,0.778906,0.632315,1.639763,-0.242367,-1.128152


In [21]:
y

0      0.92
1      0.76
2      0.72
3      0.80
4      0.65
       ... 
495    0.87
496    0.96
497    0.93
498    0.73
499    0.84
Name: Chance of Admit, Length: 500, dtype: float64

# Train and Test Split

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X_st, y, test_size=0.25,random_state=63)

X_train

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
87,3.977674e-02,-0.031058,-0.994659,0.127271,-0.523513,-0.490626,-1.128152
110,-1.042002e+00,0.134360,1.665688,-0.377773,-0.523513,-0.159614,-1.128152
185,9.412590e-01,0.961451,0.778906,1.137360,1.098944,0.883074,0.886405
159,-1.763188e+00,-1.188986,-1.881441,-1.892906,-1.605151,-1.119549,-1.128152
237,1.121555e+00,1.126869,1.665688,1.137360,1.639763,1.015479,0.886405
...,...,...,...,...,...,...,...
338,5.124333e-15,0.134360,1.665688,0.632315,0.558125,0.270702,0.886405
215,1.211704e+00,1.457706,1.665688,1.642404,1.098944,1.296839,0.886405
139,1.299250e-01,0.299778,-1.881441,0.127271,0.017306,0.899624,-1.128152
116,-1.582891e+00,-0.858149,-0.107877,0.632315,0.017306,0.072094,-1.128152


### Define Calc scoring

In [23]:
def calc_scores(model,X,y,should_print=True):
    r2 = model.score(X,y)
    n = X.shape[0]
    p = X.shape[1]
    adj_r2 = 1-(1-r2)*(n-1) / (n-p-1)
    
    # R2 Score..
    if should_print:
        print('R2 Score: ',r2)
        # This is lil bit less than R2..
        print('Adj R2 Score: ',adj_r2)
    return r2, adj_r2

## Model 1: Linear Regression

In [24]:
def train_linear_regression(X_train, X_test, y_train, y_test):
    lr = LinearRegression()
    lr_model = lr.fit(X_train, y_train)
    calc_scores(lr_model,X_train,y_train)
    pickle.dump(lr_model,open(models_folder+'/lr_model.pkl','wb'))
    return lr_model

lr_model = train_linear_regression(X_train, X_test, y_train, y_test)

R2 Score:  0.8541118588684492
Adj R2 Score:  0.8513292512719346


In [26]:
# Do PRediction for 1.. first..
print(df.iloc[1])
lr_model.predict([[324.000000,107.0,4,4,4.5,8.87,1]])
# This is not close.. and is wrong..

GRE Score            324.00
TOEFL Score          107.00
University Rating      4.00
SOP                    4.00
LOR                    4.50
CGPA                   8.87
Research               1.00
Chance of Admit        0.76
Name: 1, dtype: float64


array([11.66178172])

In [27]:
# Do Standard Scaling and then Predict..

lr_model.predict(st_scaler.transform([[324.000000,107.0,4,4,4.5,8.87,1]]))

# This is Much Much Closer to the actual value.. 0.76..

array([0.80896633])

### Imp: Do whatever Transformations done with the Training Data, Do the same with the Testing Dataset..

In [28]:
# Now, predict everything..

y_preds = lr_model.predict(X_test)
y_preds

array([0.70767621, 0.67492158, 0.79634793, 0.778021  , 0.73874621,
       0.62537565, 0.9200858 , 0.68965261, 0.8604571 , 0.5763828 ,
       0.72266557, 0.55740979, 0.51736617, 0.66687986, 0.70549093,
       0.84779687, 0.7347685 , 0.62653358, 0.522743  , 0.80222785,
       0.84617252, 0.98363583, 0.69858717, 0.83807813, 0.79835355,
       0.59874728, 0.4963933 , 0.53048272, 0.6328076 , 0.73977083,
       0.57892138, 0.57598785, 0.77447613, 0.76621958, 0.78037093,
       0.62728043, 0.85690919, 0.84129302, 0.66495642, 0.65082834,
       0.83562705, 0.64868304, 0.93623698, 0.8958044 , 0.66742992,
       0.99001947, 0.60302237, 0.86553847, 0.77474064, 0.74563785,
       0.58090335, 0.94091779, 0.58869805, 0.91150984, 0.44864165,
       0.8831171 , 0.51728911, 0.80896633, 0.6295613 , 0.61523708,
       0.84465602, 0.67174504, 0.70885297, 0.846631  , 0.62989585,
       0.62000462, 0.84992604, 0.74850214, 0.55404091, 0.77439214,
       0.73043262, 0.72319575, 0.6419744 , 0.78343847, 0.62346

In [29]:
lr_model.coef_ , lr_model.intercept_

(array([0.02641535, 0.01572948, 0.00015332, 0.00709804, 0.01551194,
        0.06556026, 0.01406587]),
 0.7257595504842049)

## Do Regularization..

## Model 2: Lasso CV
### Using Lasso CV along with Linear Regression..

In [30]:
def train_lassocv_regression(X_train, X_test, y_train, y_test):
    lasso_cv = LassoCV(alphas=None,cv=50, max_iter=2000, normalize=True)
    # Lasso Model Building..
    lasso_cv.fit(X_train,y_train)
    calc_scores(lasso_cv,X_train,y_train)
    pickle.dump(lasso_cv,open(models_folder+'/lassocv_model.pkl','wb'))
    return lasso_cv

lasso_cv = train_lassocv_regression(X_train, X_test, y_train, y_test)

R2 Score:  0.8540292368308482
Adj R2 Score:  0.8512450533371586


In [31]:
lasso_cv.alpha_

5.121925809985907e-05

In [32]:
# Prediction..
y_pred_reg = lasso_cv.predict(X_test)
y_pred_reg

array([0.70813606, 0.67568022, 0.79599017, 0.77699107, 0.73845151,
       0.62679126, 0.91799966, 0.69019007, 0.85856965, 0.57770615,
       0.72339444, 0.55924003, 0.5183673 , 0.66722096, 0.70504543,
       0.84648296, 0.73488819, 0.62758029, 0.52505071, 0.80175488,
       0.84488071, 0.98158991, 0.69855421, 0.83738595, 0.79750366,
       0.59930282, 0.49756385, 0.53213646, 0.63302617, 0.73931457,
       0.58089648, 0.57585835, 0.7749972 , 0.76623977, 0.77991286,
       0.62892564, 0.85555945, 0.83995849, 0.66461707, 0.65095492,
       0.83455983, 0.64994193, 0.93481644, 0.89385348, 0.66775576,
       0.98842818, 0.60425628, 0.86434749, 0.77475385, 0.74488294,
       0.58131597, 0.93940083, 0.58990538, 0.91030112, 0.45118699,
       0.88163349, 0.5186067 , 0.807805  , 0.63074435, 0.61658189,
       0.84399803, 0.6728293 , 0.70870815, 0.84563069, 0.63003074,
       0.62110358, 0.84887311, 0.74871944, 0.55485999, 0.77351264,
       0.73035113, 0.72429457, 0.6434195 , 0.78270863, 0.62527

## Model 3: Using RidgeCV along with Linear Regression..

In [33]:
def train_ridgecv_regression(X_train, X_test, y_train, y_test):
    ridge_cv = RidgeCV(alphas=np.random.uniform(0,10,50),cv=10, normalize=True)
    # Ridge Model Building..
    ridge_cv.fit(X_train,y_train)
    calc_scores(ridge_cv,X_train,y_train)
    pickle.dump(ridge_cv,open(models_folder+'/ridgecv_model.pkl','wb'))
    return lasso_cv

ridge_cv = train_ridgecv_regression(X_train, X_test, y_train, y_test)

R2 Score:  0.852883578019349
Adj R2 Score:  0.8500775427227153


In [34]:
ridge_cv.alpha_

5.121925809985907e-05

In [35]:
## Pass in the Alpha calcualted with RidgeCV to Ridge.. some random values..
ridge_lr = Ridge(alpha=ridge_cv.alpha_)
ridge_lr.fit(X_train, y_train)

Ridge(alpha=5.121925809985907e-05)

In [36]:
# Score of both RidgeCV and Ridge..

ridge_cv.score(X_test,y_test) , ridge_lr.score(X_test,y_test)

(0.7263158397064022, 0.7267084869947422)

## Model 4: Using ElasticNetCV along with Linear Regression..

In [38]:
def train_elasticnetcv_regression(X_train, X_test, y_train, y_test):
    elasticnet_cv = ElasticNetCV(alphas=None,cv=10, normalize=True)

    # Ridge Model Building..
    elasticnet_cv.fit(X_train,y_train)
    calc_scores(elasticnet_cv,X_train,y_train)
    pickle.dump(elasticnet_cv,open(models_folder+'/elasticnetcv_model.pkl','wb'))
    return elasticnet_cv

elasticnet_cv = train_elasticnetcv_regression(X_train, X_test, y_train, y_test)

R2 Score:  0.8540849306943286
Adj R2 Score:  0.8513018094814139


In [39]:
# This is Lambda, wrt the formula..
elasticnet_cv.alpha_

2.9174856844613893e-05

In [40]:
## Pass in the Alpha calcualted with ElasticNetCV to ElasticNet.. some random values..

elasticnet_lr = ElasticNet(alpha=elasticnet_cv.alpha_)
elasticnet_lr.fit(X_train, y_train)

ElasticNet(alpha=2.9174856844613893e-05)

In [41]:
elasticnet_cv.alpha_

2.9174856844613893e-05

In [42]:
elasticnet_cv.l1_ratio_

0.5

In [43]:
elasticnet_cv.score(X_test,y_test), elasticnet_lr.score(X_test,y_test)

(0.7259717206795369, 0.7267001169565483)

In [45]:
# Trying out Random Seeds for 1000 LR Models, and finding the Best Seed..

best_seed = -1
best_r2 = 0
for i in range(1,1000):
    X_train, X_test, y_train, y_test = train_test_split(X_st, y, test_size=0.25,random_state=i)
    

    lr = LinearRegression()
    lr_model = lr.fit(X_train, y_train)
    _, adj_R2 = calc_scores(lr_model,X_train,y_train,should_print=False)
    if adj_R2 > best_r2:
        best_r2 = adj_R2
        best_seed = i

#lr_model = train_linear_regression(X_train, X_test, y_train, y_test)
best_r2, best_seed

(0.8513292512719346, 63)

In [46]:
# Hence, I am setting the seed above.. to 63.

In [48]:
predictors = {
        'Linear Regression' : 'predict_linear_regression'
    }

for pred in predictors:
    print(pred)

Linear Regression


In [50]:
type(st_scaler)

sklearn.preprocessing._data.StandardScaler