## CRSIP - DM Model
- Barry - Business Understanding
- Drove - Data Understanding
- Directly to the - Data Prep
- Medical - Modelling
- Emergency - Evaluation
- Department - mDeployment

# Import Dependencies

In [1]:
import pandas as pd

# Understand Data 

In [2]:
data = pd.read_csv('data/admission_data.csv')

In [3]:
data.head()

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,337,118,4,4.5,4.5,9.65,1,0.92
1,324,107,4,4.0,4.5,8.87,1,0.76
2,316,104,3,3.0,3.5,8.0,1,0.72
3,322,110,3,3.5,2.5,8.67,1,0.8
4,314,103,2,2.0,3.0,8.21,0,0.65


In [4]:
data.describe()

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,316.472,107.192,3.114,3.374,3.484,8.57644,0.56,0.72174
std,11.295148,6.081868,1.143512,0.991004,0.92545,0.604813,0.496884,0.14114
min,290.0,92.0,1.0,1.0,1.0,6.8,0.0,0.34
25%,308.0,103.0,2.0,2.5,3.0,8.1275,0.0,0.63
50%,317.0,107.0,3.0,3.5,3.5,8.56,1.0,0.72
75%,325.0,112.0,4.0,4.0,4.0,9.04,1.0,0.82
max,340.0,120.0,5.0,5.0,5.0,9.92,1.0,0.97


In [5]:
data.columns

Index(['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR ', 'CGPA',
       'Research', 'Chance of Admit '],
      dtype='object')

# Prepare Data 

In [6]:
X, y = data.drop('Chance of Admit ', axis=1), data['Chance of Admit ']

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [8]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(350, 7) (350,) (150, 7) (150,)


# Modelling

In [9]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge, Lasso, ElasticNet  
# Linear models, l2 (sum of squares) regularization, l1 (sum of absolute values), and hybrid of both
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
# https://www.reddit.com/r/explainlikeimfive/comments/86dx3u/eli5_what_is_gradient_boosting_and_how_can_it_be/
# explanation of gradient boosting regressor^

In [10]:
pipelines = {
    'rf':make_pipeline(RandomForestRegressor(random_state=1234)),
    'gb':make_pipeline(GradientBoostingRegressor(random_state=1234)),
    'ridge':make_pipeline(Ridge(random_state=1234)),
    'lasso':make_pipeline(Lasso(random_state=1234)),
    'enet':make_pipeline(ElasticNet(random_state=1234)),
}

In [11]:
hypergrid = {
    'rf': {
        'randomforestregressor__min_samples_split':[2,4,6],
        'randomforestregressor__min_samples_leaf':[1,2,3]
    },
    'gb':{
        'gradientboostingregressor__alpha':[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 0.99]
    },
    'ridge':{
        'ridge__alpha':[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 0.99]
    },
    'lasso':{
        'lasso__alpha':[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 0.99]
    },
    'enet':{
        'elasticnet__alpha':[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 0.99]
    }
}

In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.exceptions import NotFittedError

In [13]:
fit_models = {}
for algo, pipeline in pipelines.items():
    model = GridSearchCV(pipeline, hypergrid[algo], cv=10, n_jobs=-1)
    try:
        print('Starting training for {}.'.format(algo))
        model.fit(X_train, y_train)
        fit_models[algo] = model
        print('{} has been successfully fit.'.format(algo))
    except NotFittedError as e:
        print(repr(e))

Starting training for rf.
rf has been successfully fit.
Starting training for gb.
gb has been successfully fit.
Starting training for ridge.
ridge has been successfully fit.
Starting training for lasso.
lasso has been successfully fit.
Starting training for enet.
enet has been successfully fit.


In [14]:
# Uncomment to check if ridge model has been fitted
# fit_models['ridge'].predict(X_test)

array([0.65223972, 0.70914687, 0.93646764, 0.73097287, 0.81817603,
       0.66846207, 0.74709787, 0.71614672, 0.79207306, 0.66273398,
       0.66885908, 0.57296691, 0.79012587, 0.79249075, 0.77427696,
       0.86035964, 0.62901205, 0.75279118, 0.90144315, 0.67442468,
       0.62416451, 0.79312685, 0.83828386, 0.61303979, 0.78769448,
       0.57843385, 0.94750781, 0.64763533, 0.86151159, 0.71550847,
       0.63334844, 0.81233134, 0.59657572, 0.90851703, 0.51323589,
       0.81880756, 0.67820834, 0.6335817 , 0.65593919, 0.91062319,
       0.57269145, 0.66783624, 0.7768664 , 0.96879732, 0.7790146 ,
       0.52268402, 0.65673298, 0.63000797, 0.6688235 , 0.65506334,
       0.83419988, 0.91723722, 0.88218995, 0.63029818, 0.75245689,
       0.65030411, 0.75184312, 0.60775623, 0.6692565 , 0.70681976,
       0.44533147, 0.71458233, 0.74806121, 0.85045006, 0.97545193,
       0.61229821, 0.73832177, 0.78462003, 0.93090783, 0.70359343,
       0.6040648 , 0.65439139, 0.82797927, 0.5035707 , 0.92728

# Evaluate model

In [15]:
from sklearn.metrics import r2_score, mean_absolute_error

In [16]:
for algo,model in fit_models.items():
    yhat = model.predict(X_test)
    print('{} scores - R2:{} MAE:{}'.format(algo, r2_score(y_test, yhat), mean_absolute_error(y_test, yhat)))

rf scores - R2:0.7832866306894993 MAE:0.04655555715950716
gb scores - R2:0.7907019885390745 MAE:0.04670313403369685
ridge scores - R2:0.8147161994554437 MAE:0.04411075422775803
lasso scores - R2:0.8127276774033245 MAE:0.04415874576808856
enet scores - R2:0.8143199134729905 MAE:0.04406209475488235


In [17]:
best_model = fit_models['rf']

In [18]:
best_model.predict([X.iloc[0]])
# should be about 0.92

array([0.92980183])

# Deploy ML model 