In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
plt.rcParams['figure.figsize'] = [15,8]
pd.set_option('display.max_columns',None)
import warnings
warnings.filterwarnings('ignore')
from scipy import stats
# to perform Z-test
import statsmodels.stats.weightstats as stests
import statsmodels.stats.multicomp as mc
from sklearn.preprocessing import StandardScaler
# fit transform will accept only df : df1[['total_bill']] 
from sklearn.preprocessing import PowerTransformer
# machine learning
import statsmodels.formula.api as sfa
import statsmodels.api as sma
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet,SGDRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
# classification
from sklearn.metrics import accuracy_score,recall_score,precision_score,confusion_matrix,classification_report,log_loss,roc_curve,roc_auc_score,cohen_kappa_score

In [2]:
df=pd.read_csv("Admission_predict.csv")
df.head(2)

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,1
1,2,324,107,4,4.0,4.5,8.87,1,1


In [3]:
df.columns

Index(['Serial No.', 'GRE Score', 'TOEFL Score', 'University Rating', 'SOP',
       'LOR', 'CGPA', 'Research', 'Chance of Admit'],
      dtype='object')

In [7]:
df=df.drop('Serial No.',axis=1)

In [8]:
df.head(2)

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,337,118,4,4.5,4.5,9.65,1,1
1,324,107,4,4.0,4.5,8.87,1,1


In [9]:
# for knn we must scale data....but this file is for all the algorithms hence some algo(naive) data shd not be scaled...so different variables

In [10]:
# y-->chance of Admit
# x-->other variables

In [11]:
x=df.drop('Chance of Admit',axis=1)
y=df['Chance of Admit']

In [12]:
xtrain,xtest,ytrain,ytest=train_test_split(x,y,random_state=10,test_size=0.2)

In [13]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression()
lr.fit(xtrain,ytrain)
ypred_lr=lr.predict(xtest)

In [14]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
print("Accuracy:",accuracy_score(ytest,ypred_lr))
print(classification_report(ytest,ypred_lr))

Accuracy: 0.775
              precision    recall  f1-score   support

           0       0.80      0.76      0.78        42
           1       0.75      0.79      0.77        38

    accuracy                           0.78        80
   macro avg       0.78      0.78      0.77        80
weighted avg       0.78      0.78      0.78        80



In [15]:
# scaling-->for KNN
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
xtrain_sc=sc.fit_transform(xtrain)
xtest_sc=sc.transform(xtest)

In [16]:
### KNN

from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier()
knn.fit(xtrain_sc,ytrain)
ypred_knn=knn.predict(xtest_sc)  # passed scaled data

print("Accuracy:",accuracy_score(ytest,ypred_knn))
print(classification_report(ytest,ypred_knn))

Accuracy: 0.8375
              precision    recall  f1-score   support

           0       0.87      0.81      0.84        42
           1       0.80      0.87      0.84        38

    accuracy                           0.84        80
   macro avg       0.84      0.84      0.84        80
weighted avg       0.84      0.84      0.84        80



In [17]:
### Naive Bayes
from sklearn.naive_bayes import GaussianNB,BernoulliNB,MultinomialNB
gnb=GaussianNB()
gnb.fit(xtrain,ytrain)
ypred_gnb=gnb.predict(xtest)

print("Accuracy:",accuracy_score(ytest,ypred_gnb))
print(classification_report(ytest,ypred_gnb))

Accuracy: 0.85
              precision    recall  f1-score   support

           0       0.89      0.81      0.85        42
           1       0.81      0.89      0.85        38

    accuracy                           0.85        80
   macro avg       0.85      0.85      0.85        80
weighted avg       0.85      0.85      0.85        80



In [18]:
bnb=BernoulliNB()
bnb.fit(xtrain,ytrain)
ypred_bnb=bnb.predict(xtest)

print("Accuracy:",accuracy_score(ytest,ypred_bnb))
print(classification_report(ytest,ypred_bnb))

Accuracy: 0.7375
              precision    recall  f1-score   support

           0       0.80      0.67      0.73        42
           1       0.69      0.82      0.75        38

    accuracy                           0.74        80
   macro avg       0.74      0.74      0.74        80
weighted avg       0.75      0.74      0.74        80



In [19]:
mnb=MultinomialNB()
mnb.fit(xtrain,ytrain)
ypred_mnb=mnb.predict(xtest)

print("Accuracy:",accuracy_score(ytest,ypred_mnb))
print(classification_report(ytest,ypred_mnb))

Accuracy: 0.8125
              precision    recall  f1-score   support

           0       0.83      0.81      0.82        42
           1       0.79      0.82      0.81        38

    accuracy                           0.81        80
   macro avg       0.81      0.81      0.81        80
weighted avg       0.81      0.81      0.81        80



In [20]:
# finding best performing hyper-parameters by passing it to the grid search cv

from sklearn.model_selection import GridSearchCV

parameters={'n_neighbors':[3,4,5,6],'p':[1,2]}
knn11=KNeighborsClassifier()
knncv=GridSearchCV(knn11,parameters,cv=5,scoring="accuracy")
knncv.fit(xtrain_sc,ytrain)
knncv.best_params_

# for this best performing hyper-parameters for this training data is n=6 and p=1

{'n_neighbors': 6, 'p': 1}

In [21]:
knn2=KNeighborsClassifier(n_neighbors=6,p=1)
knn2.fit(xtrain_sc,ytrain)
ypred_knn2=knn2.predict(xtest_sc)  # passed scaled data

print("Accuracy:",accuracy_score(ytest,ypred_knn2))
print(classification_report(ytest,ypred_knn2))

# improved....accuracy from 83% to 86%

Accuracy: 0.8625
              precision    recall  f1-score   support

           0       0.86      0.88      0.87        42
           1       0.86      0.84      0.85        38

    accuracy                           0.86        80
   macro avg       0.86      0.86      0.86        80
weighted avg       0.86      0.86      0.86        80



In [22]:
## GridsearchCV
    # used to get the best hyper parameters to predict a model
    # can also do ridge,lasso and elasticNet

In [23]:
### DECISION TREE
# not necessarily need scaled data


In [24]:
##1. GINI method
from sklearn.tree import DecisionTreeClassifier
dtg=DecisionTreeClassifier(random_state=10)
dtg.fit(xtrain,ytrain)
ypred_dtg=dtg.predict(xtest)

In [25]:
print(confusion_matrix(ytest,ypred_dtg))
print("accuracy_score:",accuracy_score(ytest,ypred_dtg))
print(classification_report(ytest,ypred_dtg))

[[33  9]
 [ 7 31]]
accuracy_score: 0.8
              precision    recall  f1-score   support

           0       0.82      0.79      0.80        42
           1       0.78      0.82      0.79        38

    accuracy                           0.80        80
   macro avg       0.80      0.80      0.80        80
weighted avg       0.80      0.80      0.80        80



In [26]:
## Score function
print("Training accuracy:",dtg.score(xtrain,ytrain))
print("Test accuracy:",dtg.score(xtest,ytest))

Training accuracy: 1.0
Test accuracy: 0.8


In [27]:
##2. entropy method
from sklearn.tree import DecisionTreeClassifier
dte=DecisionTreeClassifier(criterion='entropy',random_state=10)
dte.fit(xtrain,ytrain)
ypred_dte=dte.predict(xtest)

In [28]:
print(confusion_matrix(ytest,ypred_dte))
print("accuracy_score:",accuracy_score(ytest,ypred_dte))
print(classification_report(ytest,ypred_dte))

[[33  9]
 [ 7 31]]
accuracy_score: 0.8
              precision    recall  f1-score   support

           0       0.82      0.79      0.80        42
           1       0.78      0.82      0.79        38

    accuracy                           0.80        80
   macro avg       0.80      0.80      0.80        80
weighted avg       0.80      0.80      0.80        80



In [29]:
## Score function
print("Training accuracy:",dte.score(xtrain,ytrain))
print("Test accuracy:",dte.score(xtest,ytest))

Training accuracy: 1.0
Test accuracy: 0.8


In [30]:
# entropy manual calculation
p_red=0.5
p_blue=0.5
print("Entropy:",-1*((p_red*np.log2(p_red))+(p_blue*np.log2(p_blue))))

Entropy: 1.0


In [31]:
p_red=0.4
p_blue=0.6
print("Entropy:",-1*((p_red*np.log2(p_red))+(p_blue*np.log2(p_blue))))

Entropy: 0.9709505944546686


In [32]:
e_info=1-0.970
print("Entropy information:",e_info)

Entropy information: 0.030000000000000027


In [33]:
from sklearn import tree
feature=xtrain.columns

In [None]:
# gini tree
plt.figure(figsize=(20,20))
tree.plot_tree(dtg,feature_names=feature,filled=True,rounded=True)
plt.show()

In [None]:
# entropy tree
plt.figure(figsize=(20,20))
tree.plot_tree(dte,feature_names=feature,filled=True,rounded=True)
plt.show()

In [None]:
dt5=DecisionTreeClassifier(max_depth=5,random_state=10)   #excluding root node will go for 5 level
dt5.fit(xtrain,ytrain)
ypred_dt5=dt5.predict(xtest)
plt.figure(figsize=(20,20))
tree.plot_tree(dt5,feature_names=feature,filled=True,rounded=True)
plt.show()

In [37]:
print("Training accuracy:",dt5.score(xtrain,ytrain))
print("Test accuracy:",dt5.score(xtest,ytest))

Training accuracy: 0.93125
Test accuracy: 0.8


In [38]:
# parameters to build the model on
parameters={'max_depth':[5,6,7],'min_samples_leaf':[10,20],'criterion':['gini','entropy']}
dt11=DecisionTreeClassifier()
dt11cv=GridSearchCV(dt11,parameters,cv=5,scoring='accuracy')
dt11cv.fit(xtrain,ytrain)
print(dt11cv.best_params_)
print(dt11cv.best_score_)

{'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 20}
0.859375


In [39]:
dt11=DecisionTreeClassifier(max_depth=5,random_state=10,min_samples_leaf=20,criterion='gini')   #excluding root node will go for 5 level
dt11.fit(xtrain,ytrain)
ypred_dt11=dt11.predict(xtest)
print(confusion_matrix(ytest,ypred_dt11))
print("accuracy_score:",accuracy_score(ytest,ypred_dt11))
print(classification_report(ytest,ypred_dt11))

[[37  5]
 [ 7 31]]
accuracy_score: 0.85
              precision    recall  f1-score   support

           0       0.84      0.88      0.86        42
           1       0.86      0.82      0.84        38

    accuracy                           0.85        80
   macro avg       0.85      0.85      0.85        80
weighted avg       0.85      0.85      0.85        80



In [None]:
ypred_prob_dt=dt.predict_proba(xtest)

In [None]:
ypred_prob_dt[10:20]

In [None]:
from sklearn.metrics import roc_auc_score,roc_curve
y_pred_prob_gnb=gnb.predict_proba(xtest)[:,1]   #first column or positive class
fpr,tpr,threshold=roc_curve(ytest,y_pred_prob_gnb)  
print(roc_auc_score(ytest,y_pred_prob_gnb)) 
plt.plot(fpr,tpr)
plt.plot([0,1],[0,1],'r--')
plt.show()


In [None]:
### Feature selection/importance

dtg.feature_importances_

In [None]:
xtrain.columns  #research not contributing

In [None]:
features_df=pd.DataFrame()
features_df['Feature']=xtrain.columns
features_df['Importances']=dtg.feature_importances_
features_df

In [None]:
features_df=features_df.sort_values('Importances',ascending=False)

In [None]:
sns.barplot(x='Importances',y='Feature',data=features_df)
plt.show()

In [None]:
### Ensemble 
## Random Forest

from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier(random_state=10)
rf.fit(xtrain,ytrain)
ypred_rf=rf.predict(xtest)
print(confusion_matrix(ytest,ypred_rf))
print("accuracy_score:",accuracy_score(ytest,ypred_rf))
print(classification_report(ytest,ypred_rf))

print("Training accuracy:",rf.score(xtrain,ytrain))
print("Test accuracy:",rf.score(xtest,ytest))

rf.feature_importances_

In [None]:
# findding best parameters to build model
parameters={'n_estimators':[10,20],
            'max_depth':[5,6,7],
            'criterion':['gini','entropy']}
rf11=RandomForestClassifier(random_state=10)
rf_cv=GridSearchCV(rf11,parameters,cv=3,scoring="accuracy")
rf_cv.fit(xtrain,ytrain)
print(rf_cv.best_params_)
print(rf_cv.best_score_)


In [None]:
rf2=RandomForestClassifier(criterion='gini',max_depth=6,n_estimators=20,random_state=10)
rf2.fit(xtrain,ytrain)
ypred_rf2=rf2.predict(xtest)
print(confusion_matrix(ytest,ypred_rf2))
print("accuracy_score:",accuracy_score(ytest,ypred_rf2))
print(classification_report(ytest,ypred_rf2))

print("Training accuracy:",rf2.score(xtrain,ytrain))
print("Test accuracy:",rf2.score(xtest,ytest))

rf2.feature_importances_

In [None]:
## oob score parameter
rf3=RandomForestClassifier(oob_score=True,criterion='gini',max_depth=6,n_estimators=20,random_state=10)
rf3.fit(xtrain,ytrain)
ypred_rf3=rf3.predict(xtest)
print(confusion_matrix(ytest,ypred_rf3))
print("accuracy_score:",accuracy_score(ytest,ypred_rf3))
print(classification_report(ytest,ypred_rf3))
rf3.oob_score_
print("Training accuracy:",rf3.score(xtrain,ytrain))
print("Test accuracy:",rf3.score(xtest,ytest))

In [None]:
## Bagging classifier
from sklearn.ensemble import BaggingClassifier
bc=BaggingClassifier(random_state=10)
bc.fit(xtrain,ytrain)
ypred_bc=bc.predict(xtest)
print("accuracy_score:",accuracy_score(ytest,ypred_bc))
print(confusion_matrix(ytest,ypred_bc))
print(classification_report(ytest,ypred_bc))

In [None]:
knn=KNeighborsClassifier()
bag_knn=BaggingClassifier(estimator=knn,n_estimators=20,random_state=10)
bag_knn.fit(xtrain,ytrain)
ypred_bag_knn=bag_knn.predict(xtest)
print("accuracy_score:",accuracy_score(ytest,ypred_bag_knn))
print(confusion_matrix(ytest,ypred_bag_knn))

In [None]:
## cross value score
from sklearn.model_selection import cross_val_score
scores=cross_val_score(rf,xtrain,ytrain,cv=5,scoring='accuracy')
np.mean(scores)  #mean accuracy

In [None]:
### Boosting
## Ada boost

from sklearn.ensemble import AdaBoostClassifier
abcl=AdaBoostClassifier(random_state=10)  #default n_estimators=50 (50 models)
abcl.fit(xtrain,ytrain)
ypred_abcl=abcl.predict(xtest)

In [None]:
print(confusion_matrix(ytest,ypred_abcl))
print(accuracy_score(ytest,ypred_abcl))

In [None]:
print('Training accuracy :' ,abcl.score(xtrain,ytrain))
print('Test accuracy :' ,abcl.score(xtest,ytest))

In [None]:
from sklearn.ensemble import AdaBoostClassifier
abcl=AdaBoostClassifier(n_estimators=10,random_state=10)   # 10 models(stumps)
abcl.fit(xtrain,ytrain)
ypred_abcl=abcl.predict(xtest)

In [None]:
print(confusion_matrix(ytest,ypred_abcl))
print(accuracy_score(ytest,ypred_abcl))

In [None]:
print('Training accuracy :' ,abcl.score(xtrain,ytrain))
print('Test accuracy :' ,abcl.score(xtest,ytest))

In [None]:
### Gradient boosting

from sklearn.ensemble import GradientBoostingClassifier
gbcl=GradientBoostingClassifier(n_estimators=30,random_state=10)   # 10 models(stumps)
gbcl.fit(xtrain,ytrain)
ypred_gbcl=gbcl.predict(xtest)

In [None]:
print('Training accuracy :' ,gbcl.score(xtrain,ytrain))
print('Test accuracy :' ,gbcl.score(xtest,ytest))

In [None]:
### XG Boost

!pip install xgboost

In [None]:
from xgboost import XGBClassifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gb=GradientBoostingClassifier(n_estimators=2,random_state=10)
gb.fit(xtrain,ytrain)
ypred_gb=gb.predict(xtest)
print('confusion_mat:\n',confusion_matrix(ytest,ypred_gb))
print('accuracy_score:',accuracy_score(ytest,ypred_gb))
print('  ')
print(classification_report(ytest,ypred_gb))
print('train accuracy:',gb.score(xtrain,ytrain))
print('test accuracy:',gb.score(xtest,ytest))

In [None]:
### XGBoost
### XGBClassifier

In [None]:
from xgboost import XGBClassifier

In [44]:
xgb_model=XGBClassifier(random_state=10)
xgb_model.fit(xtrain,ytrain)
ypred_xgb=xgb_model.predict(xtest)

In [45]:
print("Train accuracy:",xgb_model.score(xtrain,ytrain))
print("Test accuracy:",xgb_model.score(xtest,ytest))


Train accuracy: 1.0
Test accuracy: 0.8125


In [46]:
parameters={'learning_rate':[0.1,0.2],
            'max_depth':[3,4,5],
            'gamma':[0,1,2]}
xgb1=XGBClassifier()#random_state=10)
xgb_cv=GridSearchCV(xgb1,parameters,cv=3,scoring='accuracy')
xgb_cv.fit(xtrain,ytrain)
print(xgb_cv.best_params_)
print(xgb_cv.best_score_)

{'gamma': 1, 'learning_rate': 0.1, 'max_depth': 3}
0.8844413095867866


In [48]:
xgb_model2=XGBClassifier(max_depth=3,random_state=10,gamma=1,learning_rate=0.1)
xgb_model2.fit(xtrain,ytrain)
ypred_xgb2=xgb_model2.predict(xtest)
print("Train accuracy:",xgb_model2.score(xtrain,ytrain))
print("Test accuracy:",xgb_model2.score(xtest,ytest))


Train accuracy: 0.915625
Test accuracy: 0.8375


In [49]:
### Stack

from sklearn.ensemble import StackingClassifier

In [50]:
lr=LogisticRegression()
gnb=GaussianNB()
dt=DecisionTreeClassifier(random_state=10)
ab=AdaBoostClassifier()

In [56]:
# any algorithms can be used
base_learners=[('lr_model',lr),
               ('KNN_model',knn),
               ('DT_model',dt)]
              #'ABCL_model',ab)]

In [53]:
stack=StackingClassifier(base_learners,final_estimator=GaussianNB())   # final estimator can be any model
stack.fit(xtrain,ytrain)
ypred_stack=stack.predict(xtest)
print("Train accuracy:",xgb_model2.score(xtrain,ytrain))
print("Test accuracy:",xgb_model2.score(xtest,ytest))

Train accuracy: 0.915625
Test accuracy: 0.8375


In [None]:
    ### Case study

In [57]:
#wrong

# # create an empty dataframe to store the scores for various algorithms
# score_card = pd.DataFrame(columns=['Model Name', 'Train Accuracy Score','Test Accuracy Score','f1-micro',
#                                        'f1-macro', 'f1-weighted'])

# # append the result table for all performance scores
# # performance measures considered for model comparision are 'AUC Score', 'Recall Score','Accuracy Score',
# # 'Kappa Score','f1-score' and roc auc score
# # compile the required information in a user defined function 
# def update_score_card(model_name,model,X_train,X_test,y_train,y_test):
#     from sklearn import metrics
#     y_pred_train = model.predict(X_train)
#     # let 'y_pred' be the predicted values of y
#     y_pred = model.predict(X_test)
    
#     # assign 'score_card' as global variable
#     global score_card

#     # append the results to the dataframe 'score_card'
#     # 'ignore_index = True' do not consider the index labels
#     score_card = score_card.append({'Model Name':model_name,
#                                     'Train Accuracy Score': metrics.accuracy_score(y_train, y_pred_train),
#                                     'Test Accuracy Score': metrics.accuracy_score(y_test, y_pred),
#                                     'f1-micro':metrics.f1_score(y_test,y_pred,average='micro'),
#                                     'f1-macro':metrics.f1_score(y_test,y_pred,average='macro'),
#                                     'f1-weighted':metrics.f1_score(y_test,y_pred,average='weighted')}, 
#                                     ignore_index = True)
#     return score_card

In [58]:
def Classification_report_method(model,X_train,y_train,X_test,y_test):
    # model get predicted value for y_train
    y_train_pred = model.predict(X_train)
    
    # model get predicted value for y_train
    y_test_pred = model.predict(X_test)
    print('Classification Report for Train Data:')
    # classification report
    print(classification_report(y_train,y_train_pred))
    print()
    print('Classification Report for Test Data:')
    # classification report
    print(classification_report(y_test,y_test_pred))

In [72]:
# create an empty dataframe to store the scores for various algorithms
score_card = pd.DataFrame(columns=['Model Name', 'Train Accuracy Score','Test Accuracy Score','Train F1 Score','Test F1 Score','f1-micro',
                                       'f1-macro', 'f1-weighted'])

# append the result table for all performance scores
# performance measures considered for model comparision are 'AUC Score', 'Recall Score','Accuracy Score',
# 'Kappa Score','f1-score' and roc auc score
# compile the required information in a user defined function 
def update_score_card(model_name,model,X_train,X_test,y_train,y_test):
    from sklearn import metrics
    y_pred_train = model.predict(X_train)
    # let 'y_pred' be the predicted values of y
    y_pred = model.predict(X_test)
    
    # assign 'score_card' as global variable
    global score_card

    # append the results to the dataframe 'score_card'
    # 'ignore_index = True' do not consider the index labels
    score_card=pd.concat([score_card,pd.DataFrame({'Model Name':[model_name],
                                    'Train Accuracy Score': [metrics.accuracy_score(y_train, y_pred_train)],
                                    'Test Accuracy Score': [metrics.accuracy_score(y_test, y_pred)],
                                     'Train F1 Score': [metrics.f1_score(y_train, y_pred_train)],
                                    'Test F1 Score': [metrics.f1_score(y_test, y_pred)],
                                    'f1-micro':[metrics.f1_score(y_test,y_pred,average='micro')],
                                    'f1-macro':[metrics.f1_score(y_test,y_pred,average='macro')],
                                    'f1-weighted':[metrics.f1_score(y_test,y_pred,average='weighted')]})])
    score_card=score_card.reset_index(drop=True)
    return score_card

In [73]:
lr=LogisticRegression()
lr.fit(xtrain,ytrain)
update_score_card("Logistic Regression",lr,xtrain,xtest,ytrain,ytest)

Unnamed: 0,Model Name,Train Accuracy Score,Test Accuracy Score,Train F1 Score,Test F1 Score,f1-micro,f1-macro,f1-weighted
0,Logistic Regression,0.8625,0.775,0.841727,0.769231,0.775,0.774859,0.775141


In [74]:
dt=DecisionTreeClassifier(random_state=10)
dt.fit(xtrain,ytrain)
update_score_card("Decision Tree",dt,xtrain,xtest,ytrain,ytest)

Unnamed: 0,Model Name,Train Accuracy Score,Test Accuracy Score,Train F1 Score,Test F1 Score,f1-micro,f1-macro,f1-weighted
0,Logistic Regression,0.8625,0.775,0.841727,0.769231,0.775,0.774859,0.775141
1,Decision Tree,1.0,0.8,1.0,0.794872,0.8,0.799875,0.800125


In [75]:
gnc=GaussianNB()
gnb.fit(xtrain,ytrain)
update_score_card("Gaussian NB",gnb,xtrain,xtest,ytrain,ytest)

Unnamed: 0,Model Name,Train Accuracy Score,Test Accuracy Score,Train F1 Score,Test F1 Score,f1-micro,f1-macro,f1-weighted
0,Logistic Regression,0.8625,0.775,0.841727,0.769231,0.775,0.774859,0.775141
1,Decision Tree,1.0,0.8,1.0,0.794872,0.8,0.799875,0.800125
2,Gaussian NB,0.88125,0.85,0.863309,0.85,0.85,0.85,0.85


In [76]:
dt5=DecisionTreeClassifier(max_depth=5,random_state=10)
dt5.fit(xtrain,ytrain)
update_score_card("Decision tree-tuned",dt5,xtrain,xtest,ytrain,ytest)

# gaussian nb is by far the best model

Unnamed: 0,Model Name,Train Accuracy Score,Test Accuracy Score,Train F1 Score,Test F1 Score,f1-micro,f1-macro,f1-weighted
0,Logistic Regression,0.8625,0.775,0.841727,0.769231,0.775,0.774859,0.775141
1,Decision Tree,1.0,0.8,1.0,0.794872,0.8,0.799875,0.800125
2,Gaussian NB,0.88125,0.85,0.863309,0.85,0.85,0.85,0.85
3,Decision tree-tuned,0.93125,0.8,0.920863,0.794872,0.8,0.799875,0.800125


In [78]:
from sklearn.ensemble  import AdaBoostClassifier
abcl=AdaBoostClassifier()
abcl.fit(xtrain,ytrain)
update_score_card("Ada boost",abcl,xtrain,xtest,ytrain,ytest)

Unnamed: 0,Model Name,Train Accuracy Score,Test Accuracy Score,Train F1 Score,Test F1 Score,f1-micro,f1-macro,f1-weighted
0,Logistic Regression,0.8625,0.775,0.841727,0.769231,0.775,0.774859,0.775141
1,Decision Tree,1.0,0.8,1.0,0.794872,0.8,0.799875,0.800125
2,Gaussian NB,0.88125,0.85,0.863309,0.85,0.85,0.85,0.85
3,Decision tree-tuned,0.93125,0.8,0.920863,0.794872,0.8,0.799875,0.800125
4,Ada boost,0.91875,0.8125,0.908451,0.810127,0.8125,0.812471,0.812588
