In [11]:
import pandas as pd
import re
import spacy
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score,make_scorer
from tqdm import tqdm
from sklearn import tree
from sklearn.ensemble import BaggingClassifier,RandomForestClassifier,GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, KFold
from scipy.stats import uniform, truncnorm, randint
from pprint import pprint
#from google.colab import drive
#drive.mount('/content/drive')
from dataset_tool import load_data
from sklearn import metrics   #Additional scklearn functions
import random
import time
import pandas as pd

# Data loading

In [2]:
x_path = "X_train_update.csv"
y_path = "Y_train_CVw08PX.csv"
X_train, X_test, y_train, y_test = load_data(x_path,y_path,test=True)
X,y = load_data(x_path,y_path,test=False)

In [3]:
weighted_f1_scores = {}

# Models

## Decision tree

Weigthed f1 score on test set : 0.69


In [0]:
clf = tree.DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
weighted_f1_scores["decision tree"] = f1_score(y_test, y_pred,average="weighted")

# Bagging
Weighted f1 score : 0.70

In [0]:
clf_bag = BaggingClassifier(tree.DecisionTreeClassifier(), max_samples=0.5)
clf_bag.fit(X_train, y_train)
y_pred_bag = clf_bag.predict(X_test)
weighted_f1_scores["bagging"]  = f1_score(y_test, y_pred_bag,average="weighted")

## Gradient boosting tree

### a) Baseline

Iter : 7 minutes to run

In [10]:
gbc = GradientBoostingClassifier(learning_rate=0.1, min_samples_split=500,min_samples_leaf=50,max_depth=10,max_features='sqrt',subsample=0.8,random_state=10,verbose=True)

In [8]:
gbc.fit(X_train, y_train)
y_pred_gbc = gbc.predict(X_test)
weighted_f1_scores["boosting"]  = f1_score(y_test, y_pred_gbc,average="weighted")

      Iter       Train Loss      OOB Improve   Remaining Time 
         1      136544.2179         982.5173            6.54m
         2      132705.2660         885.7003            6.42m
         3      130068.9205         708.0142            6.43m
         4      127644.5382         594.5615            6.43m
         5      124965.6249         621.2197            6.40m
         6      122969.2057         574.1821            6.38m
         7      119124.9572         876.0935            6.31m
         8      117390.0324         413.0601            6.26m
         9      115424.9720         506.1931            6.19m
        10      113268.5318         525.8771            6.16m
        20      100237.0666         203.0858            5.40m
        30       92333.0935         141.2847            4.70m
        40       86687.8451          96.3985            4.03m
        50       81900.0499          98.1419            3.37m
        60       77825.2093          61.0605            2.70m
       

In [9]:
weighted_f1_scores["boosting"]

0.6179197924453074

In [96]:
f1_scorer = make_scorer(f1_score, average="weighted")

In [29]:
scores = cross_val_score(gbc, X, y, scoring = f1_scorer,cv=3)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1      136144.4947         982.8597            6.74m
         2      133649.8232         723.5428            6.64m
         3      130722.0561         742.7870            6.54m
         4      126298.6006        1119.4077            6.49m
         5      124415.2202         447.3637            6.44m
         6      121048.6235         795.8666            6.43m
         7      119274.3626         371.2028            6.32m
         8      117268.2982         510.5997            6.25m
         9      116040.9671         286.6780            6.22m
        10      114368.0907         413.4076            6.15m
        20      101001.9887         523.1510            5.42m
        30       93257.1978         108.9011            4.84m
        40       86892.0179         115.0370            4.20m
        50       82696.0146          77.4263            3.51m
        60       78780.6514          84.0524            2.81m
       

In [30]:
scores

array([0.62066014, 0.6175334 , 0.6242551 ])

### b) Tuning boosting hyperparameters 

In [4]:
model_params = {'n_estimators': randint(100,200), "learning_rate" : uniform(0.01, 0.199)} 
model = GradientBoostingClassifier(learning_rate=0.1, min_samples_split=500,min_samples_leaf=50,max_depth=8,subsample=0.8,random_state=10,verbose=True)
r_search = RandomizedSearchCV(estimator = model,param_distributions = model_params, n_jobs=-1,  pre_dispatch= '2*n_jobs' , cv=3, verbose=True,n_iter=5) 

In [5]:
start = time.time()
r_search.fit(X_train,y_train)
stop = time.time()
print(f"Training time: {stop - start}s")

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 235.5min finished


      Iter       Train Loss      OOB Improve   Remaining Time 
         1      110995.8701        7243.2755          205.71m
         2      103117.6127        1907.7917          202.58m
         3       97887.0584        1351.5067          200.89m
         4       93517.1399         965.7426          199.75m
         5       90126.2511         792.2066          198.34m
         6       87440.7455         666.9339          197.07m
         7       84791.1510         554.4446          195.75m
         8       82596.1967         490.3385          194.54m
         9       80780.7917         416.4464          193.22m
        10       79046.4584         371.8463          192.02m
        20       68281.8930         139.8262          179.38m
        30       62637.4320          63.9101          166.73m
        40       59039.0798          36.8356          154.18m
        50       56108.6427          17.4501          141.44m
        60       54176.7152          11.9764          128.86m
       

In [12]:
pd.DataFrame(r_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,5610.406626,26.873084,3.684092,0.028585,0.180633,107,"{'learning_rate': 0.1806325819190775, 'n_estim...",0.650142,0.647875,0.648718,0.648912,0.000936,5
1,6640.653801,34.179555,5.093846,0.046856,0.0904566,128,"{'learning_rate': 0.09045660203623399, 'n_esti...",0.662957,0.655574,0.65684,0.658457,0.003224,4
2,6017.950537,269.905329,4.231884,0.223173,0.114521,119,"{'learning_rate': 0.11452138622626946, 'n_esti...",0.662588,0.656418,0.65916,0.659389,0.002524,2
3,7796.468936,69.822222,5.370745,0.858437,0.0688627,167,"{'learning_rate': 0.06886266242371746, 'n_esti...",0.663116,0.656682,0.657051,0.658949,0.00295,3
4,7475.794405,31.856706,4.592971,0.214957,0.0808579,162,"{'learning_rate': 0.08085793474266192, 'n_esti...",0.664962,0.657156,0.659424,0.660514,0.003278,1


In [25]:
r_search.best_params_

{'learning_rate': 0.12276450976584319, 'n_estimators': 121}

In [95]:
scores = cross_val_score(gbc, X, y, scoring = f1_scorer,cv=3)

NameError: name 'f1_scorer' is not defined

### c) Tuning tree specific parameters

In [41]:
model = GradientBoostingClassifier(learning_rate=0.1, n_estimators = 100,min_samples_leaf=50,subsample=0.8,random_state=10,verbose=True)
param_test = {'max_depth':randint(5,16), 'min_samples_split':randint(200,1001)}
r_search_tree= RandomizedSearchCV(estimator = model,param_distributions = param_test, n_jobs=-1,  pre_dispatch= '2*n_jobs' , cv=3, verbose=True,n_iter=5)

In [42]:
start = time.time()
r_search_tree.fit(X_train,y_train)
stop = time.time()
print(f"Training time: {stop - start}s")

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 178.4min finished


      Iter       Train Loss      OOB Improve   Remaining Time 
         1      102944.9009        9087.1616          125.94m
         2       94164.3826        2104.8068          124.98m
         3       88401.6044        1397.1894          123.87m
         4       83524.9291        1082.6160          122.47m
         5       79827.6292         819.5289          121.06m
         6       77012.5145         656.5506          119.72m
         7       74195.9714         553.8268          118.36m
         8       72002.2738         453.4588          117.01m
         9       70126.7686         395.7879          115.63m
        10       68305.5258         366.5776          114.32m
        20       57448.2771         110.4491          102.10m
        30       52262.5033          39.7527           90.39m
        40       48727.6977           8.2088           78.56m
        50       46179.6134          -0.7113           65.76m
        60       44409.0906         -11.7894           52.24m
       

In [47]:
pd.DataFrame(r_search_tree.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,5768.458154,34.195524,9.447692,0.266722,15,341,"{'max_depth': 15, 'min_samples_split': 341}",0.668548,0.660321,0.66301,0.663959,0.003425,1
1,5368.496081,29.46941,5.658803,0.066373,10,663,"{'max_depth': 10, 'min_samples_split': 663}",0.66454,0.65626,0.657631,0.659477,0.003623,2
2,5126.870524,150.440628,4.176739,1.080014,8,349,"{'max_depth': 8, 'min_samples_split': 349}",0.660953,0.653412,0.654045,0.656137,0.003416,4
3,4833.626263,20.549025,2.111294,0.365041,7,703,"{'max_depth': 7, 'min_samples_split': 703}",0.658686,0.652621,0.654256,0.655188,0.002562,5
4,4949.630785,31.709115,3.097252,0.122633,10,207,"{'max_depth': 10, 'min_samples_split': 207}",0.664909,0.655785,0.656049,0.658914,0.00424,3


In [79]:
model_tree = GradientBoostingClassifier(learning_rate=0.1, n_estimators = 100,random_state=10, max_depth=8,verbose=True)
param_test = {'min_samples_split':randint(200,1001),'min_samples_leaf':randint(30,71),'subsample':truncnorm(a=0, b=1, loc=0.8, scale=0.1)}  

In [80]:
r_search_tree_bis= RandomizedSearchCV(estimator = model,param_distributions = param_test, n_jobs=-1,  pre_dispatch= '2*n_jobs' , cv=3, verbose=True,n_iter=5)

In [81]:
start = time.time()
r_search_tree_bis.fit(X_train,y_train)
stop = time.time()
print(f"Training time: {stop - start}s")

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 166.0min finished


      Iter       Train Loss      OOB Improve   Remaining Time 
         1      123695.7886        4235.5435          126.53m
         2      116212.7682        1024.4179          124.08m
         3      111483.5591         658.8945          123.71m
         4      107321.4773         527.0135          123.27m
         5      104469.6775         392.7902          121.77m
         6      101941.7606         338.7061          120.06m
         7       99680.3484         257.5629          117.91m
         8       97651.9023         284.0948          115.86m
         9       95872.8344         201.1784          114.32m
        10       94500.8808         179.2540          112.93m
        20       84118.6883          69.6106           99.91m
        30       78002.1554          35.7971           87.24m
        40       73840.7310          18.5011           74.03m
        50       70500.7485          11.8988           61.38m
        60       68015.0150          12.3255           49.02m
       

In [88]:
pd.DataFrame(r_search_tree_bis.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_leaf,param_min_samples_split,param_subsample,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,5044.89838,28.580103,1.257741,0.044316,61,518,0.85168,"{'min_samples_leaf': 61, 'min_samples_split': ...",0.630155,0.623932,0.626991,0.627026,0.002541,4
1,4813.763343,25.209216,1.250001,0.020653,39,949,0.808148,"{'min_samples_leaf': 39, 'min_samples_split': ...",0.651092,0.646398,0.645449,0.647646,0.002467,2
2,5045.131357,201.449061,1.104176,0.160652,31,525,0.874985,"{'min_samples_leaf': 31, 'min_samples_split': ...",0.663168,0.662008,0.656682,0.660619,0.002824,1
3,4572.273954,28.102625,1.038868,0.260236,55,836,0.837003,"{'min_samples_leaf': 55, 'min_samples_split': ...",0.632897,0.626674,0.632265,0.630612,0.002796,3
4,4827.227201,23.882941,0.790777,0.022591,67,302,0.89906,"{'min_samples_leaf': 67, 'min_samples_split': ...",0.62467,0.620926,0.622455,0.622684,0.001537,5


In [98]:
gbc = GradientBoostingClassifier(n_estimators=162,learning_rate=0.08, min_samples_split=341,min_samples_leaf=31,max_depth=15,subsample=0.87,random_state=10,verbose=True)

In [97]:
gbc.fit(X_train, y_train)
y_pred_gbc = gbc.predict(X_test)
weighted_f1_scores["boosting"]  = f1_score(y_test, y_pred_gbc,average="weighted")

      Iter       Train Loss      OOB Improve   Remaining Time 


KeyboardInterrupt: 

In [94]:
weighted_f1_scores

{'boosting': 0.7376305456875618}

In [None]:
scores = cross_val_score(gbc, X, y, scoring = f1_scorer,cv=3)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1      114996.2461        5562.8743          249.50m
         2      105083.0090        1354.9459          252.71m
         3       98368.5418         948.9277          253.23m
         4       93144.5014         697.6228          250.95m
         5       88977.7362         561.5556          244.38m
         6       85400.9158         463.3452          239.18m
         7       82313.7831         385.0605          235.52m
         8       79639.8159         324.6336          232.39m
         9       77467.8242         270.0482          233.05m
        10       75625.7473         233.4114          230.24m
        20       62667.5058          74.6625          210.82m
        30       56169.1466          29.9089          199.85m
        40       51987.6145           9.4342          183.59m
        50       48872.1068          -1.7125          167.49m
        60       46586.2486         -23.0695          153.15m
       

## Random forest
F1 weigted score : 0.73 (without grid search)


In [0]:
model_params = {
    'n_estimators': randint(50,200),
    'max_features': truncnorm(a=0, b=1, loc=0.25, scale=0.1),
    'min_samples_split': uniform(0.01, 0.199)
}

In [0]:
rf_model = RandomForestClassifier()
clf_rf = RandomizedSearchCV(rf_model, model_params, n_iter=10, cv=5, random_state=1,n_jobs = -1)
model = clf_rf.fit(X_train,y_train)

KeyboardInterrupt: ignored

In [0]:
pprint(model.best_estimator_.get_params())

In [0]:
y_pred_rf = clf_rf.predict(X_test)
weighted_f1_scores["random forest classifier"]  = f1_score(y_test, y_pred_rf,average="weighted")

##Adaboost classifier 

In [0]:
estimators = 70
alpha = 0.01
random = None
model = AdaBoostClassifier(n_estimators = estimators, learning_rate = alpha, random_state = random )

In [0]:
res = model.fit(X_train, y_train)

In [0]:
model.score(X_train, y_train)

In [0]:
y_pred = model.predict(X_test)
f1_score(y_test, y_pred,average="weighted")

In [0]:
param_dist = {
 'n_estimators': randint(100,200),
 'learning_rate' : [0.1,0.3,1] }
pre_gs_inst = RandomizedSearchCV(AdaBoostClassifier(),
 param_distributions = param_dist,
 cv=3,
 n_iter = 5,
 n_jobs=-1)

In [0]:
pre_gs_inst.fit(X_train, y_train)

KeyboardInterrupt: ignored