In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as ssp
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics, model_selection
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_predict,cross_val_score
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
import lightgbm
from lightgbm.sklearn import LGBMClassifier
import os 
import re
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier 



# I will use cross_val_score on XGBoost to select 100,200 or 300 for each preprocess

In [2]:
work_train_w2v = {} 
work_test_w2v = {}
pre_process=["w2v_100.csv","w2v_200.csv","w2v_300.csv"]
path="checkpoints_databases/"
for f in pre_process:
    work_train_w2v[re.sub("\.csv","",f)] = pd.read_csv(path+"new_stem_working_train_"+f)
    work_test_w2v[re.sub("\.csv","",f)] = pd.read_csv(path+"new_stem_working_test_"+f)

In [3]:
work_train_tfidf = {} 
work_test_tfidf = {}
pre_process=["tfidf_tsvd_100.csv","tfidf_tsvd_200.csv","tfidf_tsvd_300.csv"]
path="checkpoints_databases/"
for f in pre_process:
    work_train_tfidf[re.sub("\.csv","",f)] = pd.read_csv(path+"new_stem_working_train_"+f)
    work_test_tfidf[re.sub("\.csv","",f)] = pd.read_csv(path+"new_stem_working_test_"+f)

In [None]:
#we keep the lemmatization bio w2v
work_train_bio = {} 
work_test_bio = {}
pre_process=["bio.csv"]
path="checkpoints_databases/"
for f in pre_process:
    work_train_bio[re.sub("\.csv","",f)] = pd.read_csv(path+"new_working_train_"+f)
    work_test_bio[re.sub("\.csv","",f)] = pd.read_csv(path+"new_working_test_"+f)

In [2]:
new_train=pd.read_csv("checkpoints_databases/new_working_train.csv")
new_test=pd.read_csv("checkpoints_databases/new_working_test.csv")

In [3]:
y=np.array(new_train.Class)-1

In [4]:
ID_train=new_train.ID
ID_test=new_test.ID


In [5]:
kf = model_selection.StratifiedKFold(n_splits=5, random_state=26, shuffle=True)

Start 

In [8]:
clf_xgb=XGBClassifier(max_depth=5, objective="multi:softprob",seed=26)
for name in work_train_w2v:
    h=cross_val_score(clf_xgb,np.array(work_train_w2v[name]),y,cv=kf,n_jobs=-1,scoring="neg_log_loss")
    print("mean"+name+" "+str(h.mean()),
         "std:"+name+" "+str(h.std()))
#

meanw2v_100 -1.08591915345 std:w2v_100 0.0541873570702
meanw2v_200 -1.07376934966 std:w2v_200 0.0372816273367
meanw2v_300 -1.06437402745 std:w2v_300 0.0367764336005


In [9]:
for name in work_train_tfidf:
    h=cross_val_score(clf_xgb,np.array(work_train_tfidf[name]),y,cv=kf,n_jobs=-1,scoring="neg_log_loss")
    print("mean"+name+" "+str(h.mean()),
         "std:"+name+" "+str(h.std()))
#

meantfidf_tsvd_100 -1.0169704023 std:tfidf_tsvd_100 0.0365495778442
meantfidf_tsvd_200 -1.02394548835 std:tfidf_tsvd_200 0.0384426689575
meantfidf_tsvd_300 -1.02594316006 std:tfidf_tsvd_300 0.0390292436386


# GRID SEARCH PHASE ALL ALGOS

In [7]:
work_train= {} 
work_test = {}
#To complete
path="checkpoints_databases/"
work_train["bio"] = pd.read_csv(path+"new_working_train_bio.csv")
work_test["bio"] = pd.read_csv(path+"new_working_test_bio.csv")
work_train["tfidf_100"] = pd.read_csv(path+"new_working_train_tfidf_tsvd_100.csv")
work_test["tfidf_100"] = pd.read_csv(path+"new_working_test_tfidf_tsvd_100.csv")
work_train["w2v_300"] = pd.read_csv(path+"new_working_train_w2v_300.csv")
work_test["w2v_300"] = pd.read_csv(path+"new_working_test_w2v_300.csv")

In [8]:
feat_train=pd.read_csv("../window/w_meta_features/meta_train_l1l2.csv")
feat_test=pd.read_csv("../window/w_meta_features/meta_test_l1l2.csv")
feat_train=feat_train.drop("ID",axis=1)
feat_test=feat_test.drop("ID",axis=1)
feat_ext_train=pd.read_csv("w_meta_features/new_working_train_ext.csv")
feat_ext_test=pd.read_csv("w_meta_features/new_working_test_ext.csv")

In [9]:
#GO features + text embeddings
work_train_final={}
work_test_final={}
for name in work_train:
    work_train_final[name]=pd.concat((work_train[name],feat_ext_train),axis=1)
    work_test_final[name]=pd.concat((work_test[name],feat_ext_test),axis=1)

# the gridsearch for w2vbio lemmatized is already done in the other modelling script

In [None]:
clf_xgb=XGBClassifier(max_depth=3, objective="multi:softprob",seed=26)
param_test= {
    "max_depth" : [3,5,7],
    "min_child_weight" : [1,3,5],
    "n_estimators" : [100,200],
    "subsample":[0.8,0.9,1],
    "colsample_bytree":[0.8,0.9,1]
}
for name in ["tfidf_100","w2v_300"]:
    gsearch=GridSearchCV(estimator=clf_xgb, param_grid = param_test,scoring="neg_log_loss",n_jobs=-1,iid=False, cv=kf,verbose=True)
    gsearch.fit(np.array(work_train_final[name]),y)
    print(name)
    print(gsearch.grid_scores_)
    print(gsearch.best_params_)
    print(gsearch.best_score_)
#bio : {'colsample_bytree': 0.8, 'max_depth': 5, 'min_child_weight': 5, 'n_estimators': 100, 'subsample': 0.9} : 0.9385
#tfidf: 
#w2v: 

Fitting 5 folds for each of 162 candidates, totalling 810 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  5.5min


In [16]:
clf_lgbm=LGBMClassifier(seed=26)
param_test= {
    'n_estimators': [8,24,36,48],
    'num_leaves': [6,12,16,22],
    'boosting_type' : ['gbdt'],
    'colsample_bytree' : [0.7,0.8,0.9],
    'subsample' : [0.7,0.8,0.9]
    }
for name in ["bio","tfidf_100","w2v_300"]:
    gsearch=GridSearchCV(estimator=clf_lgbm, param_grid = param_test,scoring="neg_log_loss",n_jobs=-1,iid=False, cv=kf,verbose=True)
    gsearch.fit(np.array(work_train_final[name]),y)
    print(name)
    print(gsearch.grid_scores_)
    print(gsearch.best_params_)
    print(gsearch.best_score_)
#bio :{'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 48, 'num_leaves': 22, 'subsample': 0.9}
#-0.950704488318
#tfidf : {'boosting_type': 'gbdt', 'colsample_bytree': 0.9, 'n_estimators': 48, 'num_leaves': 22, 'subsample': 0.8}
#-0.93280968028
#w2v : {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 48, 'num_leaves': 22, 'subsample': 0.9}
#-0.948655365308

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   28.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 12.2min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 22.0min finished


bio
[mean: -1.58538, std: 0.02208, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 6, 'subsample': 0.7}, mean: -1.58567, std: 0.02589, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 6, 'subsample': 0.8}, mean: -1.58347, std: 0.02348, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 6, 'subsample': 0.9}, mean: -1.51911, std: 0.02147, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 12, 'subsample': 0.7}, mean: -1.51401, std: 0.02102, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 12, 'subsample': 0.8}, mean: -1.50926, std: 0.02076, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 12, 'subsample': 0.9}, mean: -1.50056, std: 0.01875, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 16, 's

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   27.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 16.2min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 30.7min finished


w2v_300
[mean: -1.57031, std: 0.01970, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 6, 'subsample': 0.7}, mean: -1.57253, std: 0.02050, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 6, 'subsample': 0.8}, mean: -1.56997, std: 0.01556, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 6, 'subsample': 0.9}, mean: -1.50408, std: 0.02095, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 12, 'subsample': 0.7}, mean: -1.50222, std: 0.01758, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 12, 'subsample': 0.8}, mean: -1.49851, std: 0.01856, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 12, 'subsample': 0.9}, mean: -1.48358, std: 0.01361, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 16



In [15]:
clf_log=LogisticRegression()
param_test= {
    "C" : [5,7,9,10,12,14,16,18,20],
    "penalty" : ["l1","l2"]
}
for name in ["tfidf_100","w2v_300"]:
    gsearch=GridSearchCV(estimator=clf_log, param_grid = param_test,scoring="neg_log_loss",n_jobs=-1,iid=False, cv=kf,verbose=True)
    gsearch.fit(np.array(work_train_final[name]),y)
    print(name)
    print(gsearch.grid_scores_)
    print(gsearch.best_params_)
    print(gsearch.best_score_)
#bio : {'C': 10, 'penalty': 'l2'}
#tfidf : {'C': 9, 'penalty': 'l2'}
#w2v : {'C': 5, 'penalty': 'l2'}

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  4.8min finished


tfidf_100
[mean: -1.09834, std: 0.03969, params: {'C': 5, 'penalty': 'l1'}, mean: -1.09634, std: 0.04143, params: {'C': 5, 'penalty': 'l2'}, mean: -1.10748, std: 0.04719, params: {'C': 7, 'penalty': 'l1'}, mean: -1.09290, std: 0.04502, params: {'C': 7, 'penalty': 'l2'}, mean: -1.11952, std: 0.05389, params: {'C': 9, 'penalty': 'l1'}, mean: -1.09239, std: 0.04788, params: {'C': 9, 'penalty': 'l2'}, mean: -1.12490, std: 0.05549, params: {'C': 10, 'penalty': 'l1'}, mean: -1.09272, std: 0.04911, params: {'C': 10, 'penalty': 'l2'}, mean: -1.13545, std: 0.05871, params: {'C': 12, 'penalty': 'l1'}, mean: -1.09403, std: 0.05133, params: {'C': 12, 'penalty': 'l2'}, mean: -1.14417, std: 0.06059, params: {'C': 14, 'penalty': 'l1'}, mean: -1.09584, std: 0.05321, params: {'C': 14, 'penalty': 'l2'}, mean: -1.15330, std: 0.06204, params: {'C': 16, 'penalty': 'l1'}, mean: -1.09800, std: 0.05494, params: {'C': 16, 'penalty': 'l2'}, mean: -1.16201, std: 0.06323, params: {'C': 18, 'penalty': 'l1'}, mean:

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed: 10.9min finished


w2v_300
[mean: -1.16571, std: 0.06918, params: {'C': 5, 'penalty': 'l1'}, mean: -1.15625, std: 0.06344, params: {'C': 5, 'penalty': 'l2'}, mean: -1.17674, std: 0.07798, params: {'C': 7, 'penalty': 'l1'}, mean: -1.15999, std: 0.06612, params: {'C': 7, 'penalty': 'l2'}, mean: -1.18680, std: 0.08370, params: {'C': 9, 'penalty': 'l1'}, mean: -1.16371, std: 0.06826, params: {'C': 9, 'penalty': 'l2'}, mean: -1.19094, std: 0.08560, params: {'C': 10, 'penalty': 'l1'}, mean: -1.16551, std: 0.06926, params: {'C': 10, 'penalty': 'l2'}, mean: -1.19787, std: 0.08872, params: {'C': 12, 'penalty': 'l1'}, mean: -1.16890, std: 0.07097, params: {'C': 12, 'penalty': 'l2'}, mean: -1.20456, std: 0.08997, params: {'C': 14, 'penalty': 'l1'}, mean: -1.17207, std: 0.07258, params: {'C': 14, 'penalty': 'l2'}, mean: -1.21048, std: 0.09003, params: {'C': 16, 'penalty': 'l1'}, mean: -1.17504, std: 0.07396, params: {'C': 16, 'penalty': 'l2'}, mean: -1.21810, std: 0.09133, params: {'C': 18, 'penalty': 'l1'}, mean: -



In [13]:
clf_dt=RandomForestClassifier()
param_test={
    "max_depth":[11,13,15,17,19],
    "n_estimators":[250,300,350]
}
for name in ["bio","tfidf_100","w2v_300"]:
    gsearch=GridSearchCV(estimator=clf_dt, param_grid = param_test,scoring="neg_log_loss",n_jobs=-1,iid=False, cv=kf,verbose=True)
    gsearch.fit(np.array(work_train_final[name]),y)
    print(name)
    print(gsearch.grid_scores_)
    print(gsearch.best_params_)
    print(gsearch.best_score_)
#bio : {'max_depth': 15, 'n_estimators': 300} -0.957419688259 ok
#tfidf : {'max_depth': 13, 'n_estimators': 350} -0.920825809843 ok 
#w2v :{'max_depth': 15, 'n_estimators': 300} -0.972048498265

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:  3.7min finished


bio
[mean: -0.98921, std: 0.03374, params: {'max_depth': 11, 'n_estimators': 250}, mean: -0.97999, std: 0.02848, params: {'max_depth': 11, 'n_estimators': 300}, mean: -0.97820, std: 0.02786, params: {'max_depth': 11, 'n_estimators': 350}, mean: -0.96592, std: 0.03615, params: {'max_depth': 13, 'n_estimators': 250}, mean: -0.96483, std: 0.03763, params: {'max_depth': 13, 'n_estimators': 300}, mean: -0.96381, std: 0.03571, params: {'max_depth': 13, 'n_estimators': 350}, mean: -0.96159, std: 0.04092, params: {'max_depth': 15, 'n_estimators': 250}, mean: -0.96310, std: 0.04660, params: {'max_depth': 15, 'n_estimators': 300}, mean: -0.95990, std: 0.04023, params: {'max_depth': 15, 'n_estimators': 350}, mean: -0.99005, std: 0.05035, params: {'max_depth': 17, 'n_estimators': 250}, mean: -0.97091, std: 0.04292, params: {'max_depth': 17, 'n_estimators': 300}, mean: -0.98030, std: 0.05839, params: {'max_depth': 17, 'n_estimators': 350}, mean: -1.00936, std: 0.07973, params: {'max_depth': 19, 'n_

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:  2.6min finished


tfidf_100
[mean: -0.94322, std: 0.03245, params: {'max_depth': 11, 'n_estimators': 250}, mean: -0.94210, std: 0.02997, params: {'max_depth': 11, 'n_estimators': 300}, mean: -0.94002, std: 0.02926, params: {'max_depth': 11, 'n_estimators': 350}, mean: -0.93217, std: 0.03938, params: {'max_depth': 13, 'n_estimators': 250}, mean: -0.93579, std: 0.04141, params: {'max_depth': 13, 'n_estimators': 300}, mean: -0.92083, std: 0.03438, params: {'max_depth': 13, 'n_estimators': 350}, mean: -0.93005, std: 0.04395, params: {'max_depth': 15, 'n_estimators': 250}, mean: -0.92554, std: 0.04157, params: {'max_depth': 15, 'n_estimators': 300}, mean: -0.92540, std: 0.03870, params: {'max_depth': 15, 'n_estimators': 350}, mean: -0.96134, std: 0.07547, params: {'max_depth': 17, 'n_estimators': 250}, mean: -0.93025, std: 0.04053, params: {'max_depth': 17, 'n_estimators': 300}, mean: -0.93967, std: 0.05281, params: {'max_depth': 17, 'n_estimators': 350}, mean: -0.97240, std: 0.05365, params: {'max_depth': 1

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:  4.6min finished


w2v_300
[mean: -0.99520, std: 0.02373, params: {'max_depth': 11, 'n_estimators': 250}, mean: -0.99469, std: 0.02825, params: {'max_depth': 11, 'n_estimators': 300}, mean: -0.99142, std: 0.02944, params: {'max_depth': 11, 'n_estimators': 350}, mean: -0.98497, std: 0.03561, params: {'max_depth': 13, 'n_estimators': 250}, mean: -0.99550, std: 0.04389, params: {'max_depth': 13, 'n_estimators': 300}, mean: -0.98181, std: 0.03716, params: {'max_depth': 13, 'n_estimators': 350}, mean: -0.98066, std: 0.03439, params: {'max_depth': 15, 'n_estimators': 250}, mean: -0.98449, std: 0.03527, params: {'max_depth': 15, 'n_estimators': 300}, mean: -0.98009, std: 0.03965, params: {'max_depth': 15, 'n_estimators': 350}, mean: -0.98868, std: 0.04860, params: {'max_depth': 17, 'n_estimators': 250}, mean: -1.00925, std: 0.04909, params: {'max_depth': 17, 'n_estimators': 300}, mean: -0.97676, std: 0.03047, params: {'max_depth': 17, 'n_estimators': 350}, mean: -1.02597, std: 0.08931, params: {'max_depth': 19,



# TRAINING PHASE ALL ALGOS 

In [23]:
def model_gen(X,X_test,y,classifier,file,five_fold_predict=True):
    #if not os.path.exists("scores/"+file):
    #   os.makedirs("scores/"+file)
    if five_fold_predict:
        fold = 0
        y_test=0
        for train_index, test_index in kf.split(X, y):
        
            fold += 1

            X_train, X_valid    = X[train_index],   X[test_index]
            y_train, y_valid    = y[train_index],   y[test_index]

            print("Fold", fold, X_train.shape, X_valid.shape)

            clf=classifier
            clf.fit(X_train,y_train)
            p_test = clf.predict_proba(X_test)
            y_test += p_test/5

    classes = "class1,class2,class3,class4,class5,class6,class7,class8,class9".split(',')
    subm = pd.DataFrame(y_test, columns=classes)
    subm['ID'] = ID_test
    
    subm.to_csv("new_scores/new_stack_test/new_stem_{}.csv".format(file),index=False)
    
    print("cross_val sur train ") #peut etre que to array est exclusivement pour les xgb

    y_pred=cross_val_predict(estimator=clf,X=X,y=y,cv=kf,method="predict_proba")
    subm1 = pd.DataFrame(y_pred, columns=classes)
    subm1['ID'] = ID_train
    subm1.to_csv("new_scores/new_stack_train/new_stem_{}.csv".format(file),index=False)


In [138]:
dic_xgb={"xgb_bio":XGBClassifier(colsample_bytree=0.8,max_depth=7,min_child_weight=3,n_estimators=100,subsample=0.8,objective="multi:softprob",seed=26),
        "xgb_tfidf":XGBClassifier(colsample_bytree=0.8,max_depth=5,min_child_weight=3,n_estimators=100,subsample=1,objective="multi:softprob",seed=26),
        "xgb_w2v":XGBClassifier(colsample_bytree=0.8,max_depth=5,min_child_weight=3,n_estimators=100,subsample=0.8,objective="multi:softprob",seed=26)}

dic_lgbm={"lgbm_bio":LGBMClassifier(boosting_type="gbdt",colsample_bytree=0.8,n_estimators=48,num_leaves=22,subsample=0.8,seed=26),
        "lgbm_tfidf":LGBMClassifier(boosting_type="gbdt",colsample_bytree=0.8,n_estimators=48,num_leaves=22,subsample=0.8,seed=26),
        "lgbm_w2v":LGBMClassifier(boosting_type="gbdt",colsample_bytree=0.8,n_estimators=48,num_leaves=22,subsample=0.8,seed=26)}
dic_lr={"lr_bio":LogisticRegression(C=10,penalty="l1"),
        "lr_tfidf":LogisticRegression(C=10,penalty="l1"),
        "lr_w2v":LogisticRegression(C=1,penalty="l1")}
dic_ada={"ada_bio":AdaBoostClassifier(n_estimators=50, learning_rate=0.01, algorithm="SAMME.R", random_state=26),
        "ada_tfidf":AdaBoostClassifier(n_estimators=70, learning_rate=0.01, algorithm="SAMME.R", random_state=26),
        "ada_w2v":AdaBoostClassifier(n_estimators=50, learning_rate=0.01, algorithm="SAMME.R", random_state=26)}
dic_rf={"rf_bio":RandomForestClassifier(n_estimators=400,max_depth=25,random_state=26),
        "rf_tfidf":RandomForestClassifier(n_estimators=400,max_depth=20,random_state=26),
        "rf_w2v":RandomForestClassifier(n_estimators=300,max_depth=15,random_state=26)}

print("xgboost here")
for clf,name in zip(dic_xgb.keys(),work_train_final.keys()):
    model_gen(X=np.array(work_train_final[name]),X_test=np.array(work_test_final[name]),y=y,classifier=dic_xgb[clf],file=clf)
print("lgbm here")
for clf,name in zip(dic_lgbm.keys(),work_train_final.keys()):
    model_gen(X=np.array(work_train_final[name]),X_test=np.array(work_test_final[name]),y=y,classifier=dic_lgbm[clf],file=clf)
print("logreg here")
for clf,name in zip(dic_lr.keys(),work_train_final.keys()):
    model_gen(X=np.array(work_train_final[name]),X_test=np.array(work_test_final[name]),y=y,classifier=dic_lr[clf],file=clf)
print("adaboost here")
for clf,name in zip(dic_ada.keys(),work_train_final.keys()):
    model_gen(X=np.array(work_train_final[name]),X_test=np.array(work_test_final[name]),y=y,classifier=dic_ada[clf],file=clf)
print("random forest here")
for clf,name in zip(dic_rf.keys(),work_train_final.keys()):
    model_gen(X=np.array(work_train_final[name]),X_test=np.array(work_test_final[name]),y=y,classifier=dic_rf[clf],file=clf)



xgboost here
Fold 1 (2946, 142) (743, 142)
Fold 2 (2949, 142) (740, 142)
Fold 3 (2952, 142) (737, 142)
Fold 4 (2954, 142) (735, 142)
Fold 5 (2955, 142) (734, 142)
cross_val sur train 
Fold 1 (2946, 142) (743, 142)
Fold 2 (2949, 142) (740, 142)
Fold 3 (2952, 142) (737, 142)
Fold 4 (2954, 142) (735, 142)
Fold 5 (2955, 142) (734, 142)
cross_val sur train 
Fold 1 (2946, 243) (743, 243)
Fold 2 (2949, 243) (740, 243)
Fold 3 (2952, 243) (737, 243)
Fold 4 (2954, 243) (735, 243)
Fold 5 (2955, 243) (734, 243)
cross_val sur train 
lgbm here
Fold 1 (2946, 142) (743, 142)
Fold 2 (2949, 142) (740, 142)
Fold 3 (2952, 142) (737, 142)
Fold 4 (2954, 142) (735, 142)
Fold 5 (2955, 142) (734, 142)
cross_val sur train 
Fold 1 (2946, 142) (743, 142)
Fold 2 (2949, 142) (740, 142)
Fold 3 (2952, 142) (737, 142)
Fold 4 (2954, 142) (735, 142)
Fold 5 (2955, 142) (734, 142)
cross_val sur train 
Fold 1 (2946, 243) (743, 243)
Fold 2 (2949, 243) (740, 243)
Fold 3 (2952, 243) (737, 243)
Fold 4 (2954, 243) (735, 243)
Fo

In [None]:
work_train_final