In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as ssp
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics, model_selection
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_predict,cross_val_score
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
import lightgbm
from lightgbm.sklearn import LGBMClassifier
import os 
import re
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier 



# Use cross_val_score with XGBoost to choose the dimension (100, 200 or 300) for each preprocessing method

In [2]:
# Load the "new_stem_working_{train,test}_w2v_*" CSVs (100/200/300), keyed by
# the file name without its ".csv" extension (e.g. "w2v_100").
work_train_w2v = {} 
work_test_w2v = {}
pre_process=["w2v_100.csv","w2v_200.csv","w2v_300.csv"]
path="checkpoints_databases/"
for f in pre_process:
    # Raw, end-anchored pattern: "\." in a plain string literal is an invalid
    # escape sequence (a warning today, an error in future Python versions).
    key = re.sub(r"\.csv$","",f)
    work_train_w2v[key] = pd.read_csv(path+"new_stem_working_train_"+f)
    work_test_w2v[key] = pd.read_csv(path+"new_stem_working_test_"+f)

In [3]:
# Load the "new_stem_working_{train,test}_tfidf_tsvd_*" CSVs (100/200/300),
# keyed by the file name without its ".csv" extension (e.g. "tfidf_tsvd_100").
work_train_tfidf = {} 
work_test_tfidf = {}
pre_process=["tfidf_tsvd_100.csv","tfidf_tsvd_200.csv","tfidf_tsvd_300.csv"]
path="checkpoints_databases/"
for f in pre_process:
    # Raw, end-anchored pattern: "\." in a plain string literal is an invalid
    # escape sequence (a warning today, an error in future Python versions).
    key = re.sub(r"\.csv$","",f)
    work_train_tfidf[key] = pd.read_csv(path+"new_stem_working_train_"+f)
    work_test_tfidf[key] = pd.read_csv(path+"new_stem_working_test_"+f)

In [None]:
# We keep the lemmatized bio w2v features: these files use the
# "new_working_" prefix rather than "new_stem_working_".
work_train_bio = {} 
work_test_bio = {}
pre_process=["bio.csv"]
path="checkpoints_databases/"
for f in pre_process:
    # Raw, end-anchored pattern: "\." in a plain string literal is an invalid
    # escape sequence (a warning today, an error in future Python versions).
    key = re.sub(r"\.csv$","",f)
    work_train_bio[key] = pd.read_csv(path+"new_working_train_"+f)
    work_test_bio[key] = pd.read_csv(path+"new_working_test_"+f)

In [2]:
# Base train/test tables; later cells read their `Class` and `ID` columns.
new_train = pd.read_csv(
    "checkpoints_databases/new_working_train.csv"
)
new_test = pd.read_csv(
    "checkpoints_databases/new_working_test.csv"
)

In [3]:
# Labels shifted down by one (Class value k becomes k - 1).
y = np.array(new_train["Class"]) - 1

In [4]:
# Row identifiers, re-attached to the prediction files written by model_gen.
ID_train = new_train["ID"]
ID_test = new_test["ID"]


In [5]:
# Shared 5-fold stratified splitter (shuffled, fixed seed) used by every
# cross-validation and grid search in this notebook.
kf = model_selection.StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=26,
)

Start 

In [8]:
# Score one fixed XGBoost configuration on each w2v table and print the
# mean / std of the cross-validated negative log-loss.
clf_xgb = XGBClassifier(max_depth=5, objective="multi:softprob", seed=26)
for name, features in work_train_w2v.items():
    cv_scores = cross_val_score(
        clf_xgb,
        np.array(features),
        y,
        cv=kf,
        n_jobs=-1,
        scoring="neg_log_loss",
    )
    mean_part = "mean" + name + " " + str(cv_scores.mean())
    std_part = "std:" + name + " " + str(cv_scores.std())
    print(mean_part, std_part)

meanw2v_100 -1.08591915345 std:w2v_100 0.0541873570702
meanw2v_200 -1.07376934966 std:w2v_200 0.0372816273367
meanw2v_300 -1.06437402745 std:w2v_300 0.0367764336005


In [9]:
# Same comparison as for w2v, on the tfidf tables; reuses `clf_xgb` as
# defined by an earlier cell.
for name, features in work_train_tfidf.items():
    cv_scores = cross_val_score(
        clf_xgb,
        np.array(features),
        y,
        cv=kf,
        n_jobs=-1,
        scoring="neg_log_loss",
    )
    mean_part = "mean" + name + " " + str(cv_scores.mean())
    std_part = "std:" + name + " " + str(cv_scores.std())
    print(mean_part, std_part)

meantfidf_tsvd_100 -1.0169704023 std:tfidf_tsvd_100 0.0365495778442
meantfidf_tsvd_200 -1.02394548835 std:tfidf_tsvd_200 0.0384426689575
meantfidf_tsvd_300 -1.02594316006 std:tfidf_tsvd_300 0.0390292436386


# GRID SEARCH PHASE ALL ALGOS

In [7]:
# Load the three feature sets kept for the grid-search / training phases.
# To complete
# NOTE(review): the tfidf and w2v files read here use the "new_working_"
# prefix, whereas the selection cells above read "new_stem_working_" files.
# Confirm that the non-stemmed tables are really the intended input.
work_train = {}
work_test = {}
path = "checkpoints_databases/"
selected_files = [
    ("bio", "new_working_train_bio.csv", "new_working_test_bio.csv"),
    ("tfidf_100", "new_working_train_tfidf_tsvd_100.csv", "new_working_test_tfidf_tsvd_100.csv"),
    ("w2v_300", "new_working_train_w2v_300.csv", "new_working_test_w2v_300.csv"),
]
for feature_key, train_csv, test_csv in selected_files:
    work_train[feature_key] = pd.read_csv(path + train_csv)
    work_test[feature_key] = pd.read_csv(path + test_csv)

In [8]:
# Meta features (ID column dropped) and the "ext" feature tables.
# NOTE(review): feat_train / feat_test are not referenced by any later cell
# visible in this notebook; only feat_ext_* are concatenated below.
feat_train = pd.read_csv(
    "../window/w_meta_features/meta_train_l1l2.csv"
).drop("ID", axis=1)
feat_test = pd.read_csv(
    "../window/w_meta_features/meta_test_l1l2.csv"
).drop("ID", axis=1)

feat_ext_train = pd.read_csv("w_meta_features/new_working_train_ext.csv")
feat_ext_test = pd.read_csv("w_meta_features/new_working_test_ext.csv")

In [9]:
# GO features + text embeddings: append the "ext" columns to the right of
# every selected feature table (column-wise concat).
work_train_final = {
    name: pd.concat((frame, feat_ext_train), axis=1)
    for name, frame in work_train.items()
}
work_test_final = {
    name: pd.concat((frame, feat_ext_test), axis=1)
    for name, frame in work_test.items()
}

# The grid search for the lemmatized bio w2v features was already done in the other modelling script

In [None]:
# Exhaustive grid search of XGBoost hyper-parameters on the tfidf and w2v
# feature sets, scored by cross-validated negative log-loss on `kf`.
clf_xgb=XGBClassifier(max_depth=3, objective="multi:softprob",seed=26)
param_test= {
    "max_depth" : [3,5,7],
    "min_child_weight" : [1,3,5],
    "n_estimators" : [100,200],
    "subsample":[0.8,0.9,1],
    "colsample_bytree":[0.8,0.9,1]
}
for name in ["tfidf_100","w2v_300"]:
    # NOTE(review): `iid` is kept so scores stay comparable with the recorded
    # ones; it was removed from GridSearchCV in later scikit-learn releases.
    gsearch=GridSearchCV(estimator=clf_xgb, param_grid = param_test,scoring="neg_log_loss",n_jobs=-1,iid=False, cv=kf,verbose=True)
    gsearch.fit(np.array(work_train_final[name]),y)
    print(name)
    # `grid_scores_` no longer exists in scikit-learn >= 0.20, which made this
    # cell fail only after the whole search had finished. `cv_results_` is
    # available in every release that ships `sklearn.model_selection`.
    results = gsearch.cv_results_
    for mean, std, params in zip(results["mean_test_score"], results["std_test_score"], results["params"]):
        print("mean: %.5f, std: %.5f, params: %r" % (mean, std, params))
    print(gsearch.best_params_)
    print(gsearch.best_score_)
#bio : {'colsample_bytree': 0.8, 'max_depth': 5, 'min_child_weight': 5, 'n_estimators': 100, 'subsample': 0.9} : 0.9385
#tfidf: 
#w2v: 

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 60.4min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 83.2min finished


bio
[mean: -0.97850, std: 0.04385, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.8}, mean: -0.98926, std: 0.04419, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 1}, mean: -0.94085, std: 0.05438, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.8}, mean: -0.94552, std: 0.05754, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 1}, mean: -0.97966, std: 0.04142, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 0.8}, mean: -0.99004, std: 0.04390, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 1}, mean: -0.93445, std: 0.05285, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 200, 'subsample': 0.8}, 

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  8.7min


In [None]:
# Exhaustive grid search of LightGBM hyper-parameters on the tfidf and w2v
# feature sets, scored by cross-validated negative log-loss on `kf`.
clf_lgbm=LGBMClassifier(seed=26)
param_test= {
    'n_estimators': [8,24,36,48],
    'num_leaves': [6,12,16,22],
    'boosting_type' : ['gbdt'],
    'colsample_bytree' : [0.7,0.8,0.9],
    'subsample' : [0.7,0.8,0.9]
    }
for name in ["tfidf_100","w2v_300"]:
    # NOTE(review): `iid` is kept so scores stay comparable with the recorded
    # ones; it was removed from GridSearchCV in later scikit-learn releases.
    gsearch=GridSearchCV(estimator=clf_lgbm, param_grid = param_test,scoring="neg_log_loss",n_jobs=-1,iid=False, cv=kf,verbose=True)
    gsearch.fit(np.array(work_train_final[name]),y)
    print(name)
    # `grid_scores_` no longer exists in scikit-learn >= 0.20, which made this
    # cell fail only after the whole search had finished. `cv_results_` is
    # available in every release that ships `sklearn.model_selection`.
    results = gsearch.cv_results_
    for mean, std, params in zip(results["mean_test_score"], results["std_test_score"], results["params"]):
        print("mean: %.5f, std: %.5f, params: %r" % (mean, std, params))
    print(gsearch.best_params_)
    print(gsearch.best_score_)
#bio :
#tfidf : {'boosting_type': 'gbdt', 'colsample_bytree': 0.9, 'n_estimators': 48, 'num_leaves': 22, 'subsample': 0.8}
#-0.93280968028
#w2v : 

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 15.9min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 44.5min


In [9]:
# Grid search of the regularisation strength and penalty type for logistic
# regression on the tfidf and w2v feature sets (negative log-loss on `kf`).
# The solver is pinned to liblinear because the grid contains the "l1" penalty:
# liblinear was the default when these results were recorded, and the newer
# default solver (lbfgs) rejects l1.
clf_log=LogisticRegression(solver="liblinear")
param_test= {
    "C" : [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    "penalty" : ["l1","l2"]
}
for name in ["tfidf_100","w2v_300"]:
    # NOTE(review): `iid` is kept so scores stay comparable with the recorded
    # ones; it was removed from GridSearchCV in later scikit-learn releases.
    gsearch=GridSearchCV(estimator=clf_log, param_grid = param_test,scoring="neg_log_loss",n_jobs=-1,iid=False, cv=kf,verbose=True)
    gsearch.fit(np.array(work_train_final[name]),y)
    print(name)
    # `grid_scores_` no longer exists in scikit-learn >= 0.20, which made this
    # cell fail only after the whole search had finished. `cv_results_` is
    # available in every release that ships `sklearn.model_selection`.
    results = gsearch.cv_results_
    for mean, std, params in zip(results["mean_test_score"], results["std_test_score"], results["params"]):
        print("mean: %.5f, std: %.5f, params: %r" % (mean, std, params))
    print(gsearch.best_params_)
    print(gsearch.best_score_)
#bio : {'C': 10, 'penalty': 'l2'}
#tfidf : {'C': 10, 'penalty': 'l2'}
#w2v : {'C': 1, 'penalty': 'l2'}

Fitting 5 folds for each of 14 candidates, totalling 70 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   26.4s
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:  4.4min finished


tfidf_100
[mean: -2.04903, std: 0.00643, params: {'C': 0.001, 'penalty': 'l1'}, mean: -1.70507, std: 0.01005, params: {'C': 0.001, 'penalty': 'l2'}, mean: -1.59588, std: 0.01875, params: {'C': 0.01, 'penalty': 'l1'}, mean: -1.39402, std: 0.01424, params: {'C': 0.01, 'penalty': 'l2'}, mean: -1.33730, std: 0.02120, params: {'C': 0.1, 'penalty': 'l1'}, mean: -1.26934, std: 0.02130, params: {'C': 0.1, 'penalty': 'l2'}, mean: -1.15802, std: 0.02450, params: {'C': 1, 'penalty': 'l1'}, mean: -1.15016, std: 0.02911, params: {'C': 1, 'penalty': 'l2'}, mean: -1.12488, std: 0.05547, params: {'C': 10, 'penalty': 'l1'}, mean: -1.09272, std: 0.04911, params: {'C': 10, 'penalty': 'l2'}, mean: -1.27968, std: 0.08625, params: {'C': 100, 'penalty': 'l1'}, mean: -1.15544, std: 0.07082, params: {'C': 100, 'penalty': 'l2'}, mean: -1.42397, std: 0.11935, params: {'C': 1000, 'penalty': 'l1'}, mean: -1.25191, std: 0.08117, params: {'C': 1000, 'penalty': 'l2'}]
{'C': 10, 'penalty': 'l2'}
-1.09271753031
Fitting

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   56.0s
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed: 54.0min finished


w2v_300
[mean: -2.04903, std: 0.00643, params: {'C': 0.001, 'penalty': 'l1'}, mean: -1.60625, std: 0.01207, params: {'C': 0.001, 'penalty': 'l2'}, mean: -1.59588, std: 0.01875, params: {'C': 0.01, 'penalty': 'l1'}, mean: -1.30283, std: 0.01969, params: {'C': 0.01, 'penalty': 'l2'}, mean: -1.29489, std: 0.02647, params: {'C': 0.1, 'penalty': 'l1'}, mean: -1.18978, std: 0.02982, params: {'C': 0.1, 'penalty': 'l2'}, mean: -1.17632, std: 0.04626, params: {'C': 1, 'penalty': 'l1'}, mean: -1.15330, std: 0.04963, params: {'C': 1, 'penalty': 'l2'}, mean: -1.19038, std: 0.08535, params: {'C': 10, 'penalty': 'l1'}, mean: -1.16551, std: 0.06926, params: {'C': 10, 'penalty': 'l2'}, mean: -1.34962, std: 0.11493, params: {'C': 100, 'penalty': 'l1'}, mean: -1.23027, std: 0.09377, params: {'C': 100, 'penalty': 'l2'}, mean: -1.56665, std: 0.17357, params: {'C': 1000, 'penalty': 'l1'}, mean: -1.32577, std: 0.12000, params: {'C': 1000, 'penalty': 'l2'}]
{'C': 1, 'penalty': 'l2'}
-1.15330170978




In [10]:
# Grid search of AdaBoost's number of estimators and learning rate on all
# three feature sets (negative log-loss on `kf`).
clf_ada=AdaBoostClassifier(n_estimators=100, learning_rate=0.3, algorithm="SAMME.R", random_state=26)
param_test={
    "n_estimators":[50,70,100,200],
    "learning_rate":[0.01,0.05,0.1,0.2]
}
for name in ["bio","tfidf_100","w2v_300"]:
    # NOTE(review): `iid` is kept so scores stay comparable with the recorded
    # ones; it was removed from GridSearchCV in later scikit-learn releases.
    gsearch=GridSearchCV(estimator=clf_ada, param_grid = param_test,scoring="neg_log_loss",n_jobs=-1,iid=False, cv=kf,verbose=True)
    gsearch.fit(np.array(work_train_final[name]),y)
    print(name)
    # `grid_scores_` no longer exists in scikit-learn >= 0.20, which made this
    # cell fail only after the whole search had finished. `cv_results_` is
    # available in every release that ships `sklearn.model_selection`.
    results = gsearch.cv_results_
    for mean, std, params in zip(results["mean_test_score"], results["std_test_score"], results["params"]):
        print("mean: %.5f, std: %.5f, params: %r" % (mean, std, params))
    print(gsearch.best_params_)
    print(gsearch.best_score_)
#bio : {'learning_rate': 0.01, 'n_estimators': 70}
#tfidf : 
#w2v :
#Actually the AdaBoost learners are super bad: their CV score is even worse than a single prediction model, so just remove them

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  3.6min finished


bio
[mean: -1.63181, std: 0.01702, params: {'learning_rate': 0.01, 'n_estimators': 50}, mean: -1.62628, std: 0.01451, params: {'learning_rate': 0.01, 'n_estimators': 70}, mean: -1.63667, std: 0.00886, params: {'learning_rate': 0.01, 'n_estimators': 100}, mean: -1.71383, std: 0.00478, params: {'learning_rate': 0.01, 'n_estimators': 200}, mean: -1.74262, std: 0.00779, params: {'learning_rate': 0.05, 'n_estimators': 50}, mean: -1.79022, std: 0.00310, params: {'learning_rate': 0.05, 'n_estimators': 70}, mean: -1.83815, std: 0.00380, params: {'learning_rate': 0.05, 'n_estimators': 100}, mean: -1.91131, std: 0.00732, params: {'learning_rate': 0.05, 'n_estimators': 200}, mean: -1.83428, std: 0.00358, params: {'learning_rate': 0.1, 'n_estimators': 50}, mean: -1.87170, std: 0.00812, params: {'learning_rate': 0.1, 'n_estimators': 70}, mean: -1.90400, std: 0.01217, params: {'learning_rate': 0.1, 'n_estimators': 100}, mean: -1.94304, std: 0.01631, params: {'learning_rate': 0.1, 'n_estimators': 200

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  2.0min finished


tfidf_100
[mean: -1.63181, std: 0.01702, params: {'learning_rate': 0.01, 'n_estimators': 50}, mean: -1.62628, std: 0.01451, params: {'learning_rate': 0.01, 'n_estimators': 70}, mean: -1.63911, std: 0.01008, params: {'learning_rate': 0.01, 'n_estimators': 100}, mean: -1.71409, std: 0.00768, params: {'learning_rate': 0.01, 'n_estimators': 200}, mean: -1.73815, std: 0.00566, params: {'learning_rate': 0.05, 'n_estimators': 50}, mean: -1.78558, std: 0.00302, params: {'learning_rate': 0.05, 'n_estimators': 70}, mean: -1.83494, std: 0.00780, params: {'learning_rate': 0.05, 'n_estimators': 100}, mean: -1.91050, std: 0.01665, params: {'learning_rate': 0.05, 'n_estimators': 200}, mean: -1.82660, std: 0.00714, params: {'learning_rate': 0.1, 'n_estimators': 50}, mean: -1.86498, std: 0.01550, params: {'learning_rate': 0.1, 'n_estimators': 70}, mean: -1.90243, std: 0.01871, params: {'learning_rate': 0.1, 'n_estimators': 100}, mean: -1.95188, std: 0.01810, params: {'learning_rate': 0.1, 'n_estimators

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  5.3min finished


w2v_300
[mean: -1.63117, std: 0.01710, params: {'learning_rate': 0.01, 'n_estimators': 50}, mean: -1.62875, std: 0.01166, params: {'learning_rate': 0.01, 'n_estimators': 70}, mean: -1.63937, std: 0.00645, params: {'learning_rate': 0.01, 'n_estimators': 100}, mean: -1.70599, std: 0.00667, params: {'learning_rate': 0.01, 'n_estimators': 200}, mean: -1.72833, std: 0.00469, params: {'learning_rate': 0.05, 'n_estimators': 50}, mean: -1.77080, std: 0.00521, params: {'learning_rate': 0.05, 'n_estimators': 70}, mean: -1.82058, std: 0.00552, params: {'learning_rate': 0.05, 'n_estimators': 100}, mean: -1.89251, std: 0.01550, params: {'learning_rate': 0.05, 'n_estimators': 200}, mean: -1.81289, std: 0.00968, params: {'learning_rate': 0.1, 'n_estimators': 50}, mean: -1.84955, std: 0.01592, params: {'learning_rate': 0.1, 'n_estimators': 70}, mean: -1.88367, std: 0.01698, params: {'learning_rate': 0.1, 'n_estimators': 100}, mean: -1.92261, std: 0.02117, params: {'learning_rate': 0.1, 'n_estimators':



In [None]:
# Grid search of the random forest depth and size on all three feature sets
# (negative log-loss on `kf`). The forest is seeded like every other model in
# this notebook so that repeated searches give the same ranking.
clf_dt=RandomForestClassifier(random_state=26)
param_test={
    "max_depth":[10,15,20,25,30],
    "n_estimators":[200,300,400]
}
for name in ["bio","tfidf_100","w2v_300"]:
    # NOTE(review): `iid` is kept so scores stay comparable with the recorded
    # ones; it was removed from GridSearchCV in later scikit-learn releases.
    gsearch=GridSearchCV(estimator=clf_dt, param_grid = param_test,scoring="neg_log_loss",n_jobs=-1,iid=False, cv=kf,verbose=True)
    gsearch.fit(np.array(work_train_final[name]),y)
    print(name)
    # `grid_scores_` no longer exists in scikit-learn >= 0.20, which made this
    # cell fail only after the whole search had finished. `cv_results_` is
    # available in every release that ships `sklearn.model_selection`.
    results = gsearch.cv_results_
    for mean, std, params in zip(results["mean_test_score"], results["std_test_score"], results["params"]):
        print("mean: %.5f, std: %.5f, params: %r" % (mean, std, params))
    print(gsearch.best_params_)
    print(gsearch.best_score_)
#bio : 
#tfidf : {'max_depth': 15, 'n_estimators': 300} -0.927157392031
#w2v :{'max_depth': 15, 'n_estimators': 300} -0.972048498265

Fitting 5 folds for each of 15 candidates, totalling 75 fits


# TRAINING PHASE ALL ALGOS 

In [23]:
def model_gen(X,X_test,y,classifier,file,five_fold_predict=True):
    """Write test-set and out-of-fold class probabilities for one classifier.

    Relies on notebook-level names: ``kf`` (the cross-validation splitter),
    ``ID_train`` and ``ID_test`` (ID columns attached to the outputs).

    Parameters
    ----------
    X, X_test : array
        Train and test feature matrices; ``X`` is indexed with the integer
        index arrays produced by ``kf.split``.
    y : array
        Training labels, indexed the same way as ``X``.
    classifier : estimator
        Object exposing ``fit`` and ``predict_proba``. It is fitted in place.
    file : str
        Name inserted into both output file names.
    five_fold_predict : bool, default True
        If True, fit on the training part of every ``kf`` fold and average the
        test probabilities over the folds. If False, fit once on all of ``X``
        (this path previously failed on unbound variables).

    Side effects
    ------------
    Writes ``new_scores/new_stack_test/new_stem_<file>.csv`` (test
    probabilities) and ``new_scores/new_stack_train/new_stem_<file>.csv``
    (probabilities from ``cross_val_predict`` on the training set), creating
    the directories when they are missing.
    """
    if five_fold_predict:
        # Take the fold count from the splitter instead of hard-coding 5, so
        # the average stays correct if `kf` is reconfigured.
        n_splits = kf.get_n_splits()
        y_test = 0
        for fold, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
            X_train, X_valid = X[train_index], X[test_index]
            y_train = y[train_index]

            print("Fold", fold, X_train.shape, X_valid.shape)

            classifier.fit(X_train, y_train)
            y_test += classifier.predict_proba(X_test) / n_splits
    else:
        # Single fit on the full training set.
        classifier.fit(X, y)
        y_test = classifier.predict_proba(X_test)

    classes = "class1,class2,class3,class4,class5,class6,class7,class8,class9".split(',')

    test_path = "new_scores/new_stack_test/new_stem_{}.csv".format(file)
    os.makedirs(os.path.dirname(test_path), exist_ok=True)
    subm = pd.DataFrame(y_test, columns=classes)
    subm['ID'] = ID_test
    subm.to_csv(test_path,index=False)

    print("cross_val sur train ") # maybe converting to an array is only needed for xgb

    # cross_val_predict is called with the same splitter `kf`, so these are
    # the out-of-fold predictions matching the folds used above.
    y_pred=cross_val_predict(estimator=classifier,X=X,y=y,cv=kf,method="predict_proba")
    train_path = "new_scores/new_stack_train/new_stem_{}.csv".format(file)
    os.makedirs(os.path.dirname(train_path), exist_ok=True)
    subm1 = pd.DataFrame(y_pred, columns=classes)
    subm1['ID'] = ID_train
    subm1.to_csv(train_path,index=False)


In [138]:
# Final estimators, one per (algorithm, feature set). Each key is
# "<algorithm>_<feature suffix>" and is also used as the output file name.
# NOTE(review): some settings differ from the grid-search notes recorded in the
# cells above (e.g. lgbm_tfidf colsample_bytree 0.8 vs 0.9, rf_tfidf 400/20 vs
# 300/15, ada_bio n_estimators 50 vs 70) -- confirm which values are intended.
dic_xgb={"xgb_bio":XGBClassifier(colsample_bytree=0.8,max_depth=7,min_child_weight=3,n_estimators=100,subsample=0.8,objective="multi:softprob",seed=26),
        "xgb_tfidf":XGBClassifier(colsample_bytree=0.8,max_depth=5,min_child_weight=3,n_estimators=100,subsample=1,objective="multi:softprob",seed=26),
        "xgb_w2v":XGBClassifier(colsample_bytree=0.8,max_depth=5,min_child_weight=3,n_estimators=100,subsample=0.8,objective="multi:softprob",seed=26)}

dic_lgbm={"lgbm_bio":LGBMClassifier(boosting_type="gbdt",colsample_bytree=0.8,n_estimators=48,num_leaves=22,subsample=0.8,seed=26),
        "lgbm_tfidf":LGBMClassifier(boosting_type="gbdt",colsample_bytree=0.8,n_estimators=48,num_leaves=22,subsample=0.8,seed=26),
        "lgbm_w2v":LGBMClassifier(boosting_type="gbdt",colsample_bytree=0.8,n_estimators=48,num_leaves=22,subsample=0.8,seed=26)}
# The logistic-regression grid search selected penalty="l2" for every feature
# set (the C values below already match it); "l1" was used here by mistake.
# The solver is pinned to liblinear, the default when that search was run.
dic_lr={"lr_bio":LogisticRegression(C=10,penalty="l2",solver="liblinear"),
        "lr_tfidf":LogisticRegression(C=10,penalty="l2",solver="liblinear"),
        "lr_w2v":LogisticRegression(C=1,penalty="l2",solver="liblinear")}
dic_ada={"ada_bio":AdaBoostClassifier(n_estimators=50, learning_rate=0.01, algorithm="SAMME.R", random_state=26),
        "ada_tfidf":AdaBoostClassifier(n_estimators=70, learning_rate=0.01, algorithm="SAMME.R", random_state=26),
        "ada_w2v":AdaBoostClassifier(n_estimators=50, learning_rate=0.01, algorithm="SAMME.R", random_state=26)}
dic_rf={"rf_bio":RandomForestClassifier(n_estimators=400,max_depth=25,random_state=26),
        "rf_tfidf":RandomForestClassifier(n_estimators=400,max_depth=20,random_state=26),
        "rf_w2v":RandomForestClassifier(n_estimators=300,max_depth=15,random_state=26)}

# Explicit model-suffix -> feature-set mapping. The previous version zipped the
# keys of two independent dicts, which pairs a model with the right features
# only if both dicts happen to iterate in the same order.
features_for_suffix = {"bio": "bio", "tfidf": "tfidf_100", "w2v": "w2v_300"}

model_families = [
    ("xgboost here", dic_xgb),
    ("lgbm here", dic_lgbm),
    ("logreg here", dic_lr),
    ("adaboost here", dic_ada),
    ("random forest here", dic_rf),
]
for banner, estimators in model_families:
    print(banner)
    for model_name, estimator in estimators.items():
        # "xgb_tfidf" -> "tfidf" -> "tfidf_100"
        name = features_for_suffix[model_name.split("_", 1)[1]]
        model_gen(X=np.array(work_train_final[name]),X_test=np.array(work_test_final[name]),y=y,classifier=estimator,file=model_name)



xgboost here
Fold 1 (2946, 142) (743, 142)
Fold 2 (2949, 142) (740, 142)
Fold 3 (2952, 142) (737, 142)
Fold 4 (2954, 142) (735, 142)
Fold 5 (2955, 142) (734, 142)
cross_val sur train 
Fold 1 (2946, 142) (743, 142)
Fold 2 (2949, 142) (740, 142)
Fold 3 (2952, 142) (737, 142)
Fold 4 (2954, 142) (735, 142)
Fold 5 (2955, 142) (734, 142)
cross_val sur train 
Fold 1 (2946, 243) (743, 243)
Fold 2 (2949, 243) (740, 243)
Fold 3 (2952, 243) (737, 243)
Fold 4 (2954, 243) (735, 243)
Fold 5 (2955, 243) (734, 243)
cross_val sur train 
lgbm here
Fold 1 (2946, 142) (743, 142)
Fold 2 (2949, 142) (740, 142)
Fold 3 (2952, 142) (737, 142)
Fold 4 (2954, 142) (735, 142)
Fold 5 (2955, 142) (734, 142)
cross_val sur train 
Fold 1 (2946, 142) (743, 142)
Fold 2 (2949, 142) (740, 142)
Fold 3 (2952, 142) (737, 142)
Fold 4 (2954, 142) (735, 142)
Fold 5 (2955, 142) (734, 142)
cross_val sur train 
Fold 1 (2946, 243) (743, 243)
Fold 2 (2949, 243) (740, 243)
Fold 3 (2952, 243) (737, 243)
Fold 4 (2954, 243) (735, 243)
Fo

In [None]:
# Show only the shape of each combined training table instead of dumping the
# full DataFrames into the notebook output.
{name: frame.shape for name, frame in work_train_final.items()}