In [102]:
import numpy as np
import pandas as pd
import scipy.sparse as ssp
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics, model_selection
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_predict,cross_val_score
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
import lightgbm
from lightgbm.sklearn import LGBMClassifier
import os 
import re
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier 

# I will use cross_val_score on XGBoost to select 100,200 or 300 for each preprocess

In [109]:
work_train_w2v = {} 
work_test_w2v = {}
pre_process=["w2v_100.csv","w2v_200.csv","w2v_300.csv"]
path="checkpoints_databases/"
for f in pre_process:
    work_train_w2v[re.sub("\.csv","",f)] = pd.read_csv(path+"w_working_train_"+f)
    work_test_w2v[re.sub("\.csv","",f)] = pd.read_csv(path+"w_working_test_"+f)

In [110]:
work_train_d2v = {} 
work_test_d2v = {}
pre_process=["d2v_100.csv","d2v_200.csv","d2v_300.csv"]
path="checkpoints_databases/"
for f in pre_process:
    work_train_d2v[re.sub("\.csv","",f)] = pd.read_csv(path+"w_working_train_"+f)
    work_test_d2v[re.sub("\.csv","",f)] = pd.read_csv(path+"w_working_test_"+f)

In [111]:
work_train_tfidf = {} 
work_test_tfidf = {}
pre_process=["tfidf_tsvd_100.csv","tfidf_tsvd_200.csv","tfidf_tsvd_300.csv"]
path="checkpoints_databases/"
for f in pre_process:
    work_train_tfidf[re.sub("\.csv","",f)] = pd.read_csv(path+"w_working_train_"+f)
    work_test_tfidf[re.sub("\.csv","",f)] = pd.read_csv(path+"w_working_test_"+f)

In [112]:
new_train_v=pd.read_csv("../bases/new_training_variants.csv",header=None)

In [113]:
y=np.array(new_train_v.iloc[:,0])-1

In [114]:
train = pd.read_csv('../bases/new_training_variants.csv').reset_index()
train.columns=[["Tempo_ID","Class"]]
test = pd.read_csv('../bases/new_test_variants.csv')
ID_train=train.Tempo_ID
ID_test=test.ID
del train,test

In [115]:
kf = model_selection.StratifiedKFold(n_splits=5, random_state=26, shuffle=True)

Start 

In [120]:
clf_xgb=XGBClassifier(max_depth=5, objective="multi:softprob",seed=26)
for name in work_train_w2v:
    h=cross_val_score(clf_xgb,np.array(work_train_w2v[name].drop("ID",axis=1)),y,cv=kf,n_jobs=-1,scoring="neg_log_loss")
    print("mean"+name+" "+str(h.mean()),
         "std:"+name+" "+str(h.std()))
#200 wins

meanw2v_100 -0.9226482822 std:w2v_100 0.0461198601161
meanw2v_200 -0.921885803638 std:w2v_200 0.050552558084
meanw2v_300 -0.925538385872 std:w2v_300 0.0442563156976


In [121]:
for name in work_train_d2v:
    h=cross_val_score(clf_xgb,np.array(work_train_d2v[name].drop("ID",axis=1)),y,cv=kf,n_jobs=-1,scoring="neg_log_loss")
    print("mean"+name+" "+str(h.mean()),
         "std:"+name+" "+str(h.std()))

meand2v_100 -1.0462248149 std:d2v_100 0.0430902269406
meand2v_200 -1.05292029529 std:d2v_200 0.0368450376392
meand2v_300 -1.05027731603 std:d2v_300 0.0412608589716


In [122]:
for name in work_train_tfidf:
    h=cross_val_score(clf_xgb,np.array(work_train_tfidf[name].drop("ID",axis=1)),y,cv=kf,n_jobs=-1,scoring="neg_log_loss")
    print("mean"+name+" "+str(h.mean()),
         "std:"+name+" "+str(h.std()))

meantfidf_tsvd_100 -0.900147502713 std:tfidf_tsvd_100 0.0472139830118
meantfidf_tsvd_200 -0.906877603924 std:tfidf_tsvd_200 0.0525136319305
meantfidf_tsvd_300 -0.920302415821 std:tfidf_tsvd_300 0.0544607246809


# GRID SEARCH PHASE ALL ALGOS

In [123]:
work_train= {} 
work_test = {}
pre_process=["d2v_100.csv","tfidf_tsvd_100.csv","w2v_200.csv"]
path="checkpoints_databases/"
for f in pre_process:
    work_train[re.sub("\.csv","",f)] = pd.read_csv(path+"w_working_train_"+f)
    work_test[re.sub("\.csv","",f)] = pd.read_csv(path+"w_working_test_"+f)

In [125]:
clf_xgb=XGBClassifier(max_depth=3, objective="multi:softprob",seed=26)
param_test= {
    "max_depth" : [3,5,7],
    "min_child_weight" : [1,3],
    "n_estimators" : [100,200],
    "subsample":[0.8,1],
    "colsample_bytree":[0.8,1]
}
for name in work_train:
    gsearch=GridSearchCV(estimator=clf_xgb, param_grid = param_test,scoring="neg_log_loss",n_jobs=-1,iid=False, cv=kf,verbose=True)
    gsearch.fit(work_train[name],y)
    print(name)
    print(gsearch.grid_scores_)
    print(gsearch.best_params_)
    print(gsearch.best_score_)
#d2v : {'colsample_bytree': 0.8, 'max_depth': 7, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 0.8}
#tfidf: {'colsample_bytree': 0.8, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 1}
#w2v: {'colsample_bytree': 0.8, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 0.8}

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 47.8min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 67.7min finished


d2v_100
[mean: -1.11108, std: 0.03354, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.8}, mean: -1.13054, std: 0.03580, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 1}, mean: -1.05929, std: 0.04329, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.8}, mean: -1.08905, std: 0.04607, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 1}, mean: -1.11455, std: 0.03351, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 0.8}, mean: -1.13118, std: 0.03133, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 1}, mean: -1.06214, std: 0.04218, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 200, 'subsample': 0.

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 68.5min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 95.3min finished


tfidf_tsvd_100
[mean: -0.94909, std: 0.04060, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.8}, mean: -0.96358, std: 0.04977, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 1}, mean: -0.90465, std: 0.05220, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.8}, mean: -0.91299, std: 0.05994, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 1}, mean: -0.95030, std: 0.04348, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 0.8}, mean: -0.96771, std: 0.04616, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 1}, mean: -0.90046, std: 0.05170, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 200, 'subsamp

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 13.1min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 93.9min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 128.2min finished


w2v_200
[mean: -0.96871, std: 0.03157, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.8}, mean: -0.98318, std: 0.03451, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 1}, mean: -0.93639, std: 0.04124, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.8}, mean: -0.94309, std: 0.04233, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 1}, mean: -0.96978, std: 0.03174, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 0.8}, mean: -0.98225, std: 0.03673, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 1}, mean: -0.93131, std: 0.03714, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 200, 'subsample': 0.



In [128]:
y

array([0, 1, 1, ..., 5, 3, 0], dtype=int64)

In [131]:
clf_lgbm=LGBMClassifier(seed=26)
param_test= {
    'n_estimators': [8,24,48],
    'num_leaves': [6,12,16,22],
    'boosting_type' : ['gbdt'],
    'colsample_bytree' : [0.7,0.8],
    'subsample' : [0.7,0.8]
    }
for name in work_train:
    gsearch=GridSearchCV(estimator=clf_lgbm, param_grid = param_test,scoring="neg_log_loss",n_jobs=-1,iid=False, cv=kf,verbose=True)
    gsearch.fit(work_train[name],y)
    print(name)
    print(gsearch.grid_scores_)
    print(gsearch.best_params_)
    print(gsearch.best_score_)
#d2v : {'boosting_type': 'gbdt', 'colsample_bytree': 0.8, 'n_estimators': 48, 'num_leaves': 22, 'subsample': 0.8}
#tfidf : {'boosting_type': 'gbdt', 'colsample_bytree': 0.8, 'n_estimators': 48, 'num_leaves': 22, 'subsample': 0.8}
#w2v : {'boosting_type': 'gbdt', 'colsample_bytree': 0.8, 'n_estimators': 48, 'num_leaves': 22, 'subsample': 0.8}

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  5.0min finished


d2v_100
[mean: -1.72350, std: 0.01316, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 6, 'subsample': 0.7}, mean: -1.72508, std: 0.00966, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 6, 'subsample': 0.8}, mean: -1.66288, std: 0.00929, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 12, 'subsample': 0.7}, mean: -1.65835, std: 0.00994, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 12, 'subsample': 0.8}, mean: -1.64491, std: 0.01397, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 16, 'subsample': 0.7}, mean: -1.64375, std: 0.01002, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 16, 'subsample': 0.8}, mean: -1.63286, std: 0.01207, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 2

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   27.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  7.7min finished


tfidf_tsvd_100
[mean: -1.64118, std: 0.00932, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 6, 'subsample': 0.7}, mean: -1.63419, std: 0.00772, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 6, 'subsample': 0.8}, mean: -1.57364, std: 0.01306, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 12, 'subsample': 0.7}, mean: -1.57359, std: 0.01070, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 12, 'subsample': 0.8}, mean: -1.55829, std: 0.01045, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 16, 'subsample': 0.7}, mean: -1.55383, std: 0.01321, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 16, 'subsample': 0.8}, mean: -1.54267, std: 0.00954, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_lea

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   40.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 10.4min finished


w2v_200
[mean: -1.65070, std: 0.01579, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 6, 'subsample': 0.7}, mean: -1.64808, std: 0.01366, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 6, 'subsample': 0.8}, mean: -1.58476, std: 0.01731, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 12, 'subsample': 0.7}, mean: -1.58362, std: 0.01455, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 12, 'subsample': 0.8}, mean: -1.56352, std: 0.01732, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 16, 'subsample': 0.7}, mean: -1.56201, std: 0.01587, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 16, 'subsample': 0.8}, mean: -1.54358, std: 0.01574, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 2



In [132]:
clf_log=LogisticRegression()
param_test= {
    "C" : [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    "penalty" : ["l1","l2"]
}
for name in work_train:
    gsearch=GridSearchCV(estimator=clf_log, param_grid = param_test,scoring="neg_log_loss",n_jobs=-1,iid=False, cv=kf,verbose=True)
    gsearch.fit(work_train[name],y)
    print(name)
    print(gsearch.grid_scores_)
    print(gsearch.best_params_)
    print(gsearch.best_score_)
#d2v : {'C': 10, 'penalty': 'l1'}
#tfidf : {'C': 10, 'penalty': 'l1'}
#w2v : {'C': 1, 'penalty': 'l1'}

Fitting 5 folds for each of 14 candidates, totalling 70 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   14.4s
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:   32.0s finished


d2v_100
[mean: -1.87995, std: 0.03151, params: {'C': 0.001, 'penalty': 'l1'}, mean: -1.85573, std: 0.03068, params: {'C': 0.001, 'penalty': 'l2'}, mean: -1.84739, std: 0.03577, params: {'C': 0.01, 'penalty': 'l1'}, mean: -1.72020, std: 0.02569, params: {'C': 0.01, 'penalty': 'l2'}, mean: -1.48798, std: 0.03349, params: {'C': 0.1, 'penalty': 'l1'}, mean: -1.46791, std: 0.02718, params: {'C': 0.1, 'penalty': 'l2'}, mean: -1.29035, std: 0.02694, params: {'C': 1, 'penalty': 'l1'}, mean: -1.35001, std: 0.03076, params: {'C': 1, 'penalty': 'l2'}, mean: -1.27313, std: 0.03242, params: {'C': 10, 'penalty': 'l1'}, mean: -1.33610, std: 0.02708, params: {'C': 10, 'penalty': 'l2'}, mean: -1.32024, std: 0.04357, params: {'C': 100, 'penalty': 'l1'}, mean: -1.33869, std: 0.02864, params: {'C': 100, 'penalty': 'l2'}, mean: -1.42946, std: 0.07314, params: {'C': 1000, 'penalty': 'l1'}, mean: -1.33801, std: 0.03407, params: {'C': 1000, 'penalty': 'l2'}]
{'C': 10, 'penalty': 'l1'}
-1.27313317058
Fitting 5

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   24.8s
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:  1.0min finished


tfidf_tsvd_100
[mean: -1.87995, std: 0.03151, params: {'C': 0.001, 'penalty': 'l1'}, mean: -1.85080, std: 0.03090, params: {'C': 0.001, 'penalty': 'l2'}, mean: -1.84741, std: 0.03574, params: {'C': 0.01, 'penalty': 'l1'}, mean: -1.68271, std: 0.02861, params: {'C': 0.01, 'penalty': 'l2'}, mean: -1.42765, std: 0.03549, params: {'C': 0.1, 'penalty': 'l1'}, mean: -1.36800, std: 0.03071, params: {'C': 0.1, 'penalty': 'l2'}, mean: -1.07534, std: 0.02954, params: {'C': 1, 'penalty': 'l1'}, mean: -1.21761, std: 0.04295, params: {'C': 1, 'penalty': 'l2'}, mean: -1.02607, std: 0.04714, params: {'C': 10, 'penalty': 'l1'}, mean: -1.15177, std: 0.02132, params: {'C': 10, 'penalty': 'l2'}, mean: -1.15456, std: 0.08551, params: {'C': 100, 'penalty': 'l1'}, mean: -1.17112, std: 0.02345, params: {'C': 100, 'penalty': 'l2'}, mean: -1.28787, std: 0.09577, params: {'C': 1000, 'penalty': 'l1'}, mean: -1.16609, std: 0.02813, params: {'C': 1000, 'penalty': 'l2'}]
{'C': 10, 'penalty': 'l1'}
-1.02606986663
Fi

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:  5.2min finished


w2v_200
[mean: -1.87995, std: 0.03150, params: {'C': 0.001, 'penalty': 'l1'}, mean: -1.68379, std: 0.02716, params: {'C': 0.001, 'penalty': 'l2'}, mean: -1.79280, std: 0.03501, params: {'C': 0.01, 'penalty': 'l1'}, mean: -1.41419, std: 0.02401, params: {'C': 0.01, 'penalty': 'l2'}, mean: -1.32184, std: 0.02976, params: {'C': 0.1, 'penalty': 'l1'}, mean: -1.20606, std: 0.02624, params: {'C': 0.1, 'penalty': 'l2'}, mean: -1.08685, std: 0.03299, params: {'C': 1, 'penalty': 'l1'}, mean: -1.11367, std: 0.03000, params: {'C': 1, 'penalty': 'l2'}, mean: -1.17279, std: 0.05550, params: {'C': 10, 'penalty': 'l1'}, mean: -1.09626, std: 0.02745, params: {'C': 10, 'penalty': 'l2'}, mean: -1.43475, std: 0.10653, params: {'C': 100, 'penalty': 'l1'}, mean: -1.11114, std: 0.03105, params: {'C': 100, 'penalty': 'l2'}, mean: -1.52478, std: 0.12183, params: {'C': 1000, 'penalty': 'l1'}, mean: -1.11751, std: 0.02290, params: {'C': 1000, 'penalty': 'l2'}]
{'C': 1, 'penalty': 'l1'}
-1.08684517019




In [None]:
clf_ada=AdaBoostClassifier(n_estimators=100, learning_rate=0.3, algorithm="SAMME.R", random_state=26)
param_test={
    "n_estimators":[50,70,100,200],
    "learning_rate":[0.01,0.05,0.1,0.2]
}
for name in work_train:
    gsearch=GridSearchCV(estimator=clf_ada, param_grid = param_test,scoring="neg_log_loss",n_jobs=-1,iid=False, cv=kf,verbose=True)
    gsearch.fit(work_train[name],y)
    print(name)
    print(gsearch.grid_scores_)
    print(gsearch.best_params_)
    print(gsearch.best_score_)
#d2v : {'learning_rate': 0.01, 'n_estimators': 50}
#tfidf : {'learning_rate': 0.01, 'n_estimators': 70}
#w2v :

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  1.8min finished


d2v_100
[mean: -1.65296, std: 0.01054, params: {'learning_rate': 0.01, 'n_estimators': 50}, mean: -1.66202, std: 0.01171, params: {'learning_rate': 0.01, 'n_estimators': 70}, mean: -1.67707, std: 0.00837, params: {'learning_rate': 0.01, 'n_estimators': 100}, mean: -1.74868, std: 0.00589, params: {'learning_rate': 0.01, 'n_estimators': 200}, mean: -1.76750, std: 0.01190, params: {'learning_rate': 0.05, 'n_estimators': 50}, mean: -1.81192, std: 0.00694, params: {'learning_rate': 0.05, 'n_estimators': 70}, mean: -1.86908, std: 0.00601, params: {'learning_rate': 0.05, 'n_estimators': 100}, mean: -1.95856, std: 0.01512, params: {'learning_rate': 0.05, 'n_estimators': 200}, mean: -1.85466, std: 0.00835, params: {'learning_rate': 0.1, 'n_estimators': 50}, mean: -1.89708, std: 0.01286, params: {'learning_rate': 0.1, 'n_estimators': 70}, mean: -1.94094, std: 0.01524, params: {'learning_rate': 0.1, 'n_estimators': 100}, mean: -2.00375, std: 0.02070, params: {'learning_rate': 0.1, 'n_estimators':

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  4.5min finished


tfidf_tsvd_100
[mean: -1.69119, std: 0.01350, params: {'learning_rate': 0.01, 'n_estimators': 50}, mean: -1.67960, std: 0.01474, params: {'learning_rate': 0.01, 'n_estimators': 70}, mean: -1.68008, std: 0.01395, params: {'learning_rate': 0.01, 'n_estimators': 100}, mean: -1.75931, std: 0.01020, params: {'learning_rate': 0.01, 'n_estimators': 200}, mean: -1.77863, std: 0.00945, params: {'learning_rate': 0.05, 'n_estimators': 50}, mean: -1.83377, std: 0.00477, params: {'learning_rate': 0.05, 'n_estimators': 70}, mean: -1.88622, std: 0.00826, params: {'learning_rate': 0.05, 'n_estimators': 100}, mean: -1.96414, std: 0.01184, params: {'learning_rate': 0.05, 'n_estimators': 200}, mean: -1.87143, std: 0.00810, params: {'learning_rate': 0.1, 'n_estimators': 50}, mean: -1.91114, std: 0.01360, params: {'learning_rate': 0.1, 'n_estimators': 70}, mean: -1.94986, std: 0.01225, params: {'learning_rate': 0.1, 'n_estimators': 100}, mean: -1.99855, std: 0.01736, params: {'learning_rate': 0.1, 'n_estim

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  7.9min finished


w2v_200
[mean: -1.70180, std: 0.00976, params: {'learning_rate': 0.01, 'n_estimators': 50}, mean: -1.70363, std: 0.00857, params: {'learning_rate': 0.01, 'n_estimators': 70}, mean: -1.71454, std: 0.00808, params: {'learning_rate': 0.01, 'n_estimators': 100}, mean: -1.77034, std: 0.00644, params: {'learning_rate': 0.01, 'n_estimators': 200}, mean: -1.78299, std: 0.00727, params: {'learning_rate': 0.05, 'n_estimators': 50}, mean: -1.82462, std: 0.00758, params: {'learning_rate': 0.05, 'n_estimators': 70}, mean: -1.86353, std: 0.00917, params: {'learning_rate': 0.05, 'n_estimators': 100}, mean: -1.92495, std: 0.00813, params: {'learning_rate': 0.05, 'n_estimators': 200}, mean: -1.85202, std: 0.01046, params: {'learning_rate': 0.1, 'n_estimators': 50}, mean: -1.88328, std: 0.01044, params: {'learning_rate': 0.1, 'n_estimators': 70}, mean: -1.91424, std: 0.00878, params: {'learning_rate': 0.1, 'n_estimators': 100}, mean: -1.95399, std: 0.00984, params: {'learning_rate': 0.1, 'n_estimators':



In [None]:
clf_dt=RandomForestClassifier()
param_test={
    "max_depth":[5,7,10,15],
    "n_estimators":[100,200,300]
}
for name in work_train:
    gsearch=GridSearchCV(estimator=clf_dt, param_grid = param_test,scoring="neg_log_loss",n_jobs=-1,iid=False, cv=kf,verbose=True)
    gsearch.fit(work_train[name],y)
    print(name)
    print(gsearch.grid_scores_)
    print(gsearch.best_params_)
    print(gsearch.best_score_)
#d2v : 10 depth,200 estimators
#tfidf : 5, 300estimators
#w2v : 5,300

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   28.3s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   44.6s finished


d2v_100
[mean: -1.55826, std: 0.01733, params: {'max_depth': 5, 'n_estimators': 100}, mean: -1.55886, std: 0.01616, params: {'max_depth': 5, 'n_estimators': 200}, mean: -1.55673, std: 0.01368, params: {'max_depth': 5, 'n_estimators': 300}, mean: -1.48254, std: 0.01626, params: {'max_depth': 7, 'n_estimators': 100}, mean: -1.47604, std: 0.02155, params: {'max_depth': 7, 'n_estimators': 200}, mean: -1.47401, std: 0.01746, params: {'max_depth': 7, 'n_estimators': 300}, mean: -1.37783, std: 0.02589, params: {'max_depth': 10, 'n_estimators': 100}, mean: -1.36793, std: 0.02425, params: {'max_depth': 10, 'n_estimators': 200}, mean: -1.36954, std: 0.02266, params: {'max_depth': 10, 'n_estimators': 300}, mean: -1.25363, std: 0.03150, params: {'max_depth': 15, 'n_estimators': 100}, mean: -1.25172, std: 0.03008, params: {'max_depth': 15, 'n_estimators': 200}, mean: -1.24937, std: 0.03007, params: {'max_depth': 15, 'n_estimators': 300}]
{'max_depth': 15, 'n_estimators': 300}
-1.249370199
Fitting 5

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   54.6s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.5min finished


tfidf_tsvd_100
[mean: -1.41084, std: 0.02028, params: {'max_depth': 5, 'n_estimators': 100}, mean: -1.40828, std: 0.01908, params: {'max_depth': 5, 'n_estimators': 200}, mean: -1.41004, std: 0.02011, params: {'max_depth': 5, 'n_estimators': 300}, mean: -1.27572, std: 0.02359, params: {'max_depth': 7, 'n_estimators': 100}, mean: -1.27412, std: 0.02471, params: {'max_depth': 7, 'n_estimators': 200}, mean: -1.27264, std: 0.02082, params: {'max_depth': 7, 'n_estimators': 300}, mean: -1.12685, std: 0.03308, params: {'max_depth': 10, 'n_estimators': 100}, mean: -1.12175, std: 0.02698, params: {'max_depth': 10, 'n_estimators': 200}, mean: -1.12086, std: 0.02836, params: {'max_depth': 10, 'n_estimators': 300}, mean: -1.01155, std: 0.04847, params: {'max_depth': 15, 'n_estimators': 100}, mean: -1.00405, std: 0.04203, params: {'max_depth': 15, 'n_estimators': 200}, mean: -0.99328, std: 0.03083, params: {'max_depth': 15, 'n_estimators': 300}]
{'max_depth': 15, 'n_estimators': 300}
-0.993284313502

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  2.1min finished


# TRAINING PHASE ALL ALGOS 

In [8]:
def model_gen(X,X_test,y,classifier,file,five_fold_predict=True):
    #if not os.path.exists("scores/"+file):
    #   os.makedirs("scores/"+file)
    if five_fold_predict:
        fold = 0
        y_test=0
        for train_index, test_index in kf.split(X, y):
        
            fold += 1

            X_train, X_valid    = X[train_index],   X[test_index]
            y_train, y_valid    = y[train_index],   y[test_index]

            print("Fold", fold, X_train.shape, X_valid.shape)

            clf=classifier
            clf.fit(X_train,y_train)
            p_test = clf.predict_proba(X_test)
            y_test += p_test/5

    classes = "class1,class2,class3,class4,class5,class6,class7,class8,class9".split(',')
    subm = pd.DataFrame(y_test, columns=classes)
    subm['ID'] = ID_test
    
    subm.to_csv("w_scores/w_stack_test/w_{}.csv".format(file),index=False)
    
    print("cross_val sur train ") #peut etre que to array est exclusivement pour les xgb
    
    if os.path.isfile("w_scores/w_stack_train/w_{}.csv".format(file)):
        print("not necessary, already done")
    else:
        y_pred=cross_val_predict(estimator=clf,X=X,y=y,cv=kf,method="predict_proba")
        subm1 = pd.DataFrame(y_pred, columns=classes)
        subm1['ID'] = ID_train
        subm1.to_csv("w_scores/w_stack_train/w_{}.csv".format(file),index=False)


In [None]:
dic_xgb={"xgb_d2v":XGBClassifier(max_depth=3,objective="multi:softprob",seed=26),
        "xgb_tfidf":XGBClassifier(max_depth=3,objective="multi:softprob",seed=26),
        "xgb_w2v":XGBClassifier(max_depth=3,objective="multi:softprob",seed=26)}
dic_lgbm={"lgbm_d2v":LGBMClassifier(seed=26),
        "lgbm_tfidf":LGBMClassifier(seed=26),
        "lgbm_w2v":LGBMClassifier(seed=26)}
dic_lr={"lr_d2v":LogisticRegression(),
        "lr_tfidf":LogisticRegression(),
        "lr_w2v":LogisticRegression()}
dic_ada={"ada_d2v":AdaBoostClassifier(n_estimators=100, learning_rate=1.0, algorithm="SAMME.R", random_state=26),
        "ada_tfidf":AdaBoostClassifier(n_estimators=100, learning_rate=1.0, algorithm="SAMME.R", random_state=26),
        "ada_w2v":AdaBoostClassifier(n_estimators=100, learning_rate=1.0, algorithm="SAMME.R", random_state=26)}
dic_rf={"rf_d2v":RandomForestClassifier(n_estimators=200,max_depth=10,random_state=26),
        "rf_tfidf":RandomForestClassifier(n_estimators=200,max_depth=10,random_state=26),
        "rf_w2v":RandomForestClassifier(n_estimators=200,max_depth=10,random_state=26)}

print("xgboost here")
for clf,name in zip(dic_xgb.keys(),work_train.keys()):
    model_gen(X=work_train[name],X_test=work_test[name],y=y,classifier=dic_xgb[clf],file=clf)
print("lgbm here")
for clf,name in zip(dic_lgbm.keys(),work_train.keys()):
    model_gen(X=work_train[name],X_test=work_test[name],y=y,classifier=dic_lgbm[clf],file=clf)
print("logreg here")
for clf,name in zip(dic_lr.keys(),work_train.keys()):
    model_gen(X=work_train[name],X_test=work_test[name],y=y,classifier=dic_lr[clf],file=clf)
print("adaboost here")
for clf,name in zip(dic_ada.keys(),work_train.keys()):
    model_gen(X=work_train[name],X_test=work_test[name],y=y,classifier=dic_ada[clf],file=clf)
print("random forest here")
for clf,name in zip(dic_rf.keys(),work_train.keys()):
        model_gen(X=work_train[name],X_test=work_test[name],y=y,classifier=dic_rf[clf],file=clf)