In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as ssp

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics, model_selection
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_predict,cross_val_score
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
import lightgbm
from lightgbm.sklearn import LGBMClassifier
import os 
import re
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier



# I will use cross_val_score on XGBoost to select 100,200 or 300 for each preprocess

In [29]:
work_train_w2v = {} 
work_test_w2v = {}
pre_process=["w2v_100.csv","w2v_200.csv","w2v_300.csv"]
path="checkpoints_databases/"
for f in pre_process:
    work_train_w2v[re.sub("\.csv","",f)] = pd.read_csv(path+"nw_working_train_"+f)
    work_test_w2v[re.sub("\.csv","",f)] = pd.read_csv(path+"nw_working_test_"+f)

In [30]:
work_train_d2v = {} 
work_test_d2v = {}
pre_process=["d2v_100.csv","d2v_200.csv","d2v_300.csv"]
path="checkpoints_databases/"
for f in pre_process:
    work_train_d2v[re.sub("\.csv","",f)] = pd.read_csv(path+"nw_working_train_"+f)
    work_test_d2v[re.sub("\.csv","",f)] = pd.read_csv(path+"nw_working_test_"+f)

In [28]:
work_train_tfidf = {} 
work_test_tfidf = {}
pre_process=["tfidf_tsvd_100.csv","tfidf_tsvd_200.csv","tfidf_tsvd_300.csv"]
path="checkpoints_databases/"
for f in pre_process:
    work_train_tfidf[re.sub("\.csv","",f)] = pd.read_csv(path+"nw_working_train_"+f)
    work_test_tfidf[re.sub("\.csv","",f)] = pd.read_csv(path+"nw_working_test_"+f)

In [31]:
new_train_v=pd.read_csv("../bases/new_training_variants.csv",header=None)

In [32]:
y=np.array(new_train_v.iloc[:,0])-1

In [33]:
train = pd.read_csv('../bases/new_training_variants.csv').reset_index()
train.columns=[["Tempo_ID","Class"]]
test = pd.read_csv('../bases/new_test_variants.csv')
ID_train=train.Tempo_ID
ID_test=test.ID
del train,test

In [34]:
kf = model_selection.StratifiedKFold(n_splits=5, random_state=26, shuffle=True)

In [35]:
clf_xgb=XGBClassifier(max_depth=5, objective="multi:softprob",seed=26)
for name in work_train_w2v:
    h=cross_val_score(clf_xgb,np.array(work_train_w2v[name].drop("ID",axis=1)),y,cv=kf,n_jobs=-1,scoring="neg_log_loss")
    print("mean"+name+" "+str(h.mean()),
         "std:"+name+" "+str(h.std()))

meanw2v_100 -0.895570431786 std:w2v_100 0.0548596096564
meanw2v_200 -0.90597903088 std:w2v_200 0.0509971236521
meanw2v_300 -0.913247864735 std:w2v_300 0.040367728267


In [36]:
for name in work_train_d2v:
    h=cross_val_score(clf_xgb,np.array(work_train_d2v[name].drop("ID",axis=1)),y,cv=kf,n_jobs=-1,scoring="neg_log_loss")
    print("mean"+name+" "+str(h.mean()),
         "std:"+name+" "+str(h.std()))

meand2v_100 -1.02297684649 std:d2v_100 0.0567970281113
meand2v_200 -1.02919691932 std:d2v_200 0.050459167505
meand2v_300 -1.03101971684 std:d2v_300 0.0491266926944


In [37]:
for name in work_train_tfidf:
    h=cross_val_score(clf_xgb,np.array(work_train_tfidf[name].drop("ID",axis=1)),y,cv=kf,n_jobs=-1,scoring="neg_log_loss")
    print("mean"+name+" "+str(h.mean()),
         "std:"+name+" "+str(h.std()))

meantfidf_tsvd_100 -0.889870181278 std:tfidf_tsvd_100 0.0535310479521
meantfidf_tsvd_200 -0.903077065542 std:tfidf_tsvd_200 0.045128434583
meantfidf_tsvd_300 -0.905181327301 std:tfidf_tsvd_300 0.0512618176122


# GRID SEARCH PHASE ALL ALGOS

In [52]:
work_train= {} 
work_test = {}
pre_process=["d2v_100.csv","tfidf_tsvd_100.csv","w2v_100.csv"]
path="checkpoints_databases/"
for f in pre_process:
    work_train[re.sub("\.csv","",f)] = pd.read_csv(path+"nw_working_train_"+f)
    work_test[re.sub("\.csv","",f)] = pd.read_csv(path+"nw_working_test_"+f)

In [42]:
clf_xgboost=XGBClassifier(max_depth=3, objective="multi:softprob",seed=26)
param_test= {
    "max_depth" : [3,5,7],
    "min_child_weight" : [1,3],
    "n_estimators" : [100,200],
    "subsample":[0.8,1],
    "colsample_bytree":[0.8,1]
}
for name in work_train:
    gsearch=GridSearchCV(estimator=clf_xgboost, param_grid = param_test,scoring="neg_log_loss",n_jobs=-1,iid=False, cv=kf,verbose=True)
    gsearch.fit(work_train[name],y)
    print(name)
    print(gsearch.grid_scores_)
    print(gsearch.best_params_)
    print(gsearch.best_score_)
#d2v : {'colsample_bytree': 0.8, 'max_depth': 7, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 0.8}
#tfidf : {'colsample_bytree': 0.8, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 1}
#w2v : {'colsample_bytree': 1, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 1}

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 48.4min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 67.5min finished


d2v_100
[mean: -1.10183, std: 0.04434, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.8}, mean: -1.11608, std: 0.03952, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 1}, mean: -1.03937, std: 0.05671, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.8}, mean: -1.06731, std: 0.05026, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 1}, mean: -1.10267, std: 0.04157, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 0.8}, mean: -1.11616, std: 0.03778, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 1}, mean: -1.04035, std: 0.05765, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 200, 'subsample': 0.

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 67.4min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 94.2min finished


tfidf_tsvd_100
[mean: -0.91554, std: 0.04153, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.8}, mean: -0.92413, std: 0.04066, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 1}, mean: -0.88847, std: 0.05802, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.8}, mean: -0.88659, std: 0.05416, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 1}, mean: -0.91610, std: 0.04252, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 0.8}, mean: -0.92441, std: 0.03528, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 1}, mean: -0.88585, std: 0.05543, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 200, 'subsamp

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 12.5min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 90.6min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 125.8min finished


w2v_100
[mean: -0.93032, std: 0.03761, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.8}, mean: -0.94576, std: 0.03704, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 1}, mean: -0.89707, std: 0.04870, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.8}, mean: -0.90587, std: 0.04871, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 1}, mean: -0.93094, std: 0.03437, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 0.8}, mean: -0.94363, std: 0.03466, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 1}, mean: -0.89250, std: 0.04405, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 200, 'subsample': 0.



In [44]:
clf_lgbm=LGBMClassifier(seed=26)
param_test= {
    'n_estimators': [8,24,48],
    'num_leaves': [6,12,16,22],
    'boosting_type' : ['gbdt'],
    'colsample_bytree' : [0.7,0.8],
    'subsample' : [0.7,0.8]
    }
for name in work_train:
    gsearch=GridSearchCV(estimator=clf_lgbm, param_grid = param_test,scoring="neg_log_loss",n_jobs=-1,iid=False, cv=kf,verbose=True)
    gsearch.fit(work_train[name],y)
    print(name)
    print(gsearch.grid_scores_)
    print(gsearch.best_params_)
    print(gsearch.best_score_)
#d2v : {'boosting_type': 'gbdt', 'colsample_bytree': 0.8, 'n_estimators': 48, 'num_leaves': 22, 'subsample': 0.8}
#tfidf : {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 48, 'num_leaves': 22, 'subsample': 0.8}
#w2v : {'boosting_type': 'gbdt', 'colsample_bytree': 0.8, 'n_estimators': 48, 'num_leaves': 22, 'subsample': 0.8}

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   24.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  5.5min finished


d2v_100
[mean: -1.71471, std: 0.00934, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 6, 'subsample': 0.7}, mean: -1.71198, std: 0.00720, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 6, 'subsample': 0.8}, mean: -1.65478, std: 0.02224, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 12, 'subsample': 0.7}, mean: -1.65026, std: 0.01736, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 12, 'subsample': 0.8}, mean: -1.63703, std: 0.01133, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 16, 'subsample': 0.7}, mean: -1.63596, std: 0.02267, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 16, 'subsample': 0.8}, mean: -1.62014, std: 0.01198, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 2

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   32.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  8.5min finished


tfidf_tsvd_100
[mean: -1.59855, std: 0.01637, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 6, 'subsample': 0.7}, mean: -1.59189, std: 0.01554, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 6, 'subsample': 0.8}, mean: -1.53482, std: 0.01789, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 12, 'subsample': 0.7}, mean: -1.52955, std: 0.01667, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 12, 'subsample': 0.8}, mean: -1.51620, std: 0.01840, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 16, 'subsample': 0.7}, mean: -1.50908, std: 0.01384, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 16, 'subsample': 0.8}, mean: -1.49688, std: 0.01829, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_lea

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   40.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 10.7min finished


w2v_100
[mean: -1.61438, std: 0.01175, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 6, 'subsample': 0.7}, mean: -1.61260, std: 0.00565, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 6, 'subsample': 0.8}, mean: -1.55027, std: 0.01063, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 12, 'subsample': 0.7}, mean: -1.54867, std: 0.00930, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 12, 'subsample': 0.8}, mean: -1.52788, std: 0.01277, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 16, 'subsample': 0.7}, mean: -1.52228, std: 0.01232, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 16, 'subsample': 0.8}, mean: -1.50684, std: 0.01323, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 2



In [45]:
clf_log=LogisticRegression()
param_test= {
    "C" : [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    "penalty" : ["l1","l2"]
}
for name in work_train:
    gsearch=GridSearchCV(estimator=clf_log, param_grid = param_test,scoring="neg_log_loss",n_jobs=-1,iid=False, cv=kf,verbose=True)
    gsearch.fit(work_train[name],y)
    print(name)
    print(gsearch.grid_scores_)
    print(gsearch.best_params_)
    print(gsearch.best_score_)
#d2v : {'C': 10, 'penalty': 'l1'}
#tfidf : {'C': 10, 'penalty': 'l1'}
#w2v : {'C': 10, 'penalty': 'l1'}

Fitting 5 folds for each of 14 candidates, totalling 70 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.0s
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:   43.4s finished


d2v_100
[mean: -1.86803, std: 0.03398, params: {'C': 0.001, 'penalty': 'l1'}, mean: -1.85301, std: 0.03435, params: {'C': 0.001, 'penalty': 'l2'}, mean: -1.85695, std: 0.03592, params: {'C': 0.01, 'penalty': 'l1'}, mean: -1.76132, std: 0.02708, params: {'C': 0.01, 'penalty': 'l2'}, mean: -1.48878, std: 0.03079, params: {'C': 0.1, 'penalty': 'l1'}, mean: -1.70851, std: 0.02299, params: {'C': 0.1, 'penalty': 'l2'}, mean: -1.28133, std: 0.02638, params: {'C': 1, 'penalty': 'l1'}, mean: -1.62934, std: 0.04810, params: {'C': 1, 'penalty': 'l2'}, mean: -1.26483, std: 0.03273, params: {'C': 10, 'penalty': 'l1'}, mean: -1.62548, std: 0.04035, params: {'C': 10, 'penalty': 'l2'}, mean: -1.31990, std: 0.04065, params: {'C': 100, 'penalty': 'l1'}, mean: -1.64046, std: 0.03516, params: {'C': 100, 'penalty': 'l2'}, mean: -1.41898, std: 0.06563, params: {'C': 1000, 'penalty': 'l1'}, mean: -1.63924, std: 0.03499, params: {'C': 1000, 'penalty': 'l2'}]
{'C': 10, 'penalty': 'l1'}
-1.2648347351
Fitting 5 

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   48.5s
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:  2.0min finished


tfidf_tsvd_100
[mean: -1.86803, std: 0.03398, params: {'C': 0.001, 'penalty': 'l1'}, mean: -1.84770, std: 0.03393, params: {'C': 0.001, 'penalty': 'l2'}, mean: -1.85690, std: 0.03590, params: {'C': 0.01, 'penalty': 'l1'}, mean: -1.72502, std: 0.03332, params: {'C': 0.01, 'penalty': 'l2'}, mean: -1.37320, std: 0.03032, params: {'C': 0.1, 'penalty': 'l1'}, mean: -1.54961, std: 0.03515, params: {'C': 0.1, 'penalty': 'l2'}, mean: -1.04443, std: 0.03156, params: {'C': 1, 'penalty': 'l1'}, mean: -1.50064, std: 0.04759, params: {'C': 1, 'penalty': 'l2'}, mean: -0.97721, std: 0.03534, params: {'C': 10, 'penalty': 'l1'}, mean: -1.46547, std: 0.04555, params: {'C': 10, 'penalty': 'l2'}, mean: -1.11089, std: 0.06555, params: {'C': 100, 'penalty': 'l1'}, mean: -1.47910, std: 0.05720, params: {'C': 100, 'penalty': 'l2'}, mean: -1.25983, std: 0.09510, params: {'C': 1000, 'penalty': 'l1'}, mean: -1.47220, std: 0.03731, params: {'C': 1000, 'penalty': 'l2'}]
{'C': 10, 'penalty': 'l1'}
-0.977210584672
F

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:  3.6min finished


w2v_100
[mean: -1.86803, std: 0.03397, params: {'C': 0.001, 'penalty': 'l1'}, mean: -1.73130, std: 0.02830, params: {'C': 0.001, 'penalty': 'l2'}, mean: -1.79844, std: 0.03595, params: {'C': 0.01, 'penalty': 'l1'}, mean: -1.49137, std: 0.02284, params: {'C': 0.01, 'penalty': 'l2'}, mean: -1.32659, std: 0.03685, params: {'C': 0.1, 'penalty': 'l1'}, mean: -1.34454, std: 0.01237, params: {'C': 0.1, 'penalty': 'l2'}, mean: -1.07196, std: 0.04045, params: {'C': 1, 'penalty': 'l1'}, mean: -1.28040, std: 0.04270, params: {'C': 1, 'penalty': 'l2'}, mean: -1.06280, std: 0.05567, params: {'C': 10, 'penalty': 'l1'}, mean: -1.28403, std: 0.04111, params: {'C': 10, 'penalty': 'l2'}, mean: -1.16657, std: 0.09201, params: {'C': 100, 'penalty': 'l1'}, mean: -1.29447, std: 0.02789, params: {'C': 100, 'penalty': 'l2'}, mean: -1.23551, std: 0.11745, params: {'C': 1000, 'penalty': 'l1'}, mean: -1.27845, std: 0.04311, params: {'C': 1000, 'penalty': 'l2'}]
{'C': 10, 'penalty': 'l1'}
-1.06279894516




In [46]:
clf_ada=AdaBoostClassifier(n_estimators=100, learning_rate=1.0, algorithm="SAMME.R", random_state=26)
param_test={
    "n_estimators":[50,100],
    "learning_rate":[0.3,0.5,0.9,1]
}
for name in work_train:
    gsearch=GridSearchCV(estimator=clf_ada, param_grid = param_test,scoring="neg_log_loss",n_jobs=-1,iid=False, cv=kf,verbose=True)
    gsearch.fit(work_train[name],y)
    print(name)
    print(gsearch.grid_scores_)
    print(gsearch.best_params_)
    print(gsearch.best_score_)
#d2v : {'learning_rate': 0.3, 'n_estimators': 50}
#tfidf : {'learning_rate': 0.3, 'n_estimators': 50}
#w2v : {'learning_rate': 0.3, 'n_estimators': 50}


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   46.5s finished


d2v_100
[mean: -1.94169, std: 0.00465, params: {'learning_rate': 0.3, 'n_estimators': 50}, mean: -1.99920, std: 0.01060, params: {'learning_rate': 0.3, 'n_estimators': 100}, mean: -1.96065, std: 0.01302, params: {'learning_rate': 0.5, 'n_estimators': 50}, mean: -2.01419, std: 0.01107, params: {'learning_rate': 0.5, 'n_estimators': 100}, mean: -2.00529, std: 0.02161, params: {'learning_rate': 0.9, 'n_estimators': 50}, mean: -2.04604, std: 0.01959, params: {'learning_rate': 0.9, 'n_estimators': 100}, mean: -2.00737, std: 0.01516, params: {'learning_rate': 1, 'n_estimators': 50}, mean: -2.05062, std: 0.01401, params: {'learning_rate': 1, 'n_estimators': 100}]
{'learning_rate': 0.3, 'n_estimators': 50}
-1.94168942506
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.6min finished


tfidf_tsvd_100
[mean: -1.91750, std: 0.01409, params: {'learning_rate': 0.3, 'n_estimators': 50}, mean: -1.94188, std: 0.01189, params: {'learning_rate': 0.3, 'n_estimators': 100}, mean: -1.95182, std: 0.01644, params: {'learning_rate': 0.5, 'n_estimators': 50}, mean: -1.97111, std: 0.01814, params: {'learning_rate': 0.5, 'n_estimators': 100}, mean: -2.00289, std: 0.02692, params: {'learning_rate': 0.9, 'n_estimators': 50}, mean: -2.06171, std: 0.07002, params: {'learning_rate': 0.9, 'n_estimators': 100}, mean: -2.03082, std: 0.03772, params: {'learning_rate': 1, 'n_estimators': 50}, mean: -2.07098, std: 0.04586, params: {'learning_rate': 1, 'n_estimators': 100}]
{'learning_rate': 0.3, 'n_estimators': 50}
-1.91749502231
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.6min finished


w2v_100
[mean: -1.91682, std: 0.00978, params: {'learning_rate': 0.3, 'n_estimators': 50}, mean: -1.94374, std: 0.00987, params: {'learning_rate': 0.3, 'n_estimators': 100}, mean: -1.95939, std: 0.03143, params: {'learning_rate': 0.5, 'n_estimators': 50}, mean: -1.99379, std: 0.04053, params: {'learning_rate': 0.5, 'n_estimators': 100}, mean: -2.02985, std: 0.02044, params: {'learning_rate': 0.9, 'n_estimators': 50}, mean: -2.03370, std: 0.01915, params: {'learning_rate': 0.9, 'n_estimators': 100}, mean: -2.04482, std: 0.02814, params: {'learning_rate': 1, 'n_estimators': 50}, mean: -2.09304, std: 0.04078, params: {'learning_rate': 1, 'n_estimators': 100}]
{'learning_rate': 0.3, 'n_estimators': 50}
-1.91681966898




In [47]:
clf_dt=RandomForestClassifier()
param_test={
    "max_depth":[5,7,10,15],
    "n_estimators":[100,200,300]
}
for name in work_train:
    gsearch=GridSearchCV(estimator=clf_dt, param_grid = param_test,scoring="neg_log_loss",n_jobs=-1,iid=False, cv=kf,verbose=True)
    gsearch.fit(work_train[name],y)
    print(name)
    print(gsearch.grid_scores_)
    print(gsearch.best_params_)
    print(gsearch.best_score_)
#d2v : {'max_depth': 15, 'n_estimators': 300}
#tfidf : {'max_depth': 15, 'n_estimators': 300}
#w2v : {'max_depth': 15, 'n_estimators': 200}

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   45.9s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.3min finished


d2v_100
[mean: -1.55450, std: 0.01645, params: {'max_depth': 5, 'n_estimators': 100}, mean: -1.55385, std: 0.01623, params: {'max_depth': 5, 'n_estimators': 200}, mean: -1.55669, std: 0.01442, params: {'max_depth': 5, 'n_estimators': 300}, mean: -1.46899, std: 0.02145, params: {'max_depth': 7, 'n_estimators': 100}, mean: -1.46709, std: 0.01621, params: {'max_depth': 7, 'n_estimators': 200}, mean: -1.46734, std: 0.02178, params: {'max_depth': 7, 'n_estimators': 300}, mean: -1.36696, std: 0.02447, params: {'max_depth': 10, 'n_estimators': 100}, mean: -1.35802, std: 0.02643, params: {'max_depth': 10, 'n_estimators': 200}, mean: -1.35845, std: 0.02639, params: {'max_depth': 10, 'n_estimators': 300}, mean: -1.23195, std: 0.03798, params: {'max_depth': 15, 'n_estimators': 100}, mean: -1.22704, std: 0.03746, params: {'max_depth': 15, 'n_estimators': 200}, mean: -1.22608, std: 0.04033, params: {'max_depth': 15, 'n_estimators': 300}]
{'max_depth': 15, 'n_estimators': 300}
-1.22607533435
Fitting

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  2.8min finished


tfidf_tsvd_100
[mean: -1.35762, std: 0.02528, params: {'max_depth': 5, 'n_estimators': 100}, mean: -1.36400, std: 0.01501, params: {'max_depth': 5, 'n_estimators': 200}, mean: -1.36234, std: 0.01855, params: {'max_depth': 5, 'n_estimators': 300}, mean: -1.21549, std: 0.01840, params: {'max_depth': 7, 'n_estimators': 100}, mean: -1.22147, std: 0.02262, params: {'max_depth': 7, 'n_estimators': 200}, mean: -1.22395, std: 0.02135, params: {'max_depth': 7, 'n_estimators': 300}, mean: -1.06192, std: 0.03022, params: {'max_depth': 10, 'n_estimators': 100}, mean: -1.05693, std: 0.02365, params: {'max_depth': 10, 'n_estimators': 200}, mean: -1.05594, std: 0.02709, params: {'max_depth': 10, 'n_estimators': 300}, mean: -0.95162, std: 0.03561, params: {'max_depth': 15, 'n_estimators': 100}, mean: -0.94587, std: 0.03060, params: {'max_depth': 15, 'n_estimators': 200}, mean: -0.94415, std: 0.02992, params: {'max_depth': 15, 'n_estimators': 300}]
{'max_depth': 15, 'n_estimators': 300}
-0.944145729649

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  2.7min finished


w2v_100
[mean: -1.30178, std: 0.02275, params: {'max_depth': 5, 'n_estimators': 100}, mean: -1.29955, std: 0.02268, params: {'max_depth': 5, 'n_estimators': 200}, mean: -1.29942, std: 0.02190, params: {'max_depth': 5, 'n_estimators': 300}, mean: -1.15762, std: 0.02378, params: {'max_depth': 7, 'n_estimators': 100}, mean: -1.15918, std: 0.02682, params: {'max_depth': 7, 'n_estimators': 200}, mean: -1.15403, std: 0.02480, params: {'max_depth': 7, 'n_estimators': 300}, mean: -1.02141, std: 0.03265, params: {'max_depth': 10, 'n_estimators': 100}, mean: -1.01316, std: 0.03040, params: {'max_depth': 10, 'n_estimators': 200}, mean: -1.01753, std: 0.02973, params: {'max_depth': 10, 'n_estimators': 300}, mean: -1.02948, std: 0.07607, params: {'max_depth': 15, 'n_estimators': 100}, mean: -1.01016, std: 0.04225, params: {'max_depth': 15, 'n_estimators': 200}, mean: -1.01192, std: 0.04159, params: {'max_depth': 15, 'n_estimators': 300}]
{'max_depth': 15, 'n_estimators': 200}
-1.01015899003




# TRAINING PHASE ALL ALGOS 

In [48]:
def model_gen(X,X_test,y,classifier,file,five_fold_predict=True):
    #if not os.path.exists("scores/"+file):
    #   os.makedirs("scores/"+file)
    kf = model_selection.StratifiedKFold(n_splits=5, random_state=26, shuffle=True)
    if five_fold_predict:
        fold = 0
        y_test=0
        for train_index, test_index in kf.split(X, y):
        
            fold += 1

            X_train, X_valid    = X[train_index],   X[test_index]
            y_train, y_valid    = y[train_index],   y[test_index]

            print("Fold", fold, X_train.shape, X_valid.shape)

            clf=classifier
            clf.fit(X_train,y_train)
            p_test = clf.predict_proba(X_test)
            y_test += p_test/5

    classes = "class1,class2,class3,class4,class5,class6,class7,class8,class9".split(',')
    subm = pd.DataFrame(y_test, columns=classes)
    subm['ID'] = ID_test
    
    subm.to_csv("nw_scores/nw_stack_test/nw_{}.csv".format(file),index=False)
    
    print("cross_val sur train ") #peut etre que to array est exclusivement pour les xgb
    
    if os.path.isfile("nw_scores/nw_stack_train/nw_{}.csv".format(file)):
        print("not necessary, already done")
    else:
        y_pred=cross_val_predict(estimator=clf,X=X,y=y,cv=kf,method="predict_proba")
        subm1 = pd.DataFrame(y_pred, columns=classes)
        subm1['ID'] = ID_train
        subm1.to_csv("nw_scores/nw_stack_train/nw_{}.csv".format(file),index=False)


In [49]:
dic_xgb={"xgb_d2v":XGBClassifier(max_depth=3,objective="multi:softprob",seed=26),
        "xgb_tfidf":XGBClassifier(max_depth=3,objective="multi:softprob",seed=26),
        "xgb_w2v":XGBClassifier(max_depth=3,objective="multi:softprob",seed=26)}
dic_lgbm={"lgbm_d2v":LGBMClassifier(seed=26),
        "lgbm_tfidf":LGBMClassifier(seed=26),
        "lgbm_w2v":LGBMClassifier(seed=26)}
dic_lr={"lr_d2v":LogisticRegression(),
        "lr_tfidf":LogisticRegression(),
        "lr_w2v":LogisticRegression()}
dic_ada={"ada_d2v":AdaBoostClassifier(n_estimators=100, learning_rate=1.0, algorithm="SAMME.R", random_state=26),
        "ada_tfidf":AdaBoostClassifier(n_estimators=100, learning_rate=1.0, algorithm="SAMME.R", random_state=26),
        "ada_w2v":AdaBoostClassifier(n_estimators=100, learning_rate=1.0, algorithm="SAMME.R", random_state=26)}
dic_rf={"rf_d2v":RandomForestClassifier(n_estimators=200,max_depth=10,random_state=26),
        "rf_tfidf":RandomForestClassifier(n_estimators=200,max_depth=10,random_state=26),
        "rf_w2v":RandomForestClassifier(n_estimators=200,max_depth=10,random_state=26)}

print("xgboost here")
for clf,name in zip(dic_xgb.keys(),work_train.keys()):
    model_gen(X=work_train[name],X_test=work_test[name],y=y,classifier=dic_xgb[clf],file=clf)
print("lgbm here")
for clf,name in zip(dic_lgbm.keys(),work_train.keys()):
    model_gen(X=work_train[name],X_test=work_test[name],y=y,classifier=dic_lgbm[clf],file=clf)
print("logreg here")
for clf,name in zip(dic_lr.keys(),work_train.keys()):
    model_gen(X=work_train[name],X_test=work_test[name],y=y,classifier=dic_lr[clf],file=clf)
print("adaboost here")
for clf,name in zip(dic_ada.keys(),work_train.keys()):
    model_gen(X=work_train[name],X_test=work_test[name],y=y,classifier=dic_ada[clf],file=clf)
print("random forest here")
for clf,name in zip(dic_rf.keys(),work_train.keys()):
        model_gen(X=work_train[name],X_test=work_test[name],y=y,classifier=dic_rf[clf],file=clf)


xgboost here


KeyError: 'd2v_100'

In [51]:
work_test

{'d2v_100.csv':       ID  Substitutions_var  Stop_codon_var  Fusion_var  gene_fusion_var  \
 0      1                  1               0           0              0.0   
 1      2                  0               0           0              0.0   
 2      3                  1               0           0              0.0   
 3      4                  1               0           0              0.0   
 4      5                  0               0           0              0.0   
 5      6                  1               0           0              0.0   
 6      7                  1               0           0              0.0   
 7      8                  1               0           0              0.0   
 8      9                  1               0           0              0.0   
 9     10                  1               0           0              0.0   
 10    11                  1               0           0              0.0   
 11    12                  1               0           0     