In [16]:
import numpy as np
import pandas as pd
import scipy.sparse as ssp

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics, model_selection
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_predict,cross_val_score
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
import lightgbm
from lightgbm.sklearn import LGBMClassifier
import os 
import re
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Use cross_val_score with XGBoost to choose between the 100, 200 and 300 variants of each preprocessing

In [17]:
# Load the w2v train/test feature tables into dicts keyed by the file stem
# (e.g. "w2v_100").
work_train_w2v = {}
work_test_w2v = {}
pre_process=["w2v_100.csv","w2v_200.csv","w2v_300.csv"]
path="checkpoints_databases/"
for f in pre_process:
    # Raw string: "\." in a plain string literal is an invalid escape sequence
    # (a warning on recent Python versions). The "$" anchor strips only a
    # trailing ".csv" extension.
    key = re.sub(r"\.csv$","",f)
    work_train_w2v[key] = pd.read_csv(path+"nw_working_train_"+f)
    work_test_w2v[key] = pd.read_csv(path+"nw_working_test_"+f)

In [3]:
# Load the "bio" train/test feature tables into dicts keyed by the file stem.
work_train_bio = {}
work_test_bio = {}
pre_process=["bio.csv"]
path="checkpoints_databases/"
for f in pre_process:
    # Raw string: "\." in a plain string literal is an invalid escape sequence
    # (a warning on recent Python versions). The "$" anchor strips only a
    # trailing ".csv" extension.
    key = re.sub(r"\.csv$","",f)
    work_train_bio[key] = pd.read_csv(path+"nw_working_train_"+f)
    work_test_bio[key] = pd.read_csv(path+"nw_working_test_"+f)

In [4]:
# Load the tfidf_tsvd train/test feature tables into dicts keyed by the file
# stem (e.g. "tfidf_tsvd_100").
work_train_tfidf = {}
work_test_tfidf = {}
pre_process=["tfidf_tsvd_100.csv","tfidf_tsvd_200.csv","tfidf_tsvd_300.csv"]
path="checkpoints_databases/"
for f in pre_process:
    # Raw string: "\." in a plain string literal is an invalid escape sequence
    # (a warning on recent Python versions). The "$" anchor strips only a
    # trailing ".csv" extension.
    key = re.sub(r"\.csv$","",f)
    work_train_tfidf[key] = pd.read_csv(path+"nw_working_train_"+f)
    work_test_tfidf[key] = pd.read_csv(path+"nw_working_test_"+f)

In [5]:
# Load the training variants table; the label vector `y` is extracted from it
# in the following cell.
new_train_v=pd.read_csv("../bases/new_training_variants.csv")

In [6]:
# Label vector: first column of the training table, shifted down by one.
# NOTE(review): presumably column 0 is the 1-based "Class" column, so `y`
# becomes 0-based class indices — confirm against the CSV layout.
y=np.array(new_train_v.iloc[:,0])-1

In [8]:
# Keep only the ID columns of the train/test variant tables; they are attached
# to the prediction files written later. The full frames are never bound to a
# name, so nothing needs to be deleted afterwards.
ID_train = pd.read_csv('../bases/new_training_variants.csv')["ID"]
ID_test = pd.read_csv('../bases/new_test_variants.csv')["ID"]

In [9]:
# Shared 5-fold stratified splitter (shuffled, fixed seed) passed as `cv` to
# the cross-validation and grid-search calls in the cells below.
kf = model_selection.StratifiedKFold(n_splits=5, random_state=26, shuffle=True)

In [10]:
# Score each w2v variant with the same fixed XGBoost model (negative log loss,
# shared `kf` folds) so the variants can be compared.
clf_xgb = XGBClassifier(max_depth=5, objective="multi:softprob", seed=26)
for name, frame in work_train_w2v.items():
    features = np.array(frame.drop("ID", axis=1))
    scores = cross_val_score(
        clf_xgb, features, y, cv=kf, n_jobs=-1, scoring="neg_log_loss"
    )
    print("mean"+name+" "+str(scores.mean()),
          "std:"+name+" "+str(scores.std()))

meanw2v_100 -1.00422806424 std:w2v_100 0.0325493546696
meanw2v_200 -1.00385541522 std:w2v_200 0.0322415146009
meanw2v_300 -0.987052629045 std:w2v_300 0.0403660088829


In [11]:
# Same comparison for the "bio" feature table, reusing `clf_xgb` and `kf`
# defined in the earlier cells.
for name, frame in work_train_bio.items():
    features = np.array(frame.drop("ID", axis=1))
    scores = cross_val_score(
        clf_xgb, features, y, cv=kf, n_jobs=-1, scoring="neg_log_loss"
    )
    print("mean"+name+" "+str(scores.mean()),
          "std:"+name+" "+str(scores.std()))

meanbio -1.03229973699 std:bio 0.0338258622169


In [14]:
# Same comparison for the tfidf_tsvd feature tables, reusing `clf_xgb` and `kf`.
for name in work_train_tfidf:
    # The sibling cells drop the "ID" column before fitting; do the same here so
    # an identifier is never used as a feature. errors="ignore" keeps the cell
    # working if these tables have no "ID" column.
    # NOTE(review): confirm whether the tfidf CSVs actually contain "ID".
    features = np.array(work_train_tfidf[name].drop("ID",axis=1,errors="ignore"))
    h=cross_val_score(clf_xgb,features,y,cv=kf,n_jobs=-1,scoring="neg_log_loss")
    print("mean"+name+" "+str(h.mean()),
         "std:"+name+" "+str(h.std()))

meantfidf_tsvd_100 -0.972444529011 std:tfidf_tsvd_100 0.0341352210089
meantfidf_tsvd_200 -0.989768786429 std:tfidf_tsvd_200 0.0421389970526
meantfidf_tsvd_300 -0.988125473019 std:tfidf_tsvd_300 0.045022240091


# GRID SEARCH PHASE ALL ALGOS

In [66]:
# Load the feature tables selected for the grid-search/training phase into
# dicts keyed by the file stem ("bio", "tfidf_tsvd_100", "w2v_100").
work_train= {}
work_test = {}
pre_process=["bio.csv","tfidf_tsvd_100.csv","w2v_100.csv"]
path="checkpoints_databases/"
for f in pre_process:
    # Raw string: "\." in a plain string literal is an invalid escape sequence
    # (a warning on recent Python versions). The "$" anchor strips only a
    # trailing ".csv" extension.
    key = re.sub(r"\.csv$","",f)
    work_train[key] = pd.read_csv(path+"nw_working_train_"+f)
    work_test[key] = pd.read_csv(path+"nw_working_test_"+f)

In [None]:
# Also add the extra feature columns

In [None]:
# Extra train/test feature tables; they are concatenated column-wise onto each
# entry of work_train / work_test in a later cell.
feat_train=pd.read_csv("../No_window/nw_meta_features/meta_train_l1l2.csv")
feat_test=pd.read_csv("../No_window/nw_meta_features/meta_test_l1l2.csv")

In [15]:
# Add an "external model": XGBoost trained alone on the external features

In [None]:
# Three external feature tables (biological / cellular / molecular).
# NOTE(review): the later split by len(y) assumes each file holds the training
# rows followed by the test rows — confirm.
ext_bio=pd.read_csv("../l2_meta_features/svd25_biological.csv")
ext_cell=pd.read_csv("../l2_meta_features/svd25_cellular.csv")
ext_mol=pd.read_csv("../l2_meta_features/svd25_molecular.csv")

In [None]:
# Strip the identifier/label columns from each external table so that only the
# feature columns remain.
ext_bio, ext_cell, ext_mol = (
    table.drop(["Class","Gene","ID","Variation"], axis=1)
    for table in (ext_bio, ext_cell, ext_mol)
)

In [None]:
# Stack the three external tables side by side, then split the rows: the first
# len(y) rows go to the training part and the remainder to the test part.
model_ext = pd.concat([ext_bio, ext_cell, ext_mol], axis=1)
model_ext_train = model_ext.iloc[:len(y)]
model_ext_test = model_ext.iloc[len(y):]

In [None]:
# Grid-search XGBoost hyper-parameters on the external feature table alone,
# scored by negative log loss on the shared `kf` folds.
clf_xgb = XGBClassifier(max_depth=3, objective="multi:softprob", seed=26)
param_test = dict(
    max_depth=[3,5,7],
    min_child_weight=[1,3],
    n_estimators=[100,200],
    subsample=[0.8,1],
    colsample_bytree=[0.8,1],
)
gsearch = GridSearchCV(
    estimator=clf_xgb,
    param_grid=param_test,
    scoring="neg_log_loss",
    n_jobs=-1,
    iid=False,
    cv=kf,
    verbose=True,
)
gsearch.fit(model_ext_train, y)
# Report every candidate's score, then the winning parameters and their score.
print(gsearch.grid_scores_)
print(gsearch.best_params_)
print(gsearch.best_score_)

In [None]:
# Final design matrices: each preprocessing table with the extra feature
# columns appended on the right. Both dicts follow the key order of work_train.
work_train_final = {
    name: pd.concat((work_train[name], feat_train), axis=1)
    for name in work_train
}
work_test_final = {
    name: pd.concat((work_test[name], feat_test), axis=1)
    for name in work_train
}

In [42]:
# Grid-search XGBoost separately on every final feature set (ID column removed),
# scored by negative log loss on the shared `kf` folds.
clf_xgb = XGBClassifier(max_depth=3, objective="multi:softprob", seed=26)
param_test = dict(
    max_depth=[3,5,7],
    min_child_weight=[1,3],
    n_estimators=[100,200],
    subsample=[0.8,1],
    colsample_bytree=[0.8,1],
)
for name, frame in work_train_final.items():
    gsearch = GridSearchCV(
        estimator=clf_xgb,
        param_grid=param_test,
        scoring="neg_log_loss",
        n_jobs=-1,
        iid=False,
        cv=kf,
        verbose=True,
    )
    gsearch.fit(np.array(frame.drop("ID", axis=1)), y)
    print(name)
    print(gsearch.grid_scores_)
    print(gsearch.best_params_)
    print(gsearch.best_score_)
# Best parameters noted by the author for each feature set:
#d2v : {'colsample_bytree': 0.8, 'max_depth': 7, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 0.8}
#tfidf : {'colsample_bytree': 0.8, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 1}
#w2v : {'colsample_bytree': 1, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 1}

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 48.4min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 67.5min finished


d2v_100
[mean: -1.10183, std: 0.04434, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.8}, mean: -1.11608, std: 0.03952, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 1}, mean: -1.03937, std: 0.05671, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.8}, mean: -1.06731, std: 0.05026, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 1}, mean: -1.10267, std: 0.04157, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 0.8}, mean: -1.11616, std: 0.03778, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 1}, mean: -1.04035, std: 0.05765, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 200, 'subsample': 0.

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 67.4min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 94.2min finished


tfidf_tsvd_100
[mean: -0.91554, std: 0.04153, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.8}, mean: -0.92413, std: 0.04066, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 1}, mean: -0.88847, std: 0.05802, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.8}, mean: -0.88659, std: 0.05416, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 1}, mean: -0.91610, std: 0.04252, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 0.8}, mean: -0.92441, std: 0.03528, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 1}, mean: -0.88585, std: 0.05543, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 200, 'subsamp

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 12.5min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 90.6min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 125.8min finished


w2v_100
[mean: -0.93032, std: 0.03761, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.8}, mean: -0.94576, std: 0.03704, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 1}, mean: -0.89707, std: 0.04870, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.8}, mean: -0.90587, std: 0.04871, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 1}, mean: -0.93094, std: 0.03437, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 0.8}, mean: -0.94363, std: 0.03466, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 1}, mean: -0.89250, std: 0.04405, params: {'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 200, 'subsample': 0.



In [44]:
# Grid-search LightGBM separately on every final feature set (ID column
# removed), scored by negative log loss on the shared `kf` folds.
clf_lgbm = LGBMClassifier(seed=26)
param_test = dict(
    n_estimators=[8,24,48],
    num_leaves=[6,12,16,22],
    boosting_type=['gbdt'],
    colsample_bytree=[0.7,0.8],
    subsample=[0.7,0.8],
)
for name, frame in work_train_final.items():
    gsearch = GridSearchCV(
        estimator=clf_lgbm,
        param_grid=param_test,
        scoring="neg_log_loss",
        n_jobs=-1,
        iid=False,
        cv=kf,
        verbose=True,
    )
    gsearch.fit(np.array(frame.drop("ID", axis=1)), y)
    print(name)
    print(gsearch.grid_scores_)
    print(gsearch.best_params_)
    print(gsearch.best_score_)
# Best parameters noted by the author for each feature set:
#d2v : {'boosting_type': 'gbdt', 'colsample_bytree': 0.8, 'n_estimators': 48, 'num_leaves': 22, 'subsample': 0.8}
#tfidf : {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 48, 'num_leaves': 22, 'subsample': 0.8}
#w2v : {'boosting_type': 'gbdt', 'colsample_bytree': 0.8, 'n_estimators': 48, 'num_leaves': 22, 'subsample': 0.8}

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   24.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  5.5min finished


d2v_100
[mean: -1.71471, std: 0.00934, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 6, 'subsample': 0.7}, mean: -1.71198, std: 0.00720, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 6, 'subsample': 0.8}, mean: -1.65478, std: 0.02224, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 12, 'subsample': 0.7}, mean: -1.65026, std: 0.01736, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 12, 'subsample': 0.8}, mean: -1.63703, std: 0.01133, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 16, 'subsample': 0.7}, mean: -1.63596, std: 0.02267, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 16, 'subsample': 0.8}, mean: -1.62014, std: 0.01198, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 2

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   32.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  8.5min finished


tfidf_tsvd_100
[mean: -1.59855, std: 0.01637, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 6, 'subsample': 0.7}, mean: -1.59189, std: 0.01554, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 6, 'subsample': 0.8}, mean: -1.53482, std: 0.01789, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 12, 'subsample': 0.7}, mean: -1.52955, std: 0.01667, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 12, 'subsample': 0.8}, mean: -1.51620, std: 0.01840, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 16, 'subsample': 0.7}, mean: -1.50908, std: 0.01384, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 16, 'subsample': 0.8}, mean: -1.49688, std: 0.01829, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_lea

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   40.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 10.7min finished


w2v_100
[mean: -1.61438, std: 0.01175, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 6, 'subsample': 0.7}, mean: -1.61260, std: 0.00565, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 6, 'subsample': 0.8}, mean: -1.55027, std: 0.01063, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 12, 'subsample': 0.7}, mean: -1.54867, std: 0.00930, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 12, 'subsample': 0.8}, mean: -1.52788, std: 0.01277, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 16, 'subsample': 0.7}, mean: -1.52228, std: 0.01232, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 16, 'subsample': 0.8}, mean: -1.50684, std: 0.01323, params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'n_estimators': 8, 'num_leaves': 2



In [45]:
# Grid-search logistic regression (C and penalty type) separately on every
# final feature set (ID column removed), scored by negative log loss.
# The solver is pinned to liblinear because the grid includes the "l1" penalty:
# on scikit-learn releases whose default solver is lbfgs, every l1 candidate
# would fail to fit. liblinear was the default in older releases, so results
# there are unchanged.
clf_log=LogisticRegression(solver="liblinear")
param_test= {
    "C" : [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    "penalty" : ["l1","l2"]
}
for name in work_train_final:
    gsearch=GridSearchCV(estimator=clf_log, param_grid = param_test,scoring="neg_log_loss",n_jobs=-1,iid=False, cv=kf,verbose=True)
    gsearch.fit(np.array(work_train_final[name].drop("ID",axis=1)),y)
    print(name)
    print(gsearch.grid_scores_)
    print(gsearch.best_params_)
    print(gsearch.best_score_)
# Best parameters noted by the author for each feature set:
#d2v : {'C': 10, 'penalty': 'l1'}
#tfidf : {'C': 10, 'penalty': 'l1'}
#w2v : {'C': 10, 'penalty': 'l1'}

Fitting 5 folds for each of 14 candidates, totalling 70 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.0s
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:   43.4s finished


d2v_100
[mean: -1.86803, std: 0.03398, params: {'C': 0.001, 'penalty': 'l1'}, mean: -1.85301, std: 0.03435, params: {'C': 0.001, 'penalty': 'l2'}, mean: -1.85695, std: 0.03592, params: {'C': 0.01, 'penalty': 'l1'}, mean: -1.76132, std: 0.02708, params: {'C': 0.01, 'penalty': 'l2'}, mean: -1.48878, std: 0.03079, params: {'C': 0.1, 'penalty': 'l1'}, mean: -1.70851, std: 0.02299, params: {'C': 0.1, 'penalty': 'l2'}, mean: -1.28133, std: 0.02638, params: {'C': 1, 'penalty': 'l1'}, mean: -1.62934, std: 0.04810, params: {'C': 1, 'penalty': 'l2'}, mean: -1.26483, std: 0.03273, params: {'C': 10, 'penalty': 'l1'}, mean: -1.62548, std: 0.04035, params: {'C': 10, 'penalty': 'l2'}, mean: -1.31990, std: 0.04065, params: {'C': 100, 'penalty': 'l1'}, mean: -1.64046, std: 0.03516, params: {'C': 100, 'penalty': 'l2'}, mean: -1.41898, std: 0.06563, params: {'C': 1000, 'penalty': 'l1'}, mean: -1.63924, std: 0.03499, params: {'C': 1000, 'penalty': 'l2'}]
{'C': 10, 'penalty': 'l1'}
-1.2648347351
Fitting 5 

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   48.5s
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:  2.0min finished


tfidf_tsvd_100
[mean: -1.86803, std: 0.03398, params: {'C': 0.001, 'penalty': 'l1'}, mean: -1.84770, std: 0.03393, params: {'C': 0.001, 'penalty': 'l2'}, mean: -1.85690, std: 0.03590, params: {'C': 0.01, 'penalty': 'l1'}, mean: -1.72502, std: 0.03332, params: {'C': 0.01, 'penalty': 'l2'}, mean: -1.37320, std: 0.03032, params: {'C': 0.1, 'penalty': 'l1'}, mean: -1.54961, std: 0.03515, params: {'C': 0.1, 'penalty': 'l2'}, mean: -1.04443, std: 0.03156, params: {'C': 1, 'penalty': 'l1'}, mean: -1.50064, std: 0.04759, params: {'C': 1, 'penalty': 'l2'}, mean: -0.97721, std: 0.03534, params: {'C': 10, 'penalty': 'l1'}, mean: -1.46547, std: 0.04555, params: {'C': 10, 'penalty': 'l2'}, mean: -1.11089, std: 0.06555, params: {'C': 100, 'penalty': 'l1'}, mean: -1.47910, std: 0.05720, params: {'C': 100, 'penalty': 'l2'}, mean: -1.25983, std: 0.09510, params: {'C': 1000, 'penalty': 'l1'}, mean: -1.47220, std: 0.03731, params: {'C': 1000, 'penalty': 'l2'}]
{'C': 10, 'penalty': 'l1'}
-0.977210584672
F

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:  3.6min finished


w2v_100
[mean: -1.86803, std: 0.03397, params: {'C': 0.001, 'penalty': 'l1'}, mean: -1.73130, std: 0.02830, params: {'C': 0.001, 'penalty': 'l2'}, mean: -1.79844, std: 0.03595, params: {'C': 0.01, 'penalty': 'l1'}, mean: -1.49137, std: 0.02284, params: {'C': 0.01, 'penalty': 'l2'}, mean: -1.32659, std: 0.03685, params: {'C': 0.1, 'penalty': 'l1'}, mean: -1.34454, std: 0.01237, params: {'C': 0.1, 'penalty': 'l2'}, mean: -1.07196, std: 0.04045, params: {'C': 1, 'penalty': 'l1'}, mean: -1.28040, std: 0.04270, params: {'C': 1, 'penalty': 'l2'}, mean: -1.06280, std: 0.05567, params: {'C': 10, 'penalty': 'l1'}, mean: -1.28403, std: 0.04111, params: {'C': 10, 'penalty': 'l2'}, mean: -1.16657, std: 0.09201, params: {'C': 100, 'penalty': 'l1'}, mean: -1.29447, std: 0.02789, params: {'C': 100, 'penalty': 'l2'}, mean: -1.23551, std: 0.11745, params: {'C': 1000, 'penalty': 'l1'}, mean: -1.27845, std: 0.04311, params: {'C': 1000, 'penalty': 'l2'}]
{'C': 10, 'penalty': 'l1'}
-1.06279894516




In [46]:
# Grid-search AdaBoost (number of estimators and learning rate) separately on
# every final feature set (ID column removed), scored by negative log loss.
clf_ada = AdaBoostClassifier(
    n_estimators=100, learning_rate=0.3, algorithm="SAMME.R", random_state=26
)
param_test = dict(
    n_estimators=[50,70,100,200],
    learning_rate=[0.01,0.05,0.1,0.2],
)
for name, frame in work_train_final.items():
    gsearch = GridSearchCV(
        estimator=clf_ada,
        param_grid=param_test,
        scoring="neg_log_loss",
        n_jobs=-1,
        iid=False,
        cv=kf,
        verbose=True,
    )
    gsearch.fit(np.array(frame.drop("ID", axis=1)), y)
    print(name)
    print(gsearch.grid_scores_)
    print(gsearch.best_params_)
    print(gsearch.best_score_)
# Best parameters noted by the author for each feature set.
# NOTE(review): learning_rate 0.3 is not part of the grid above, so these notes
# appear to come from a run with a different grid — confirm.
#d2v : {'learning_rate': 0.3, 'n_estimators': 50}
#tfidf : {'learning_rate': 0.3, 'n_estimators': 50}
#w2v : {'learning_rate': 0.3, 'n_estimators': 50}



Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   46.5s finished


d2v_100
[mean: -1.94169, std: 0.00465, params: {'learning_rate': 0.3, 'n_estimators': 50}, mean: -1.99920, std: 0.01060, params: {'learning_rate': 0.3, 'n_estimators': 100}, mean: -1.96065, std: 0.01302, params: {'learning_rate': 0.5, 'n_estimators': 50}, mean: -2.01419, std: 0.01107, params: {'learning_rate': 0.5, 'n_estimators': 100}, mean: -2.00529, std: 0.02161, params: {'learning_rate': 0.9, 'n_estimators': 50}, mean: -2.04604, std: 0.01959, params: {'learning_rate': 0.9, 'n_estimators': 100}, mean: -2.00737, std: 0.01516, params: {'learning_rate': 1, 'n_estimators': 50}, mean: -2.05062, std: 0.01401, params: {'learning_rate': 1, 'n_estimators': 100}]
{'learning_rate': 0.3, 'n_estimators': 50}
-1.94168942506
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.6min finished


tfidf_tsvd_100
[mean: -1.91750, std: 0.01409, params: {'learning_rate': 0.3, 'n_estimators': 50}, mean: -1.94188, std: 0.01189, params: {'learning_rate': 0.3, 'n_estimators': 100}, mean: -1.95182, std: 0.01644, params: {'learning_rate': 0.5, 'n_estimators': 50}, mean: -1.97111, std: 0.01814, params: {'learning_rate': 0.5, 'n_estimators': 100}, mean: -2.00289, std: 0.02692, params: {'learning_rate': 0.9, 'n_estimators': 50}, mean: -2.06171, std: 0.07002, params: {'learning_rate': 0.9, 'n_estimators': 100}, mean: -2.03082, std: 0.03772, params: {'learning_rate': 1, 'n_estimators': 50}, mean: -2.07098, std: 0.04586, params: {'learning_rate': 1, 'n_estimators': 100}]
{'learning_rate': 0.3, 'n_estimators': 50}
-1.91749502231
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.6min finished


w2v_100
[mean: -1.91682, std: 0.00978, params: {'learning_rate': 0.3, 'n_estimators': 50}, mean: -1.94374, std: 0.00987, params: {'learning_rate': 0.3, 'n_estimators': 100}, mean: -1.95939, std: 0.03143, params: {'learning_rate': 0.5, 'n_estimators': 50}, mean: -1.99379, std: 0.04053, params: {'learning_rate': 0.5, 'n_estimators': 100}, mean: -2.02985, std: 0.02044, params: {'learning_rate': 0.9, 'n_estimators': 50}, mean: -2.03370, std: 0.01915, params: {'learning_rate': 0.9, 'n_estimators': 100}, mean: -2.04482, std: 0.02814, params: {'learning_rate': 1, 'n_estimators': 50}, mean: -2.09304, std: 0.04078, params: {'learning_rate': 1, 'n_estimators': 100}]
{'learning_rate': 0.3, 'n_estimators': 50}
-1.91681966898




In [67]:
# Grid-search a random forest (depth and number of trees) separately on every
# final feature set (ID column removed), scored by negative log loss.
# random_state is fixed so repeated runs give the same scores; it uses the same
# seed as the forests built in the training phase.
clf_dt=RandomForestClassifier(random_state=26)
param_test={
    "max_depth":[10,15,20,25,30],
    "n_estimators":[200,300,400]
}
for name in work_train_final:
    gsearch=GridSearchCV(estimator=clf_dt, param_grid = param_test,scoring="neg_log_loss",n_jobs=-1,iid=False, cv=kf,verbose=True)
    gsearch.fit(np.array(work_train_final[name].drop("ID",axis=1)),y)
    print(name)
    print(gsearch.grid_scores_)
    print(gsearch.best_params_)
    print(gsearch.best_score_)
# Best parameters noted by the author for each feature set:
#d2v : {'max_depth': 25, 'n_estimators': 300}
#tfidf : {'max_depth': 15, 'n_estimators': 300}
#w2v : {'max_depth': 15, 'n_estimators': 200}

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  2.3min finished


d2v_100
[mean: -1.22844, std: 0.03563, params: {'max_depth': 15, 'n_estimators': 200}, mean: -1.22791, std: 0.03435, params: {'max_depth': 15, 'n_estimators': 300}, mean: -1.22571, std: 0.03375, params: {'max_depth': 15, 'n_estimators': 400}, mean: -1.16550, std: 0.04155, params: {'max_depth': 20, 'n_estimators': 200}, mean: -1.16466, std: 0.04370, params: {'max_depth': 20, 'n_estimators': 300}, mean: -1.16096, std: 0.04186, params: {'max_depth': 20, 'n_estimators': 400}, mean: -1.15077, std: 0.04684, params: {'max_depth': 25, 'n_estimators': 200}, mean: -1.14424, std: 0.04787, params: {'max_depth': 25, 'n_estimators': 300}, mean: -1.14550, std: 0.04492, params: {'max_depth': 25, 'n_estimators': 400}, mean: -1.16388, std: 0.06146, params: {'max_depth': 30, 'n_estimators': 200}, mean: -1.15763, std: 0.04410, params: {'max_depth': 30, 'n_estimators': 300}, mean: -1.15487, std: 0.04284, params: {'max_depth': 30, 'n_estimators': 400}]
{'max_depth': 25, 'n_estimators': 300}
-1.14424083177
F

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  4.3min finished


tfidf_tsvd_100
[mean: -0.94831, std: 0.02960, params: {'max_depth': 15, 'n_estimators': 200}, mean: -0.94279, std: 0.03010, params: {'max_depth': 15, 'n_estimators': 300}, mean: -0.95152, std: 0.02612, params: {'max_depth': 15, 'n_estimators': 400}, mean: -0.98358, std: 0.05267, params: {'max_depth': 20, 'n_estimators': 200}, mean: -0.97815, std: 0.02881, params: {'max_depth': 20, 'n_estimators': 300}, mean: -0.97319, std: 0.02574, params: {'max_depth': 20, 'n_estimators': 400}, mean: -1.24485, std: 0.07011, params: {'max_depth': 25, 'n_estimators': 200}, mean: -1.13895, std: 0.14529, params: {'max_depth': 25, 'n_estimators': 300}, mean: -1.09501, std: 0.06563, params: {'max_depth': 25, 'n_estimators': 400}, mean: -1.47265, std: 0.11060, params: {'max_depth': 30, 'n_estimators': 200}, mean: -1.39942, std: 0.18520, params: {'max_depth': 30, 'n_estimators': 300}, mean: -1.27881, std: 0.05559, params: {'max_depth': 30, 'n_estimators': 400}]
{'max_depth': 15, 'n_estimators': 300}
-0.942789

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  2.9min finished


w2v_100
[mean: -1.00518, std: 0.04244, params: {'max_depth': 15, 'n_estimators': 200}, mean: -0.98611, std: 0.03042, params: {'max_depth': 15, 'n_estimators': 300}, mean: -1.00611, std: 0.04150, params: {'max_depth': 15, 'n_estimators': 400}, mean: -1.18631, std: 0.08091, params: {'max_depth': 20, 'n_estimators': 200}, mean: -1.13412, std: 0.07174, params: {'max_depth': 20, 'n_estimators': 300}, mean: -1.09728, std: 0.05019, params: {'max_depth': 20, 'n_estimators': 400}, mean: -1.38482, std: 0.07572, params: {'max_depth': 25, 'n_estimators': 200}, mean: -1.27289, std: 0.04654, params: {'max_depth': 25, 'n_estimators': 300}, mean: -1.21932, std: 0.05829, params: {'max_depth': 25, 'n_estimators': 400}, mean: -1.42201, std: 0.07977, params: {'max_depth': 30, 'n_estimators': 200}, mean: -1.35885, std: 0.09196, params: {'max_depth': 30, 'n_estimators': 300}, mean: -1.29912, std: 0.03608, params: {'max_depth': 30, 'n_estimators': 400}]
{'max_depth': 15, 'n_estimators': 300}
-0.986113396697




# TRAINING PHASE ALL ALGOS 

In [64]:
def model_gen(X,X_test,y,classifier,file,five_fold_predict=True):
    """Write stacking inputs for one model: test and out-of-fold train probabilities.

    Two CSV files are produced, each with columns class1..class9 plus ID:

    * ``nw_scores/nw_stack_test/nw_<file>.csv``  - probabilities for ``X_test``
    * ``nw_scores/nw_stack_train/nw_<file>.csv`` - cross-validated
      probabilities for ``X`` (from ``cross_val_predict``)

    Parameters
    ----------
    X : array
        Training features; must support indexing with integer index arrays.
    X_test : array
        Test features passed to ``predict_proba``.
    y : array
        Training labels; must support indexing with integer index arrays.
    classifier : estimator
        Object exposing ``fit`` and ``predict_proba``. It is fitted in place.
    file : str
        Name inserted into both output file names.
    five_fold_predict : bool, default True
        If True, the test probabilities are the average of the predictions of
        one model per fold. If False, the classifier is fitted once on all of
        ``X``/``y`` and predicts ``X_test`` directly (this path used to raise
        NameError because ``y_test`` and ``clf`` were never defined).

    Notes
    -----
    Reads the notebook-level ``ID_test`` and ``ID_train`` series for the ID
    columns. The output directories are created if they do not exist.
    """
    n_splits = 5
    kf = model_selection.StratifiedKFold(n_splits=n_splits, random_state=26, shuffle=True)
    clf = classifier  # same object: fitting `clf` fits the caller's estimator

    if five_fold_predict:
        y_test = 0
        for fold, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
            X_train, X_valid = X[train_index], X[test_index]
            y_train = y[train_index]

            print("Fold", fold, X_train.shape, X_valid.shape)

            clf.fit(X_train, y_train)
            # Average over folds; the divisor follows n_splits instead of a
            # hard-coded 5 so the two cannot drift apart.
            y_test += clf.predict_proba(X_test) / n_splits
    else:
        clf.fit(X, y)
        y_test = clf.predict_proba(X_test)

    classes = "class1,class2,class3,class4,class5,class6,class7,class8,class9".split(',')

    # Make sure both target directories exist before writing.
    for out_dir in ("nw_scores/nw_stack_test", "nw_scores/nw_stack_train"):
        os.makedirs(out_dir, exist_ok=True)

    subm = pd.DataFrame(y_test, columns=classes)
    subm['ID'] = ID_test
    subm.to_csv("nw_scores/nw_stack_test/nw_{}.csv".format(file),index=False)

    print("cross_val sur train ") # maybe converting to array is only needed for xgb

    # Out-of-fold probabilities for the training rows, on the same splitter.
    y_pred=cross_val_predict(estimator=clf,X=X,y=y,cv=kf,method="predict_proba")
    subm1 = pd.DataFrame(y_pred, columns=classes)
    subm1['ID'] = ID_train
    subm1.to_csv("nw_scores/nw_stack_train/nw_{}.csv".format(file),index=False)


In [68]:
# Final estimators per algorithm family, one per feature set.
# NOTE(review): several values below differ from the best parameters noted in
# the grid-search cells (e.g. xgb_w2v colsample_bytree/subsample, lr_w2v C) —
# confirm which set is intended.
dic_xgb={"xgb_d2v":XGBClassifier(colsample_bytree=0.8,max_depth=7,min_child_weight=3,n_estimators=100,subsample=0.8,objective="multi:softprob",seed=26),
        "xgb_tfidf":XGBClassifier(colsample_bytree=0.8,max_depth=5,min_child_weight=3,n_estimators=100,subsample=1,objective="multi:softprob",seed=26),
        "xgb_w2v":XGBClassifier(colsample_bytree=0.8,max_depth=5,min_child_weight=3,n_estimators=100,subsample=0.8,objective="multi:softprob",seed=26)}

dic_lgbm={"lgbm_d2v":LGBMClassifier(boosting_type="gbdt",colsample_bytree=0.8,n_estimators=48,num_leaves=22,subsample=0.8,seed=26),
        "lgbm_tfidf":LGBMClassifier(boosting_type="gbdt",colsample_bytree=0.8,n_estimators=48,num_leaves=22,subsample=0.8,seed=26),
        "lgbm_w2v":LGBMClassifier(boosting_type="gbdt",colsample_bytree=0.8,n_estimators=48,num_leaves=22,subsample=0.8,seed=26)}
# solver="liblinear" is set explicitly because the "l1" penalty is not
# supported by the lbfgs solver that newer scikit-learn releases use by
# default; liblinear was the default in older releases.
dic_lr={"lr_d2v":LogisticRegression(C=10,penalty="l1",solver="liblinear"),
        "lr_tfidf":LogisticRegression(C=10,penalty="l1",solver="liblinear"),
        "lr_w2v":LogisticRegression(C=1,penalty="l1",solver="liblinear")}
dic_ada={"ada_d2v":AdaBoostClassifier(n_estimators=50, learning_rate=0.01, algorithm="SAMME.R", random_state=26),
        "ada_tfidf":AdaBoostClassifier(n_estimators=70, learning_rate=0.01, algorithm="SAMME.R", random_state=26),
        "ada_w2v":AdaBoostClassifier(n_estimators=50, learning_rate=0.01, algorithm="SAMME.R", random_state=26)}
dic_rf={"rf_d2v":RandomForestClassifier(n_estimators=400,max_depth=25,random_state=26),
        "rf_tfidf":RandomForestClassifier(n_estimators=400,max_depth=20,random_state=26),
        "rf_w2v":RandomForestClassifier(n_estimators=300,max_depth=15,random_state=26)}

# The i-th model of each dict is paired with the i-th feature set of
# work_train_final (insertion order), regardless of the suffix in its name.
# NOTE(review): the model names end in d2v/tfidf/w2v while the feature sets are
# keyed by the file stems loaded earlier — confirm the intended pairing.
feature_sets = list(work_train_final.keys())
model_families = [
    ("xgboost here", dic_xgb),
    ("lgbm here", dic_lgbm),
    ("logreg here", dic_lr),
    ("adaboost here", dic_ada),
    ("random forest here", dic_rf),
]
for banner, models in model_families:
    print(banner)
    for clf,name in zip(models.keys(),feature_sets):
        model_gen(X=np.array(work_train_final[name].drop("ID",axis=1)),
                  X_test=np.array(work_test_final[name].drop("ID",axis=1)),
                  y=y,classifier=models[clf],file=clf)

# "External" XGBoost model trained on the external feature table only.
ext_xgb=XGBClassifier(colsample_bytree=1,objective="multi:softprob",max_depth= 3,min_child_weight= 3, n_estimators=100,subsample= 0.8,seed=26)
model_gen(X=np.array(model_ext_train),X_test=np.array(model_ext_test),y=y,classifier=ext_xgb,file="xgb_external")

xgboost here
Fold 1 (2946, 137) (743, 137)
Fold 2 (2949, 137) (740, 137)
Fold 3 (2952, 137) (737, 137)
Fold 4 (2954, 137) (735, 137)
Fold 5 (2955, 137) (734, 137)
cross_val sur train 
Fold 1 (2946, 137) (743, 137)
Fold 2 (2949, 137) (740, 137)
Fold 3 (2952, 137) (737, 137)
Fold 4 (2954, 137) (735, 137)
Fold 5 (2955, 137) (734, 137)
cross_val sur train 
Fold 1 (2946, 138) (743, 138)
Fold 2 (2949, 138) (740, 138)
Fold 3 (2952, 138) (737, 138)
Fold 4 (2954, 138) (735, 138)
Fold 5 (2955, 138) (734, 138)
cross_val sur train 
lgbm here
Fold 1 (2946, 137) (743, 137)
Fold 2 (2949, 137) (740, 137)
Fold 3 (2952, 137) (737, 137)
Fold 4 (2954, 137) (735, 137)
Fold 5 (2955, 137) (734, 137)
cross_val sur train 
Fold 1 (2946, 137) (743, 137)
Fold 2 (2949, 137) (740, 137)
Fold 3 (2952, 137) (737, 137)
Fold 4 (2954, 137) (735, 137)
Fold 5 (2955, 137) (734, 137)
cross_val sur train 
Fold 1 (2946, 138) (743, 138)
Fold 2 (2949, 138) (740, 138)
Fold 3 (2952, 138) (737, 138)
Fold 4 (2954, 138) (735, 138)
Fo