In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as ssp
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics, model_selection
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_predict,cross_val_score
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
import lightgbm
from lightgbm.sklearn import LGBMClassifier
import os 
import re
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier 



In [2]:
# Load the pre-computed feature matrices (one train/test pair per representation)
# into two dicts keyed by representation name: "d2v", "tfidf", "w2v".
work_train = {}
work_test = {}
pre_process=["d2v.npz","tfidf.npz","w2v.npz"]
path="checkpoints_databases/"
for f in pre_process:
    # Strip the extension with splitext rather than re.sub("\.npz", ...):
    # "\." in a non-raw string is an invalid escape sequence, and the
    # unanchored pattern could match ".npz" anywhere in the name.
    feature_name = os.path.splitext(f)[0]
    work_train[feature_name] = ssp.load_npz(path+"w_working_train_"+f)
    work_test[feature_name] = ssp.load_npz(path+"w_working_test_"+f)

In [3]:
# Target vector: the "Class" column of the training variants, shifted down by one
# (presumably turning 1-based class ids into 0-based labels — confirm against the data).
y = pd.read_csv("..//bases/training_variants")["Class"].values - 1

In [4]:
# Keep only the ID columns of the train/test variant files; the full frames are
# not retained (the generator's loop variable does not leak into the namespace).
ID_train, ID_test = (
    pd.read_csv(variants_csv)["ID"]
    for variants_csv in ('..//bases/training_variants', '..//bases/test_variants')
)

# GRID SEARCH PHASE ALL ALGOS

In [None]:
# Grid search for XGBoost, run independently on every feature representation
# in work_train, scored with (negated) multiclass log loss.
clf_xgb=XGBClassifier(max_depth=3, objective="multi:softprob",seed=26)
param_test= {
    "max_depth" : [3,5,7],
    "min_child_weight" : [1,3],
    "n_estimators" : [100,200],
    "subsample":[0.8,1],
    "colsample_bytree":[0.8,1]
}
for name in work_train:
    # `iid` is no longer passed: the parameter was removed from GridSearchCV and
    # the unweighted fold mean it requested (iid=False) is now the only behaviour.
    # On scikit-learn < 0.22, pass iid=False explicitly to get the same averaging.
    gsearch=GridSearchCV(estimator=clf_xgb, param_grid=param_test, scoring="neg_log_loss", n_jobs=4, cv=5)
    gsearch.fit(work_train[name],y)
    print(name)
    # cv_results_ replaces the removed grid_scores_ attribute; print one line per
    # parameter combination in the same "mean / std / params" layout.
    for mean_score, std_score, params in zip(gsearch.cv_results_["mean_test_score"],
                                             gsearch.cv_results_["std_test_score"],
                                             gsearch.cv_results_["params"]):
        print("mean: {:.5f}, std: {:.5f}, params: {}".format(mean_score, std_score, params))
    print(gsearch.best_params_)
    print(gsearch.best_score_)
#d2v :
#tfidf:
#w2v:

In [None]:
# Grid search for LightGBM, run independently on every feature representation
# in work_train, scored with (negated) multiclass log loss.
clf_lgbm=LGBMClassifier(seed=26)
param_test= {
    'n_estimators': [8,24,48],
    'num_leaves': [6,12,16,22],
    'boosting_type' : ['gbdt'],
    # The target has more than two classes (nine probability columns are written
    # out downstream), so the objective is 'multiclass' rather than 'binary'.
    'objective' : ['multiclass'],
    # Fixed: the original list was missing the comma ([0.7 0.8] is a SyntaxError).
    'colsample_bytree' : [0.7,0.8],
    'subsample' : [0.7,0.8]
    }
for name in work_train:
    # `iid` is no longer passed: the parameter was removed from GridSearchCV and
    # the unweighted fold mean it requested (iid=False) is now the only behaviour.
    # On scikit-learn < 0.22, pass iid=False explicitly to get the same averaging.
    gsearch=GridSearchCV(estimator=clf_lgbm, param_grid=param_test, scoring="neg_log_loss", n_jobs=4, cv=5)
    gsearch.fit(work_train[name],y)
    print(name)
    # cv_results_ replaces the removed grid_scores_ attribute; print one line per
    # parameter combination in the same "mean / std / params" layout.
    for mean_score, std_score, params in zip(gsearch.cv_results_["mean_test_score"],
                                             gsearch.cv_results_["std_test_score"],
                                             gsearch.cv_results_["params"]):
        print("mean: {:.5f}, std: {:.5f}, params: {}".format(mean_score, std_score, params))
    print(gsearch.best_params_)
    print(gsearch.best_score_)
#d2v :
#tfidf :
#w2v :

In [None]:
# Grid search for logistic regression, run independently on every feature
# representation in work_train, scored with (negated) multiclass log loss.
# The solver is pinned to liblinear because the grid contains penalty="l1":
# liblinear supports both l1 and l2, whereas the current default solver (lbfgs)
# rejects l1. liblinear was the default when this grid was written.
clf_log=LogisticRegression(solver="liblinear", random_state=26)
param_test= {
    "C" : [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    "penalty" : ["l1","l2"]
}
for name in work_train:
    # `iid` is no longer passed: the parameter was removed from GridSearchCV and
    # the unweighted fold mean it requested (iid=False) is now the only behaviour.
    # On scikit-learn < 0.22, pass iid=False explicitly to get the same averaging.
    gsearch=GridSearchCV(estimator=clf_log, param_grid=param_test, scoring="neg_log_loss", n_jobs=4, cv=5)
    gsearch.fit(work_train[name],y)
    print(name)
    # cv_results_ replaces the removed grid_scores_ attribute; print one line per
    # parameter combination in the same "mean / std / params" layout.
    for mean_score, std_score, params in zip(gsearch.cv_results_["mean_test_score"],
                                             gsearch.cv_results_["std_test_score"],
                                             gsearch.cv_results_["params"]):
        print("mean: {:.5f}, std: {:.5f}, params: {}".format(mean_score, std_score, params))
    print(gsearch.best_params_)
    print(gsearch.best_score_)
#d2v : 
#tfidf :
#w2v :

In [None]:
# Grid search for AdaBoost, run independently on every feature representation
# in work_train, scored with (negated) multiclass log loss.
# n_estimators / learning_rate given to the constructor are only placeholders:
# both are overridden by the grid below.
clf_ada=AdaBoostClassifier(n_estimators=100, learning_rate=0.3, algorithm="SAMME.R", random_state=26)
param_test={
    "n_estimators":[50,70,100,200],
    "learning_rate":[0.01,0.05,0.1,0.2]
}
for name in work_train:
    # `iid` is no longer passed: the parameter was removed from GridSearchCV and
    # the unweighted fold mean it requested (iid=False) is now the only behaviour.
    # On scikit-learn < 0.22, pass iid=False explicitly to get the same averaging.
    gsearch=GridSearchCV(estimator=clf_ada, param_grid=param_test, scoring="neg_log_loss", n_jobs=4, cv=5)
    gsearch.fit(work_train[name],y)
    print(name)
    # cv_results_ replaces the removed grid_scores_ attribute; print one line per
    # parameter combination in the same "mean / std / params" layout.
    for mean_score, std_score, params in zip(gsearch.cv_results_["mean_test_score"],
                                             gsearch.cv_results_["std_test_score"],
                                             gsearch.cv_results_["params"]):
        print("mean: {:.5f}, std: {:.5f}, params: {}".format(mean_score, std_score, params))
    print(gsearch.best_params_)
    print(gsearch.best_score_)
# NOTE(review): the d2v note below names learning_rate 0.3, which is not in the
# grid above — presumably left over from an earlier grid; re-run to confirm.
#d2v : 0.3, 50
#tfidf :
#w2v :

d2v
[mean: -1.79454, std: 0.08478, params: {'learning_rate': 0.01, 'n_estimators': 50}, mean: -1.78789, std: 0.07000, params: {'learning_rate': 0.01, 'n_estimators': 70}, mean: -1.78056, std: 0.05693, params: {'learning_rate': 0.01, 'n_estimators': 100}, mean: -1.81905, std: 0.01875, params: {'learning_rate': 0.01, 'n_estimators': 200}, mean: -1.84589, std: 0.01660, params: {'learning_rate': 0.05, 'n_estimators': 50}, mean: -1.87861, std: 0.01914, params: {'learning_rate': 0.05, 'n_estimators': 70}, mean: -1.91375, std: 0.01905, params: {'learning_rate': 0.05, 'n_estimators': 100}, mean: -1.96137, std: 0.01531, params: {'learning_rate': 0.05, 'n_estimators': 200}, mean: -1.91975, std: 0.02222, params: {'learning_rate': 0.1, 'n_estimators': 50}, mean: -1.94301, std: 0.02609, params: {'learning_rate': 0.1, 'n_estimators': 70}, mean: -1.96273, std: 0.02375, params: {'learning_rate': 0.1, 'n_estimators': 100}, mean: -1.98288, std: 0.01920, params: {'learning_rate': 0.1, 'n_estimators': 200



In [43]:
# Grid search for random forest, run independently on every feature
# representation in work_train, scored with (negated) multiclass log loss.
# The forest is seeded like the other searches so results are reproducible.
clf_dt=RandomForestClassifier(random_state=26)
param_test={
    "max_depth":[5,7,10,15],
    "n_estimators":[100,200,300]
}
for name in work_train:
    # `iid` is no longer passed: the parameter was removed from GridSearchCV and
    # the unweighted fold mean it requested (iid=False) is now the only behaviour.
    # On scikit-learn < 0.22, pass iid=False explicitly to get the same averaging.
    gsearch=GridSearchCV(estimator=clf_dt, param_grid=param_test, scoring="neg_log_loss", n_jobs=4, cv=3)
    gsearch.fit(work_train[name],y)
    print(name)
    # cv_results_ replaces the removed grid_scores_ attribute; print one line per
    # parameter combination in the same "mean / std / params" layout.
    for mean_score, std_score, params in zip(gsearch.cv_results_["mean_test_score"],
                                             gsearch.cv_results_["std_test_score"],
                                             gsearch.cv_results_["params"]):
        print("mean: {:.5f}, std: {:.5f}, params: {}".format(mean_score, std_score, params))
    print(gsearch.best_params_)
    print(gsearch.best_score_)
# Best settings from the recorded (unseeded) run; re-running with the seed may shift them.
#d2v : 10 depth,200 estimators
#tfidf : 5, 300estimators
#w2v : 5, 100 estimators in the recorded run (300 is within noise: -1.86257 vs -1.85875)

d2v
[mean: -1.78361, std: 0.02093, params: {'max_depth': 5, 'n_estimators': 100}, mean: -1.77747, std: 0.02786, params: {'max_depth': 5, 'n_estimators': 200}, mean: -1.77440, std: 0.02725, params: {'max_depth': 5, 'n_estimators': 300}, mean: -1.76385, std: 0.02609, params: {'max_depth': 7, 'n_estimators': 100}, mean: -1.76603, std: 0.02595, params: {'max_depth': 7, 'n_estimators': 200}, mean: -1.77238, std: 0.02767, params: {'max_depth': 7, 'n_estimators': 300}, mean: -1.76888, std: 0.03342, params: {'max_depth': 10, 'n_estimators': 100}, mean: -1.75712, std: 0.03732, params: {'max_depth': 10, 'n_estimators': 200}, mean: -1.76257, std: 0.03259, params: {'max_depth': 10, 'n_estimators': 300}, mean: -1.77456, std: 0.03925, params: {'max_depth': 15, 'n_estimators': 100}, mean: -1.76767, std: 0.02790, params: {'max_depth': 15, 'n_estimators': 200}, mean: -1.76766, std: 0.03124, params: {'max_depth': 15, 'n_estimators': 300}]
{'max_depth': 10, 'n_estimators': 200}
-1.75711838017




tfidf
[mean: -1.86561, std: 0.03701, params: {'max_depth': 5, 'n_estimators': 100}, mean: -1.86745, std: 0.03923, params: {'max_depth': 5, 'n_estimators': 200}, mean: -1.86391, std: 0.03768, params: {'max_depth': 5, 'n_estimators': 300}, mean: -1.87300, std: 0.05001, params: {'max_depth': 7, 'n_estimators': 100}, mean: -1.87563, std: 0.04775, params: {'max_depth': 7, 'n_estimators': 200}, mean: -1.87819, std: 0.05083, params: {'max_depth': 7, 'n_estimators': 300}, mean: -1.90219, std: 0.07216, params: {'max_depth': 10, 'n_estimators': 100}, mean: -1.89380, std: 0.06002, params: {'max_depth': 10, 'n_estimators': 200}, mean: -1.89399, std: 0.06184, params: {'max_depth': 10, 'n_estimators': 300}, mean: -1.92326, std: 0.06816, params: {'max_depth': 15, 'n_estimators': 100}, mean: -1.91585, std: 0.06256, params: {'max_depth': 15, 'n_estimators': 200}, mean: -1.91666, std: 0.07094, params: {'max_depth': 15, 'n_estimators': 300}]
{'max_depth': 5, 'n_estimators': 300}
-1.86390535737




w2v
[mean: -1.85875, std: 0.03622, params: {'max_depth': 5, 'n_estimators': 100}, mean: -1.86428, std: 0.03698, params: {'max_depth': 5, 'n_estimators': 200}, mean: -1.86257, std: 0.03822, params: {'max_depth': 5, 'n_estimators': 300}, mean: -1.92127, std: 0.07438, params: {'max_depth': 7, 'n_estimators': 100}, mean: -1.90939, std: 0.06173, params: {'max_depth': 7, 'n_estimators': 200}, mean: -1.90573, std: 0.05770, params: {'max_depth': 7, 'n_estimators': 300}, mean: -2.02065, std: 0.12283, params: {'max_depth': 10, 'n_estimators': 100}, mean: -2.01469, std: 0.12751, params: {'max_depth': 10, 'n_estimators': 200}, mean: -2.01241, std: 0.12977, params: {'max_depth': 10, 'n_estimators': 300}, mean: -2.21558, std: 0.16131, params: {'max_depth': 15, 'n_estimators': 100}, mean: -2.17433, std: 0.16675, params: {'max_depth': 15, 'n_estimators': 200}, mean: -2.15429, std: 0.16827, params: {'max_depth': 15, 'n_estimators': 300}]
{'max_depth': 5, 'n_estimators': 100}
-1.85875240029




# TRAINING PHASE ALL ALGOS 

In [8]:
def model_gen(X,X_test,y,classifier,file,five_fold_predict=True):
    """Produce the stacking inputs (test and train class probabilities) for one model.

    Test predictions are written to ``w_scores/w_stack_test/w_<file>.csv`` and
    out-of-fold train predictions to ``w_scores/w_stack_train/w_<file>.csv``.
    The train file is only computed when it does not already exist.

    Parameters
    ----------
    X : training feature matrix; must support row indexing with an index array.
    X_test : test feature matrix passed to ``predict_proba``.
    y : training labels, indexable with an index array.
    classifier : estimator exposing ``fit`` and ``predict_proba``. It is fitted
        in place (the caller's instance is modified).
    file : name used to build the two output file names.
    five_fold_predict : if True, the test probabilities are the average of the
        predictions of one model per CV fold; if False, a single model fitted
        on all of ``X`` is used instead.

    Relies on the notebook-level ``ID_train`` and ``ID_test`` for the ID column.
    Returns None.
    """
    classes = "class1,class2,class3,class4,class5,class6,class7,class8,class9".split(',')
    test_csv = "w_scores/w_stack_test/w_{}.csv".format(file)
    train_csv = "w_scores/w_stack_train/w_{}.csv".format(file)
    # Make sure the output folders exist so to_csv cannot fail after a long fit.
    for out_csv in (test_csv, train_csv):
        os.makedirs(os.path.dirname(out_csv), exist_ok=True)

    kf = model_selection.StratifiedKFold(n_splits=5, random_state=26, shuffle=True)
    clf = classifier
    if five_fold_predict:
        # Average over the splitter's actual fold count instead of a hard-coded 5.
        n_splits = kf.get_n_splits()
        y_test = 0
        for fold, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
            X_train, X_valid = X[train_index], X[test_index]
            y_train = y[train_index]

            print("Fold", fold, X_train.shape, X_valid.shape)

            clf.fit(X_train, y_train)
            y_test += clf.predict_proba(X_test) / n_splits
    else:
        # Previously this path left y_test and clf undefined (NameError below);
        # fall back to a single model trained on the whole training set.
        clf.fit(X, y)
        y_test = clf.predict_proba(X_test)

    subm = pd.DataFrame(y_test, columns=classes)
    subm['ID'] = ID_test

    subm.to_csv(test_csv,index=False)

    print("cross_val sur train ") # maybe .toarray() is only needed for the xgb models

    if os.path.isfile(train_csv):
        print("not necessary, already done")
    else:
        # Out-of-fold probabilities for the training rows, using the same splitter.
        y_pred=cross_val_predict(estimator=clf,X=X,y=y,cv=kf,method="predict_proba")
        subm1 = pd.DataFrame(y_pred, columns=classes)
        subm1['ID'] = ID_train
        subm1.to_csv(train_csv,index=False)


In [None]:
# One estimator per (algorithm, feature representation) pair.
# Keys follow the pattern "<algo>_<feature set>"; the suffix must be a key of
# work_train / work_test because the loop below uses it to pick the features.
dic_algos={"xgb_d2v":XGBClassifier(max_depth=3,objective="multi:softprob",seed=26),
        "xgb_tfidf":XGBClassifier(max_depth=3,objective="multi:softprob",seed=26),
        "xgb_w2v":XGBClassifier(max_depth=3,objective="multi:softprob",seed=26),
        "lgbm_d2v":LGBMClassifier(seed=26),
        "lgbm_tfidf":LGBMClassifier(seed=26),
        "lgbm_w2v":LGBMClassifier(seed=26),
        "lr_d2v":LogisticRegression(),
        "lr_tfidf":LogisticRegression(),
        "lr_w2v":LogisticRegression(),
        "ada_d2v":AdaBoostClassifier(n_estimators=100, learning_rate=1.0, algorithm="SAMME.R", random_state=26),
        "ada_tfidf":AdaBoostClassifier(n_estimators=100, learning_rate=1.0, algorithm="SAMME.R", random_state=26),
        "ada_w2v":AdaBoostClassifier(n_estimators=100, learning_rate=1.0, algorithm="SAMME.R", random_state=26),
        "rf_d2v":RandomForestClassifier(n_estimators=200,max_depth=10,random_state=26),
        "rf_tfidf":RandomForestClassifier(n_estimators=300,max_depth=5,random_state=26),
        "rf_w2v":RandomForestClassifier(n_estimators=300,max_depth=5,random_state=26)
          }

for clf in dic_algos:
    # Train each model only on the feature set named in its key. The previous
    # nested loop ran every model on all feature sets under the same file name,
    # so the test CSV was overwritten by the last feature set while the train
    # CSV (written once) came from the first one.
    name = clf.rsplit("_", 1)[-1]
    model_gen(X=work_train[name],X_test=work_test[name],y=y,classifier=dic_algos[clf],file=clf)

In [19]:
# just a try
# Fixed: `RandomForest` is not defined anywhere; the imported class is
# RandomForestClassifier (seeded like the other forests in this notebook).
# NOTE(review): the recorded run of this cell stopped in fold 1 with
# "ValueError: Input contains NaN, infinity or a value too large" — the w2v
# features presumably contain NaN/inf values; verify the preprocessing step.
model_gen(X=work_train["w2v"],X_test=work_test["w2v"],y=y,classifier=RandomForestClassifier(random_state=26),file="test_rf")

Fold 1 (2653, 479) (668, 479)


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').