In [9]:
import numpy as np
import pandas as pd
import scipy.sparse as ssp

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics, model_selection
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_predict,cross_val_score
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
import lightgbm
from lightgbm.sklearn import LGBMClassifier
import os 
import re

In [3]:
# Pre-processed feature matrices for the train and test sets, keyed by the
# file name without its ".npz" extension ("w2v", "d2v", "tfidf").
work_train = {}
work_test = {}
pre_process=["w2v.npz","d2v.npz","tfidf.npz"]
path="checkpoints_databases/"
for f in pre_process:
    # os.path.splitext removes only the trailing extension; the previous
    # re.sub("\.npz", ...) relied on an invalid string escape and would also
    # have stripped ".npz" from the middle of a name.
    key = os.path.splitext(f)[0]
    work_train[key] = ssp.load_npz(path+"nw_working_train_"+f)
    work_test[key] = ssp.load_npz(path+"nw_working_test_"+f)

In [4]:
# Target vector: the "Class" column shifted down by one
# (presumably turning 1-based class labels into 0-based ones — confirm against the data).
training_labels = pd.read_csv("..//bases/training_variants")["Class"]
y = training_labels.values - 1
del training_labels

In [5]:
# Only the ID columns are kept; they label the rows of the stacking CSVs
# written by model_gen. The full frames are not retained.
ID_train = pd.read_csv('..//bases/training_variants')["ID"]
ID_test = pd.read_csv('..//bases/test_variants')["ID"]

In [10]:
def model_gen(X,X_test,y,classifier,file,five_fold_predict=True):
    """Fit ``classifier`` and write its class probabilities as stacking features.

    Two CSV files are produced, each with nine probability columns
    (``class1`` .. ``class9``) plus an ``ID`` column:

    * ``nw_scores/nw_stack_test/nw_<file>.csv`` - predictions for ``X_test``
      (always rewritten).
    * ``nw_scores/nw_stack_train/nw_<file>.csv`` - out-of-fold predictions for
      ``X`` obtained with ``cross_val_predict``; skipped when the file already
      exists.

    Parameters
    ----------
    X : feature matrix for training; must support row selection with an
        integer index array (``X[train_index]``).
    X_test : feature matrix passed to ``predict_proba``.
    y : label array aligned with the rows of ``X``. ``predict_proba`` must
        yield nine columns for the output frame to be built.
    classifier : estimator exposing ``fit`` and ``predict_proba``. It is
        fitted in place (no copy is made).
    file : str
        Tag inserted into both output file names.
    five_fold_predict : bool, default True
        If True, the test prediction is the mean of the predictions of the
        models fitted on each stratified training fold. If False, a single
        model fitted on all of ``X`` is used.

    Returns
    -------
    None

    Notes
    -----
    Reads the module-level ``ID_train`` and ``ID_test`` for the ``ID`` column.
    """
    classes = ["class{}".format(i) for i in range(1, 10)]
    test_csv = "nw_scores/nw_stack_test/nw_{}.csv".format(file)
    train_csv = "nw_scores/nw_stack_train/nw_{}.csv".format(file)

    # DataFrame.to_csv does not create missing parent directories, so make
    # sure they exist before spending time on training.
    for target in (test_csv, train_csv):
        os.makedirs(os.path.dirname(target), exist_ok=True)

    kf = model_selection.StratifiedKFold(n_splits=5, random_state=26, shuffle=True)
    clf = classifier

    if five_fold_predict:
        # Average over however many folds the splitter defines rather than a
        # hard-coded 5, so the mean stays correct if n_splits is changed.
        n_folds = kf.get_n_splits()
        y_test = 0
        for fold, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
            X_train, X_valid = X[train_index], X[test_index]

            print("Fold", fold, X_train.shape, X_valid.shape)

            clf.fit(X_train, y[train_index])
            y_test += clf.predict_proba(X_test) / n_folds
    else:
        print("One Fold predict")
        clf.fit(X, y)
        y_test = clf.predict_proba(X_test)
        print("One Fold done")

    subm = pd.DataFrame(y_test, columns=classes)
    subm['ID'] = ID_test
    subm.to_csv(test_csv, index=False)

    # Original author's note (translated): converting to a dense array may
    # only be needed for the XGB models.
    print("cross_val sur train ")

    if os.path.isfile(train_csv):
        print("not necessary, already done")
    else:
        y_pred = cross_val_predict(estimator=clf, X=X, y=y, cv=kf, method="predict_proba")
        subm1 = pd.DataFrame(y_pred, columns=classes)
        subm1['ID'] = ID_train
        subm1.to_csv(train_csv, index=False)


# 3 Models of XGBOOST for each pre_processing

In [11]:
# Three XGBoost variants that differ in tree depth and row subsampling;
# all use the multi-class soft-probability objective and the same seed.
clf_xgb = {
    "XGB_M": XGBClassifier(
        max_depth=5,
        objective="multi:softprob",
        subsample=0.7,
        seed=26,
    ),
    "XGB_S": XGBClassifier(
        max_depth=2,
        objective="multi:softprob",
        subsample=0.5,
        seed=26,
    ),
    "XGB_T": XGBClassifier(
        max_depth=7,
        subsample=0.9,
        objective="multi:softprob",
        seed=26,
    ),
}

In [12]:
# Fit every XGBoost variant on every pre-processed feature set; the output
# files are tagged "<model>_<feature set>".
for name, train_features in work_train.items():
    for clf, estimator in clf_xgb.items():
        model_gen(
            X=train_features,
            X_test=work_test[name],
            y=y,
            classifier=estimator,
            file=clf + "_" + name,
        )

Fold 1 (2653, 115) (668, 115)
Fold 2 (2654, 115) (667, 115)
Fold 3 (2657, 115) (664, 115)


KeyboardInterrupt: 

# 3 Models of LGBM for each pre_processing

In [None]:
# Three LightGBM variants that differ in leaf count and in row/feature
# sampling; all are configured for a nine-class problem.
# NOTE(review): LightGBM documents bagging_fraction as taking effect only when
# bagging_freq is non-zero — confirm against the installed version's defaults.
clf_lgbm = {
    "LGBM_S": LGBMClassifier(
        num_leaves=25,
        bagging_fraction=0.6,
        feature_fraction=0.6,
        application="multiclass",
        num_class=9,
    ),
    "LGBM_M": LGBMClassifier(
        num_leaves=40,
        bagging_fraction=0.8,
        feature_fraction=0.8,
        application="multiclass",
        num_class=9,
    ),
    "LGBM_T": LGBMClassifier(
        num_leaves=70,
        num_iterations=150,
        application="multiclass",
        num_class=9,
    ),
}

In [7]:
# Fit every LightGBM variant on every pre-processed feature set; the output
# files are tagged "<model>_<feature set>".
for name, train_features in work_train.items():
    for clf, estimator in clf_lgbm.items():
        model_gen(
            X=train_features,
            X_test=work_test[name],
            y=y,
            classifier=estimator,
            file=clf + "_" + name,
        )

# 1 ADABOOST

In [15]:
# Single AdaBoost model. random_state is pinned to 26, the seed used for the
# CV splitter and the XGBoost models in this notebook: with random_state=None
# scikit-learn falls back to the global NumPy random state, so the generated
# stacking features would not be reproducible from one run to the next.
clf_ada={"adaboost":AdaBoostClassifier(n_estimators=100, learning_rate=1.0, algorithm="SAMME.R", random_state=26)}

In [None]:
# Fit the AdaBoost model on every pre-processed feature set; the output
# files are tagged "<model>_<feature set>".
for name, train_features in work_train.items():
    for clf, estimator in clf_ada.items():
        model_gen(
            X=train_features,
            X_test=work_test[name],
            y=y,
            classifier=estimator,
            file=clf + "_" + name,
        )
        

Fold 1 (2653, 115) (668, 115)
Fold 2 (2654, 115) (667, 115)
Fold 3 (2657, 115) (664, 115)
Fold 4 (2659, 115) (662, 115)
Fold 5 (2661, 115) (660, 115)
cross_val sur train 
Fold 1 (2653, 365) (668, 365)
Fold 2 (2654, 365) (667, 365)
Fold 3 (2657, 365) (664, 365)
Fold 4 (2659, 365) (662, 365)
Fold 5 (2661, 365) (660, 365)
cross_val sur train 
Fold 1 (2653, 10015) (668, 10015)
Fold 2 (2654, 10015) (667, 10015)
Fold 3 (2657, 10015) (664, 10015)
Fold 4 (2659, 10015) (662, 10015)
Fold 5 (2661, 10015) (660, 10015)
cross_val sur train 


# 1 logistic regression with L2 penalty

In [6]:
# Single L2-penalised logistic regression; all other settings are the
# library defaults.
clf_logit = {
    "logit": LogisticRegression(penalty="l2"),
}

In [7]:
# Fit the logistic regression on every pre-processed feature set; the output
# files are tagged "<model>_<feature set>".
for name, train_features in work_train.items():
    for clf, estimator in clf_logit.items():
        model_gen(
            X=train_features,
            X_test=work_test[name],
            y=y,
            classifier=estimator,
            file=clf + "_" + name,
        )
        

Fold 1 (2653, 115) (668, 115)
Fold 2 (2654, 115) (667, 115)
Fold 3 (2657, 115) (664, 115)
Fold 4 (2659, 115) (662, 115)
Fold 5 (2661, 115) (660, 115)
cross_val sur train 
Fold 1 (2653, 365) (668, 365)
Fold 2 (2654, 365) (667, 365)
Fold 3 (2657, 365) (664, 365)
Fold 4 (2659, 365) (662, 365)
Fold 5 (2661, 365) (660, 365)
cross_val sur train 
Fold 1 (2653, 10015) (668, 10015)
Fold 2 (2654, 10015) (667, 10015)
Fold 3 (2657, 10015) (664, 10015)
Fold 4 (2659, 10015) (662, 10015)
Fold 5 (2661, 10015) (660, 10015)
cross_val sur train 
