In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as ssp

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics, model_selection
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_predict,cross_val_score
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
import lightgbm
from lightgbm.sklearn import LGBMClassifier
import os 
import re
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC



In [2]:
work_train = {} 
work_test = {}
pre_process=["w2v.npz","d2v.npz","tfidf.npz"]
path="checkpoints_databases/"
for f in pre_process:
    work_train[re.sub("\.npz","",f)] = ssp.load_npz(path+"nw_working_train_"+f)
    work_test[re.sub("\.npz","",f)] = ssp.load_npz(path+"nw_working_test_"+f)

In [3]:
y=pd.read_csv("..//bases/training_variants").Class.values - 1

In [4]:
train = pd.read_csv('..//bases/training_variants')
test = pd.read_csv('..//bases/test_variants')
ID_train=train.ID
ID_test=test.ID
del train,test

# GRID SEARCH PHASE ALL ALGOS

In [None]:
clf_xgb=XGBClassifier(max_depth=3, objective="multi:softprob",seed=26)
param_test= {
    "max_depth" : [3,5],
    "min_child_weight" : [1,3],
    "n_estimators" : [100],
    "subsample":[0.8,1],
    "colsample_bytree":[0.8,1]
}
for name in work_train:
    gsearch=GridSearchCV(estimator=clf_xgb, param_grid = param_test,scoring="neg_log_loss",n_jobs=4,iid=False, cv=5)
    gsearch.fit(work_train[name],y)
    gsearch.grid_scores_, gsearch.best_params_, gsearch.best_score_

In [None]:
clf_lgbm=LGBMClassifier(seed=26)
param_test= {
    'n_estimators': [8,24,48],
    'num_leaves': [6,12,16,22],
    'boosting_type' : ['gbdt'],
    'objective' : ['binary'],
    'colsample_bytree' : [0.7 0.8],
    'subsample' : [0.7,0.8]
    }
for name in work_train:
    gsearch=GridSearchCV(estimator=clf_lgbm, param_grid = param_test,scoring="neg_log_loss",n_jobs=4,iid=False, cv=5)
    gsearch.fit(work_train[name],y)
    gsearch.grid_scores_, gsearch.best_params_, gsearch.best_score_

In [None]:
clf_log=LogisticRegression()
param_test= {
    "C" : [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    "penalty" : ["l1","l2"]
}
for name in work_train:
    gsearch=GridSearchCV(estimator=clf_log, param_grid = param_test,scoring="neg_log_loss",n_jobs=4,iid=False, cv=5)
    gsearch.fit(work_train[name],y)
    gsearch.grid_scores_, gsearch.best_params_, gsearch.best_score_

In [None]:
clf_ada=AdaBoostClassifier(n_estimators=100, learning_rate=1.0, algorithm="SAMME.R", random_state=26)
param_test={
    "n_estimators":[50,100],
    "learning_rate":[0.3,0.5,0.9,1]
}
for name in work_train:
    gsearch=GridSearchCV(estimator=clf_ada, param_grid = param_test,scoring="neg_log_loss",n_jobs=4,iid=False, cv=5)
    gsearch.fit(work_train[name],y)
    gsearch.grid_scores_, gsearch.best_params_, gsearch.best_score_

In [None]:
#no grid for svm too long

# TRAINING PHASE ALL ALGOS 

In [5]:
def model_gen(X,X_test,y,classifier,file,five_fold_predict=True):
    #if not os.path.exists("scores/"+file):
    #   os.makedirs("scores/"+file)
    kf = model_selection.StratifiedKFold(n_splits=5, random_state=26, shuffle=True)
    if five_fold_predict:
        fold = 0
        y_test=0
        for train_index, test_index in kf.split(X, y):
        
            fold += 1

            X_train, X_valid    = X[train_index],   X[test_index]
            y_train, y_valid    = y[train_index],   y[test_index]

            print("Fold", fold, X_train.shape, X_valid.shape)

            clf=classifier
            clf.fit(X_train,y_train)
            p_test = clf.predict_proba(X_test)
            y_test += p_test/5

    classes = "class1,class2,class3,class4,class5,class6,class7,class8,class9".split(',')
    subm = pd.DataFrame(y_test, columns=classes)
    subm['ID'] = ID_test
    
    subm.to_csv("nw_scores/nw_stack_test/nw_{}.csv".format(file),index=False)
    
    print("cross_val sur train ") #peut etre que to array est exclusivement pour les xgb
    
    if os.path.isfile("nw_scores/nw_stack_train/nw_{}.csv".format(file)):
        print("not necessary, already done")
    else:
        y_pred=cross_val_predict(estimator=clf,X=X,y=y,cv=kf,method="predict_proba")
        subm1 = pd.DataFrame(y_pred, columns=classes)
        subm1['ID'] = ID_train
        subm1.to_csv("nw_scores/nw_stack_train/nw_{}.csv".format(file),index=False)


In [6]:
dic_algos=["xgb_d2v":XGBClassifier(max_depth=3,objective="multi:softprob",seed=26),
        "xgb_tfidf":XGBClassifier(max_depth=3,objective="multi:softprob",seed=26),
        "xgb_w2v":XGBClassifier(max_depth=3,objective="multi:softprob",seed=26),
        "lgbm_d2v":LGBMClassifier(seed=26),
        "lgbm_tfidf":LGBMClassifier(seed=26),
        "lgbm_w2v":LGBMClassifier(seed=26),
        "lr_d2v":LogisticRegression(),
        "lr_tfidf":LogisticRegression(),
        "lr_w2v":LogisticRegression(),
        "ada_d2v":AdaBoostClassifier(n_estimators=100, learning_rate=1.0, algorithm="SAMME.R", random_state=26),
        "ada_tfidf":AdaBoostClassifier(n_estimators=100, learning_rate=1.0, algorithm="SAMME.R", random_state=26),
        "ada_w2v":AdaBoostClassifier(n_estimators=100, learning_rate=1.0, algorithm="SAMME.R", random_state=26),
        "svm_d2v":SVC(probability=True,random_state=26),
        "svm_tfidf":SVC(probability=True,random_state=26),
        "svm_w2v":SVC(probability=True,random_state=26)
          ]

for clf in dic_algos:
    for name in work_train:
        model_gen(X=work_train[name],X_test=work_test[name],y=y,classifier=dic_algos[clf],file=clf)

SyntaxError: invalid syntax (<ipython-input-6-bc7954f28cda>, line 1)

In [None]:
model_gen(X=work_train["w2v"],X_test=work_test["w2v"],y=y,classifier=SVC(kernel="linear",probability=True,random_state=26),file="test_svm")

Fold 1 (2653, 379) (668, 379)
