In [1]:
import numpy as np
import pandas as pd
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import os

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

import xgboost
from xgboost import XGBClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from keras.models import Sequential
from keras.layers import Dense,Flatten,Dropout
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.utils import to_categorical
from keras.preprocessing import sequence
from keras.wrappers.scikit_learn import KerasClassifier
# fix random seed for reproducibility
np.random.seed(26)

Using TensorFlow backend.


In [2]:
# Variant tables: plain CSV files read with pandas defaults.
train = pd.read_csv('bases/training_variants')
test = pd.read_csv('bases/test_variants')

# Text files: the two fields are separated by a literal "||". The separator is
# interpreted as a regular expression (python engine), so the pipes are escaped;
# raw strings keep the backslashes without triggering Python's
# invalid-escape-sequence warning. The first line is skipped and the columns are
# named explicitly.
train_texts = pd.read_csv('bases/training_text', sep=r"\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"], encoding = "utf-8")
test_texts = pd.read_csv('bases/test_text', sep=r"\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"], encoding = "utf-8")

In [3]:
# Left-join the text onto each variant table by ID, so every variant row is kept
# even when no text row matches.
train = train.merge(train_texts, how='left', on='ID')
test = test.merge(test_texts, how='left', on='ID')

In [4]:
# Stack train on top of test (re-numbering the index from 0) so that the text
# cleaning below is applied to both sets in a single pass.
data_all = pd.concat([train, test], axis=0, ignore_index=True)

In [5]:
# English stop words removed from every document.
stop = set(stopwords.words('english'))
# Characters deleted outright (most punctuation and all digits).
exclude = set('!"#$%&\'()*+:;<=>?@[\\]^_`{|}~0123456789') 
lemma = WordNetLemmatizer()

# Comma, period and slash are turned into spaces (rather than deleted) so the
# words they separate do not get glued together.
_SEPARATORS_RE = re.compile(r"[,./]")
# Translation table that deletes every character listed in `exclude`.
_EXCLUDE_TABLE = {ord(ch): None for ch in exclude}


def clean(doc,lemmatiz=False):
    """Normalize one document for vectorization.

    Steps, in order:
      1. lower-case and split on whitespace, dropping tokens found in `stop`
         (done before punctuation handling, so a token such as "the," is kept);
      2. replace ',', '.' and '/' by a space;
      3. delete every character contained in `exclude`;
      4. if `lemmatiz` is true, lemmatize each whitespace-separated word with
         the WordNet lemmatizer and re-join the words with single spaces.

    Parameters
    ----------
    doc : str
        Raw document text. Anything that is not a string (e.g. NaN for a
        missing text) is treated as an empty document instead of raising.
    lemmatiz : bool, default False
        Whether to lemmatize the cleaned words.

    Returns
    -------
    str
        The cleaned (and optionally lemmatized) text.
    """
    if not isinstance(doc, str):
        return ""
    stop_free = " ".join(tok for tok in doc.lower().split() if tok not in stop)
    # One regex pass and one translate over the whole string; the result is the
    # same as substituting and filtering character by character.
    punc_free = _SEPARATORS_RE.sub(" ", stop_free).translate(_EXCLUDE_TABLE)
    if lemmatiz:
        return " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return punc_free

In [8]:
# Clean every document. Lemmatization IS switched on here (lemmatiz=True).
# Caution carried over from the original note: be careful about lemmatizing if
# the text is later fed to word2vec.
data_all["Text"] = data_all["Text"].map(lambda doc: clean(doc, lemmatiz=True))

In [9]:
# Keep the ID columns aside; they are attached to the prediction files later.
ID_train = train["ID"]
ID_test = test["ID"]

In [10]:
# Cut the cleaned frame back into its two parts: the first len(train) rows are
# the training set, the remainder is the test set.
n_train = len(train)
train = data_all.iloc[:n_train]
test = data_all.iloc[n_train:]

In [27]:
# Target: Class shifted down by one (presumably 1..9 -> 0..8, matching the
# 9-way one-hot encoding used later).
y = train["Class"] - 1

# Only the text (and the features derived from it) is kept as model input.
non_feature_cols = ["Class", "ID", "Gene", "Variation"]
X_train = train.drop(non_feature_cols, axis=1)
X_test = test.drop(non_feature_cols, axis=1)

# Distinct training texts, used to fit the TF-IDF vocabulary.
txt_no_dup = train["Text"].drop_duplicates()

In [12]:
# Word-level TF-IDF over uni- to tri-grams, fitted on the de-duplicated training
# texts. The text is already lower-cased by clean(), hence lowercase=False.
tfidf = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w+',
    ngram_range=(1,3),
    min_df=10,
    max_features=15000,
    strip_accents=None,
    lowercase=False,
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)
tfidf.fit(txt_no_dup)

In [13]:
# Project both sets onto the fitted TF-IDF vocabulary.
X_train_text, X_test_text = (tfidf.transform(part["Text"]) for part in (train, test))

In [14]:
# Numbers of SVD components to try; one TruncatedSVD is built per size.
list_comp = [10, 20, 30, 40, 50, 60, 80, 100, 120, 150, 170, 200]

dic_svd = {}
tsvd_train, tsvd_test = {}, {}

for comp in list_comp:
    dic_svd[comp] = TruncatedSVD(n_components=comp, n_iter=10, random_state=26)

# Fit each decomposition on the training TF-IDF matrix and apply it to the test
# matrix. Both result dicts are keyed by the number of components.
for sv, svd in dic_svd.items():
    tsvd_train[sv] = svd.fit_transform(X_train_text)
    tsvd_test[sv] = svd.transform(X_test_text)

In [15]:
# For every SVD size n, add its n components as columns named tsvd_<n>_<i>.
# Train and test columns must come from the SAME decomposition, so both lookups
# are keyed by n (the test side must not use a variable left over from an
# earlier loop).
for n in dic_svd:
    train_components = tsvd_train[n]
    test_components = tsvd_test[n]
    for i in range(n):
        col = 'tsvd_' + str(n) + "_" + str(i)
        X_train[col] = train_components[:, i]
        X_test[col] = test_components[:, i]

In [16]:
# Drop the raw text column and convert what is left (the SVD features) to numpy
# arrays for the models.
X_train_0, X_test_0 = (np.array(frame.drop("Text", axis=1)) for frame in (X_train, X_test))

In [None]:
# Neural-network classifier (LSTM is imported above, but the model defined below uses only Dense/Dropout layers)

In [17]:
# create the model
def model_ann(input_dim=1030, n_classes=9):
    """Build and compile a feed-forward classifier.

    Architecture: three Dense(300, relu) layers, with Dropout(0.2) after the
    first two, followed by a softmax output layer. Compiled with categorical
    cross-entropy, the Adam optimizer and the accuracy metric.

    Parameters
    ----------
    input_dim : int, default 1030
        Number of input features.
    n_classes : int, default 9
        Number of units in the softmax output layer.

    Returns
    -------
    keras.models.Sequential
        The compiled, untrained model.
    """
    model = Sequential()
    model.add(Dense(300,input_dim=input_dim,activation="relu"))
    model.add(Dropout(0.2))
    model.add(Dense(300,activation="relu"))
    model.add(Dropout(0.2))
    model.add(Dense(300,activation="relu"))
    model.add(Dense(n_classes,activation="softmax"))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [19]:
# Three XGBoost configurations of increasing capacity. All of them output class
# probabilities (multi:softprob) and share the same seed.
_xgb_shared = dict(objective="multi:softprob", seed=26)
dic_classifier = {
    "XGB_medium": XGBClassifier(n_estimators=200, max_depth=5, subsample=0.7, **_xgb_shared),
    "XGB_small": XGBClassifier(max_depth=2, subsample=0.5, **_xgb_shared),
    "XGB_tall": XGBClassifier(n_estimators=300, max_depth=7, subsample=0.9, **_xgb_shared),
}

In [20]:
def model_gen_nowdw(X,X_test,y,classifier,file,neural_net=False):
    """Fit `classifier` and write its class probabilities to CSV for stacking.

    Two files are produced, each with columns class1..class9 plus ID:

    * scores/stack_test/nowdw_<file>_new.csv  -- predictions for `X_test`
      (always rewritten);
    * scores/stack_train/nowdw_<file>_new.csv -- predictions for `X`
      (skipped when the file already exists).

    The ID columns come from the module-level `ID_test` and `ID_train`.

    Parameters
    ----------
    X : array
        Training features; must support indexing with an integer index array.
    X_test : array
        Test features.
    y : array-like
        Training targets.
    classifier : estimator
        Object exposing `fit` and `predict_proba`.
    file : str
        Tag inserted in the output file names.
    neural_net : bool, default False
        False: the classifier is fitted on each of 5 stratified folds and the
        test probabilities are averaged over the folds; the train predictions
        are out-of-fold (cross_val_predict).
        True: the classifier is fitted once on all of `X`; test and train
        predictions both come from that single fit.
    """
    classes = "class1,class2,class3,class4,class5,class6,class7,class8,class9".split(',')
    test_path = "scores/stack_test/nowdw_{}_new.csv".format(file)
    train_path = "scores/stack_train/nowdw_{}_new.csv".format(file)

    kf = StratifiedKFold(n_splits=5, random_state=26, shuffle=True)
    clf = classifier
    if neural_net==False:
        # Index the labels by position: the fold indices are positions, which
        # only coincide with Series labels when the index is 0..n-1.
        y_values = np.asarray(y)
        n_splits = kf.get_n_splits()
        y_test = 0
        for fold, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
            X_train, X_valid = X[train_index], X[test_index]
            y_train = y_values[train_index]

            print("Fold", fold, X_train.shape, X_valid.shape)
            clf.fit(X_train, y_train)
            # Accumulate the mean over folds; every fold model contributes
            # instead of only the last one being kept.
            y_test = y_test + clf.predict_proba(X_test) / n_splits
    else:
        print("One Fold predict for NN")
        clf.fit(X,y)
        y_test=clf.predict_proba(X_test)

    subm = pd.DataFrame(y_test, columns=classes)
    # Assign by position so a differently indexed ID series cannot silently
    # produce NaN IDs.
    subm['ID'] = np.asarray(ID_test)

    os.makedirs(os.path.dirname(test_path), exist_ok=True)
    subm.to_csv(test_path,index=False)

    print("cross_val sur train") # maybe converting to array is only needed for the xgb models

    if os.path.isfile(train_path):
        print("not necessary, already done")
    else:
        if neural_net==False:
            y_pred=cross_val_predict(estimator=clf,X=X,y=y,cv=kf,method="predict_proba")
        else:
            # NOTE(review): these are in-sample predictions (the model was
            # fitted on X), unlike the out-of-fold predictions above.
            y_pred=clf.predict_proba(X)
        subm1 = pd.DataFrame(y_pred, columns=classes)
        subm1['ID'] = np.asarray(ID_train)
        os.makedirs(os.path.dirname(train_path), exist_ok=True)
        subm1.to_csv(train_path,index=False)


In [20]:
# Produce the stacking files for every configured classifier; the dictionary key
# doubles as the tag in the output file names.
for clf, estimator in dic_classifier.items():
    model_gen_nowdw(X_train_0, X_test_0, y, estimator, file=clf)

Fold 1 (2653, 1030) (668, 1030)
Fold 2 (2654, 1030) (667, 1030)
Fold 3 (2657, 1030) (664, 1030)
Fold 4 (2659, 1030) (662, 1030)
Fold 5 (2661, 1030) (660, 1030)
cross_val sur train
Fold 1 (2653, 1030) (668, 1030)
Fold 2 (2654, 1030) (667, 1030)
Fold 3 (2657, 1030) (664, 1030)
Fold 4 (2659, 1030) (662, 1030)
Fold 5 (2661, 1030) (660, 1030)
cross_val sur train
Fold 1 (2653, 1030) (668, 1030)
Fold 2 (2654, 1030) (667, 1030)
Fold 3 (2657, 1030) (664, 1030)
Fold 4 (2659, 1030) (662, 1030)
Fold 5 (2661, 1030) (660, 1030)
cross_val sur train


# REPEATABLE NEURAL NETWORK

In [50]:
from keras.callbacks import EarlyStopping,ModelCheckpoint

In [130]:
# Sanity check: (number of rows, number of feature columns) of the training matrix.
X_train_0.shape

(3321, 1030)

In [132]:
def fit_model_nn(X, y, X_test=None, epochs=100, batch_size=50, monitor="loss"):
    """Train a fresh dense network and return its test-set probabilities.

    The network has three Dense(300, relu) layers (Dropout(0.2) after the first
    two) and a softmax output. Training holds out 20% of the data
    (validation_split=0.2), stops early after 10 epochs without improvement of
    `monitor`, and checkpoints the best epoch to the file "checkann", which is
    reloaded before predicting.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        Training features; the input layer is sized from X.shape[1].
    y : array of shape (n_samples, n_classes)
        One-hot encoded targets; the output layer is sized from y.shape[1].
    X_test : array, optional
        Features to predict on. Defaults to the module-level `X_test_0`.
    epochs : int, default 100
        Maximum number of training epochs.
    batch_size : int, default 50
        Mini-batch size.
    monitor : str, default "loss"
        Quantity tracked by early stopping and checkpointing. The default is
        the training loss; "val_loss" would select on the held-out split
        instead.

    Returns
    -------
    array
        Predicted class probabilities for `X_test`.
    """
    if X_test is None:
        X_test = X_test_0
    model = Sequential()
    model.add(Dense(300,input_dim=X.shape[1],activation="relu"))
    model.add(Dropout(0.2))
    model.add(Dense(300,activation="relu"))
    model.add(Dropout(0.2))
    model.add(Dense(300,activation="relu"))
    model.add(Dense(y.shape[1],activation="softmax"))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    es=EarlyStopping(monitor=monitor,mode="min",patience=10)
    checkpoint=ModelCheckpoint(filepath="checkann",save_best_only=True,monitor=monitor,mode="min")
    model.fit(X, y,epochs=epochs,batch_size=batch_size,callbacks=[es,checkpoint],validation_split=0.2)
    # Restore the best checkpoint rather than the weights of the last epoch.
    model.load_weights("checkann")
    return model.predict_proba(X_test)

In [133]:
# One-hot encode the labels into 9 columns, as required by the
# categorical_crossentropy loss used by the networks.
y_cat=to_categorical(y,9)

In [111]:
# 5-fold stratified cross-validation of the dense network.
# The fold index arrays get their own names (train_idx / valid_idx) so that the
# loop does not overwrite the `train` and `test` DataFrames defined earlier.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=26)
cvscores = []
for train_idx, valid_idx in kfold.split(X_train_0, y):
    # A fresh model per fold: Dense 100 -> 100 -> 300 with dropout, softmax over 9 classes.
    model = Sequential()
    model.add(Dense(100, input_dim=1030, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(300,activation="relu"))
    model.add(Dense(9, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # Early stopping and checkpointing both track the training loss ("loss"),
    # even though 20% of the fold's training data is held out for validation.
    es=EarlyStopping(monitor="loss",mode="min",patience=10)
    checkpoint=ModelCheckpoint(filepath="checknn",save_best_only=True,monitor="loss",mode="min")
    model.fit(X_train_0[train_idx], y_cat[train_idx],epochs=100,batch_size=50,validation_split=0.2,callbacks=[es,checkpoint],verbose=0)
    # Evaluate the best checkpoint, not the weights of the last epoch.
    model.load_weights("checknn")
    fold_metrics = model.evaluate(X_train_0[valid_idx], y_cat[valid_idx])
    # Index 0 of the evaluation result is the loss (categorical cross-entropy).
    print("%s: %.2f" % (model.metrics_names[0], fold_metrics[0]))
    cvscores.append(fold_metrics[0])
print("%.2f (+/- %.2f)" % (np.mean(cvscores), np.std(cvscores)))

2.22 (+/- 0.25)


In [124]:
# Write the network's test probabilities with columns class1..class9 plus ID.
# NOTE(review): `y_sol` is not defined in any visible cell -- confirm where it
# comes from before relying on a fresh-kernel run.
classes = ["class{}".format(k) for k in range(1, 10)]
subm = pd.DataFrame(y_sol, columns=classes)
subm['ID'] = ID_test
subm.to_csv("scores/stack_test/neural_net_overfitting.csv", index=False)

In [None]:
# Train the network `repeats` times from scratch and keep each run's test
# predictions under the keys rep_0 .. rep_<repeats-1>.
repeats = 10
scores = {}
y_onehot = to_categorical(y, 9)
for i in range(repeats):
    scores["rep_{}".format(i)] = fit_model_nn(X_train_0, y_onehot)

Train on 2656 samples, validate on 665 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100


Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100


Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100


Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100


Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100