In [26]:
import os
import re
import string

import nltk
import numpy as np
import pandas as pd
import xgboost
from keras.layers import Dense,Flatten,Dropout
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.wrappers.scikit_learn import KerasClassifier
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
# Fix NumPy's global random seed so that NumPy-driven randomness is repeatable.
# NOTE(review): only NumPy's generator is seeded in this cell; RNGs owned by
# other libraries (e.g. the Keras backend) are not touched here -- confirm
# whether that matters for reproducing the neural-network results.
np.random.seed(26)

In [2]:
# Variant tables are read from the "bases" folder under the current working
# directory; `cwd` is reused by the text-loading cell.
cwd = os.getcwd()
train_variants_path = cwd + '/bases/training_variants'
test_variants_path = cwd + '/bases/test_variants'
train = pd.read_csv(train_variants_path)
test = pd.read_csv(test_variants_path)

In [3]:
# The text files separate the ID from the article text with a literal "||".
# The separator is a regex (hence engine='python') and is written as a raw
# string: in a normal string literal "\|" is an invalid escape sequence that
# newer Python versions warn about.  The first line of each file is skipped and
# explicit column names are supplied instead.
text_read_options = dict(
    sep=r"\|\|", engine='python', header=None, skiprows=1,
    names=["ID","Text"], encoding="utf-8",
)
train_texts = pd.read_csv(cwd + '/bases/training_text', **text_read_options)
test_texts = pd.read_csv(cwd + '/bases/test_text', **text_read_options)

In [4]:
# Attach the article text to every variant row.  A left join on ID keeps each
# variant row even when no text row matches it.
train = train.merge(train_texts, how='left', on='ID')
test = test.merge(test_texts, how='left', on='ID')

In [5]:
# Stack train and test (train rows first) so that text cleaning is applied to
# both in one pass; the index is renumbered from 0.
data_all = pd.concat([train, test], axis=0, ignore_index=True)

In [6]:
# English stop-word list from NLTK, held as a set for fast membership tests.
stop = set(stopwords.words('english'))
# Characters that clean() deletes outright: ASCII punctuation plus all digits.
# Note that ',', '.', '/' and '-' are NOT part of this set.
exclude = set('!"#$%&\'()*+:;<=>?@[\\]^_`{|}~0123456789') 
# Lemmatizer used by clean() only when it is called with lemmatiz=True.
lemma = WordNetLemmatizer()
def clean(doc,lemmatiz=False):
    """Normalise one document: lower-case, drop stop words, strip punctuation and digits.

    Steps, in order:
      1. lower-case the text, split on whitespace and drop tokens found in the
         module-level ``stop`` set (tokens are compared *before* punctuation is
         removed, so a token such as ``"the,"`` is kept);
      2. replace every ``,``, ``.`` and ``/`` with a space;
      3. delete every character contained in the module-level ``exclude`` set;
      4. if ``lemmatiz`` is true, lemmatize each whitespace-separated word with
         the module-level ``lemma`` object and join the words with single spaces.

    Parameters
    ----------
    doc : str
        Raw text.  Any non-string value (for example the NaN pandas produces
        for a missing text after a left merge) is treated as an empty document
        instead of raising ``AttributeError``.
    lemmatiz : bool, default False
        Whether to lemmatize the cleaned words.

    Returns
    -------
    str
        The cleaned text.  Without lemmatization, runs of spaces left behind by
        removed punctuation are preserved.
    """
    if not isinstance(doc, str):
        return ""

    stop_free = " ".join(word for word in doc.lower().split() if word not in stop)

    # One translation table replaces a regex call per character: characters in
    # `exclude` are deleted, while ',', '.' and '/' become spaces.
    table = {ord(ch): None for ch in exclude}
    table.update({ord(ch): " " for ch in ",./"})
    punc_free = stop_free.translate(table)

    if lemmatiz:
        return " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return punc_free

In [7]:
# Lemmatization is left off for now (author's caution: do not lemmatize text
# that will later be fed to word2vec).
data_all["Text"] = data_all["Text"].map(clean)

In [8]:
# Undo the earlier concatenation: the first len(train) rows are the training
# set, everything after them is the test set.
n_train = len(train)
train, test = data_all.iloc[:n_train], data_all.iloc[n_train:]

In [9]:
# Labels are shifted down by one so that they start at 0.
y=train["Class"]-1
# Keep only the text column as the starting point for the feature frames.
non_feature_cols = ["Class","ID","Gene","Variation"]
X_train=train.drop(non_feature_cols,axis=1)
X_test=test.drop(non_feature_cols,axis=1)
# The TF-IDF vocabulary is fitted on distinct training texts only.
txt_no_dup=train["Text"].drop_duplicates()
# model_gen_nowdw() reads ID_train / ID_test as globals when it writes its CSV
# files.  They are stored as plain arrays (not Series) so that assigning them
# to a freshly built DataFrame is positional rather than index-aligned: the
# test rows keep index labels starting at len(train).
ID_train=train["ID"].values
ID_test=test["ID"].values

In [10]:
# Word-level TF-IDF over 1- to 3-grams.  The vocabulary is capped at 15000
# terms and a term must occur in at least 10 documents.  Lower-casing is
# switched off here, and the vectorizer is fitted on the de-duplicated
# training texts.
tfidf_options = dict(
    analyzer='word', token_pattern=r'\w+', ngram_range=(1,3),
    lowercase=False, strip_accents=None,
    min_df=10, max_features=15000,
    use_idf=True, smooth_idf=True, sublinear_tf=True,
)
tfidf = TfidfVectorizer(**tfidf_options).fit(txt_no_dup)

In [11]:
# Project both splits onto the vocabulary learned from the training texts.
X_train_text, X_test_text = (
    tfidf.transform(split["Text"]) for split in (train, test)
)

In [12]:
# One truncated SVD per target dimensionality.  Each is fitted on the training
# TF-IDF matrix and then applied to the test matrix.
list_comp = [10, 20, 30, 40, 50, 60, 80, 100, 120, 150, 170, 200]

dic_svd, tsvd_train, tsvd_test = {}, {}, {}
for comp in list_comp:
    dic_svd[comp] = TruncatedSVD(n_components=comp, n_iter=10, random_state=26)

for sv, reducer in dic_svd.items():
    tsvd_train[sv] = reducer.fit_transform(X_train_text)
    tsvd_test[sv] = reducer.transform(X_test_text)

In [13]:
# Append every SVD component as its own feature column, named tsvd_<n>_<i>
# where n is the size of the decomposition and i the component index.
for n in dic_svd:
    for i in range(n):
        column = 'tsvd_' +str(n)+"_"+str(i)
        X_train[column] = tsvd_train[n][:, i]
        # Use the decomposition of size n for the test set as well.  Indexing
        # with a variable left over from an earlier loop would fill every test
        # column from one single decomposition and desynchronise train/test.
        X_test[column] = tsvd_test[n][:, i]

In [14]:
# Drop the raw text column so that only the numeric SVD features remain, and
# hand plain NumPy matrices to the classifiers.
X_train_0 = X_train.drop("Text", axis=1).values
X_test_0 = X_test.drop("Text", axis=1).values

In [15]:
# Neural-network classifier (the model built below is a dense feed-forward network; no LSTM layer is used)

In [16]:
# One-hot encode the 0-based labels into 9 columns for the softmax output.
y_ctg = to_categorical(y, num_classes=9)

In [21]:
# create the model
# Three 300-unit ReLU layers (dropout 0.2 after the first two) followed by a
# 9-way softmax.  The input width is taken from the feature matrix itself
# rather than hard-coded, so changing the list of SVD sizes cannot silently
# desynchronise the network from its input.
n_features = X_train_0.shape[1]
model = Sequential()
model.add(Dense(300,input_dim=n_features,activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(300,activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(300,activation="relu"))
model.add(Dense(9,activation="softmax"))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train_0,y_ctg,epochs=40,batch_size=50)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x18ad18e80>

In [27]:
# Candidate first-level models, keyed by the name used in their output files.
# The three XGBoost variants share the objective and seed and differ only in
# tree count, depth and row subsampling; "Neural Net" is the Keras model
# fitted above.
xgb_shared = dict(objective="multi:softprob", seed=26)
dic_classifier = {
    "XGB_medium": XGBClassifier(n_estimators=200, max_depth=5, subsample=0.7, **xgb_shared),
    "XGB_small": XGBClassifier(max_depth=2, subsample=0.5, **xgb_shared),
    "XGB_tall": XGBClassifier(n_estimators=300, max_depth=7, subsample=0.9, **xgb_shared),
    "Neural Net": model,
}

In [28]:
def model_gen_nowdw(X,X_test,y,classifier,file,neural_net=False):
    """Write the stacking inputs (class probabilities) of one classifier to CSV.

    Two files are produced, each with columns ``class1`` .. ``class9`` plus ``ID``:

    * ``scores/stack_test/nowdw_<file>.csv`` -- test-set probabilities.  For a
      scikit-learn style classifier they are the mean of ``predict_proba(X_test)``
      over the models refitted on each training split of a 5-fold stratified
      CV.  With ``neural_net=True`` the classifier is used as it is (no refit)
      and the output of ``classifier.predict(X_test)`` is written.
      This file is always (re)written.
    * ``scores/stack_train/nowdw_<file>.csv`` -- out-of-fold probabilities for
      the training rows, from ``cross_val_predict`` on the same fold definition.
      This step is skipped when the file already exists.

    Parameters
    ----------
    X : array-like
        Training feature matrix (converted with ``np.asarray``).
    X_test : array-like
        Test feature matrix.
    y : array-like
        Integer class labels for ``X`` (converted with ``np.asarray``).
    classifier : estimator
        Object exposing ``fit`` / ``predict_proba``, or, with
        ``neural_net=True``, an already fitted model exposing ``predict``.
        The same object is refitted on every fold.
    file : str
        Suffix used in both output file names.
    neural_net : bool, default False
        Select the "predict only" branch described above.

    Notes
    -----
    Reads the module-level ``ID_train`` and ``ID_test`` for the ``ID`` columns.
    """
    n_splits = 5
    classes = "class1,class2,class3,class4,class5,class6,class7,class8,class9".split(',')
    test_path = "scores/stack_test/nowdw_{}.csv".format(file)
    train_path = "scores/stack_train/nowdw_{}.csv".format(file)

    # Fold indices are positional.  Converting to arrays keeps X[idx] / y[idx]
    # positional even when a DataFrame or a Series with a non-default index
    # is passed in.
    X = np.asarray(X)
    y = np.asarray(y)

    kf = StratifiedKFold(n_splits=n_splits, random_state=26, shuffle=True)
    clf = classifier
    if not neural_net:
        y_test = 0
        for fold, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
            X_train, y_train = X[train_index], y[train_index]
            print("Fold", fold, X_train.shape, X[test_index].shape)
            clf.fit(X_train, y_train)
            # Average over the actual number of folds instead of a literal 5.
            y_test = y_test + clf.predict_proba(X_test) / n_splits
    else:
        print("One Fold predict for NN")
        y_test = clf.predict(X_test)

    subm = pd.DataFrame(y_test, columns=classes)
    # np.asarray makes the assignment positional; a Series would be aligned on
    # its index labels and could yield NaN IDs.
    subm['ID'] = np.asarray(ID_test)
    os.makedirs(os.path.dirname(test_path), exist_ok=True)
    subm.to_csv(test_path, index=False)

    print("cross_val sur train")  # maybe the array conversion is only needed for the XGB models

    if os.path.isfile(train_path):
        print("not necessary, already done")
    else:
        # NOTE(review): cross_val_predict clones the estimator, so this step
        # presumably fails for a raw Keras model (neural_net=True) unless it is
        # wrapped in a scikit-learn compatible estimator -- confirm.
        y_pred = cross_val_predict(estimator=clf, X=X, y=y, cv=kf, method="predict_proba")
        subm1 = pd.DataFrame(y_pred, columns=classes)
        subm1['ID'] = np.asarray(ID_train)
        os.makedirs(os.path.dirname(train_path), exist_ok=True)
        subm1.to_csv(train_path, index=False)


In [29]:
# Produce the stacking files for every configured classifier.  The Keras
# network was already fitted on one-hot labels, so it must go through the
# function's neural_net branch (predict only).  Sending it through the
# scikit-learn fold loop would call fit() on it with integer labels.
for clf_name, estimator in dic_classifier.items():
    model_gen_nowdw(
        X_train_0, X_test_0, y, estimator, file=clf_name,
        neural_net=isinstance(estimator, Sequential),
    )

Fold 1 (2653, 1030) (668, 1030)


KeyboardInterrupt: 