In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn.decomposition import TruncatedSVD
from sklearn import metrics
import xgboost

import string

In [2]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Embedding, Input, RepeatVector, Bidirectional
from keras.optimizer_v1 import SGD
from tensorflow import keras

In [3]:
import warnings
warnings.filterwarnings(action = "ignore")

In [4]:
dfTrain = pd.read_csv("training_text", sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"])
dfTrainVariant = pd.read_csv("training_variants")

In [5]:
train = pd.merge(dfTrain,dfTrainVariant, how = "inner", on = "ID")

In [6]:
train.head()

Unnamed: 0,ID,Text,Gene,Variation,Class
0,0,Cyclin-dependent kinases (CDKs) regulate a var...,FAM58A,Truncating Mutations,1
1,1,Abstract Background Non-small cell lung canc...,CBL,W802*,2
2,2,Abstract Background Non-small cell lung canc...,CBL,Q249E,2
3,3,Recent evidence has demonstrated that acquired...,CBL,N454D,3
4,4,Oncogenic mutations in the monomeric Casitas B...,CBL,L399V,4


In [7]:
train.isna().sum()

ID           0
Text         5
Gene         0
Variation    0
Class        0
dtype: int64

In [8]:
train.set_index("ID")
train.dropna(inplace=True)

In [9]:
train.shape

(3316, 5)

In [10]:
stop_words = set(stopwords.words("english"))

In [11]:
def preprocessamento(text):
    global stop_words
    text = text.lower()
    text = text.translate(str.maketrans("","", string.punctuation))
    
    return text

In [12]:
train["Text2"] = train["Text"].map(preprocessamento)

In [13]:
tfidf =TfidfVectorizer(min_df = 1, ngram_range=(1,2), max_features=500)

In [14]:
text_train = tfidf.fit_transform(train["Text2"].values).toarray()

In [15]:
train2 = pd.DataFrame(text_train, index = train.index)

In [16]:
svd_truncated = TruncatedSVD(n_components=70)
truncated_train = pd.DataFrame(svd_truncated.fit_transform(train2))
truncated_train["ID"] = train["ID"]
truncated_train.set_index("ID")
truncated_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,61,62,63,64,65,66,67,68,69,ID
0,0.843649,-0.046907,0.00154,0.029399,0.024141,0.052501,-0.064564,-0.038035,-0.002667,-0.07945,...,-0.046292,-0.001439,0.028267,-0.003859,-0.036955,0.027883,-0.003614,-0.018703,-0.006943,0.0
1,0.923306,-0.109443,-0.080396,-0.084001,0.060329,-0.040353,0.081776,0.005878,-0.000696,-0.065425,...,0.02827,-0.004321,-0.022104,-0.01671,-0.026,0.030286,-0.018505,0.013953,-0.014941,1.0
2,0.923306,-0.109443,-0.080396,-0.084001,0.060329,-0.040353,0.081776,0.005878,-0.000696,-0.065425,...,0.02827,-0.004321,-0.022104,-0.01671,-0.026,0.030286,-0.018505,0.013953,-0.014941,2.0
3,0.926012,-0.082498,-0.072305,-0.060691,-0.004379,0.041465,-0.0549,0.013517,-0.023545,-0.013946,...,-0.012383,-0.072557,0.010622,-0.044503,-0.020163,-0.015679,-0.044632,-0.010895,-0.013575,3.0
4,0.934875,-0.02359,0.018452,-0.069885,0.075806,-0.05934,0.021786,-0.02212,0.006394,-0.039436,...,-0.030594,0.001232,0.034414,0.025017,0.02567,0.013742,-0.032502,-0.029031,-0.017881,4.0


In [17]:
one_hot_enc_gene_var = pd.get_dummies(train,columns = ["Gene","Variation"],drop_first=True)
one_hot_enc_gene_var.drop(["Text","Text2","Class"], axis = 1, inplace = True)
one_hot_enc_gene_var.head()

Unnamed: 0,ID,Gene_ACVR1,Gene_AGO2,Gene_AKT1,Gene_AKT2,Gene_AKT3,Gene_ALK,Gene_APC,Gene_AR,Gene_ARAF,...,Variation_Y87N,Variation_Y901C,Variation_Y931C,Variation_Y98H,Variation_Y98N,Variation_YAP1-FAM118B Fusion,Variation_YAP1-MAMLD1 Fusion,Variation_ZC3H7B-BCOR Fusion,Variation_ZNF198-FGFR1 Fusion,Variation_p61BRAF
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
one_hot_enc_gene_var = one_hot_enc_gene_var.set_index("ID")
train = train.set_index("ID")

In [19]:
one_hot_enc_gene_var.head()

Unnamed: 0_level_0,Gene_ACVR1,Gene_AGO2,Gene_AKT1,Gene_AKT2,Gene_AKT3,Gene_ALK,Gene_APC,Gene_AR,Gene_ARAF,Gene_ARID1A,...,Variation_Y87N,Variation_Y901C,Variation_Y931C,Variation_Y98H,Variation_Y98N,Variation_YAP1-FAM118B Fusion,Variation_YAP1-MAMLD1 Fusion,Variation_ZC3H7B-BCOR Fusion,Variation_ZNF198-FGFR1 Fusion,Variation_p61BRAF
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
train.drop(["Text","Gene","Variation"],axis =1, inplace = True)
train.head()

Unnamed: 0_level_0,Class,Text2
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,cyclindependent kinases cdks regulate a variet...
1,2,abstract background nonsmall cell lung cance...
2,2,abstract background nonsmall cell lung cance...
3,3,recent evidence has demonstrated that acquired...
4,4,oncogenic mutations in the monomeric casitas b...


In [21]:
#train = pd.merge(dfTrain,dfTrainVariant, how = "inner", on = "ID")
train2 = pd.merge(truncated_train,one_hot_enc_gene_var, how = 'inner', on = "ID")
train2["Class"] = train["Class"]
train2.dropna(inplace = True)
train2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,Variation_Y901C,Variation_Y931C,Variation_Y98H,Variation_Y98N,Variation_YAP1-FAM118B Fusion,Variation_YAP1-MAMLD1 Fusion,Variation_ZC3H7B-BCOR Fusion,Variation_ZNF198-FGFR1 Fusion,Variation_p61BRAF,Class
0,0.843649,-0.046907,0.00154,0.029399,0.024141,0.052501,-0.064564,-0.038035,-0.002667,-0.07945,...,0,0,0,0,0,0,0,0,0,1.0
1,0.923306,-0.109443,-0.080396,-0.084001,0.060329,-0.040353,0.081776,0.005878,-0.000696,-0.065425,...,0,0,0,0,0,0,0,0,0,2.0
2,0.923306,-0.109443,-0.080396,-0.084001,0.060329,-0.040353,0.081776,0.005878,-0.000696,-0.065425,...,0,0,0,0,0,0,0,0,0,2.0
3,0.926012,-0.082498,-0.072305,-0.060691,-0.004379,0.041465,-0.0549,0.013517,-0.023545,-0.013946,...,0,0,0,0,0,0,0,0,0,3.0
4,0.934875,-0.02359,0.018452,-0.069885,0.075806,-0.05934,0.021786,-0.02212,0.006394,-0.039436,...,0,0,0,0,0,0,0,0,0,4.0


In [22]:
X = train2.drop("Class", axis=1)
y = train2.Class

X_train, X_test, y_train, y_test  = train_test_split(X,y, test_size=0.20,shuffle=True)

In [23]:
modelXGB = xgboost.XGBClassifier()
modelXGB.fit(X_train,y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [24]:
predXGBTest = modelXGB.predict(X_test)
metrics.accuracy_score(y_test,predXGBTest)

0.5226586102719033

In [25]:
predXGBTrain = modelXGB.predict(X_train)
metrics.accuracy_score(y_train,predXGBTrain)

0.966338880484115

In [26]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

In [27]:
model = rf.fit(X_train,y_train)

In [28]:
predTest = model.predict(X_test)
metrics.accuracy_score(y_test,predTest)

0.48036253776435045

In [29]:
predTrain = model.predict(X_train)
metrics.accuracy_score(y_train,predTrain)

1.0

In [30]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier()

In [31]:
modelTree = tree.fit(X_train,y_train)
predTest = modelTree.predict(X_test)
predTrain = modelTree.predict(X_train)
metrics.accuracy_score(y_test, predTest), metrics.accuracy_score(y_train,predTrain)

(0.4169184290030212, 1.0)

In [32]:
X_train.shape

(2644, 3324)

In [33]:
def baseline_model():
    model = Sequential()
    model.add(Dense(512, input_dim=3324, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(512, activation='relu'))
    model.add(Dense(1, activation="softmax"))
    model.compile(loss='poisson', optimizer='sgd', metrics=['accuracy'])
    return model

In [34]:
modelNN = baseline_model()

In [35]:
estimatorNN = modelNN.fit(X_train,y_train,epochs = 10, validation_split=0.2, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [36]:
estimatorNN.history

{'loss': [0.9999996423721313,
  0.9999996423721313,
  0.9999996423721313,
  0.9999997615814209,
  0.9999996423721313,
  0.9999996423721313,
  0.9999996423721313,
  0.9999996423721313,
  0.9999997615814209,
  0.9999997615814209],
 'accuracy': [0.1773049682378769,
  0.1773049682378769,
  0.1773049682378769,
  0.1773049682378769,
  0.1773049682378769,
  0.1773049682378769,
  0.1773049682378769,
  0.1773049682378769,
  0.1773049682378769,
  0.1773049682378769],
 'val_loss': [0.9999994039535522,
  0.9999994039535522,
  0.9999994039535522,
  0.9999994039535522,
  0.9999994039535522,
  0.9999994039535522,
  0.9999994039535522,
  0.9999994039535522,
  0.9999994039535522,
  0.9999994039535522],
 'val_accuracy': [0.13988657295703888,
  0.13988657295703888,
  0.13988657295703888,
  0.13988657295703888,
  0.13988657295703888,
  0.13988657295703888,
  0.13988657295703888,
  0.13988657295703888,
  0.13988657295703888,
  0.13988657295703888]}

In [37]:
max_features = 3324

In [38]:
inputs = keras.Input(shape=(None,), dtype="float32")

x = Embedding(max_features, 256)(inputs)

x = Bidirectional(LSTM(128, return_sequences=True))(x)
x = Bidirectional(LSTM(128))(x)

outputs = Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs, outputs)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 256)         850944    
                                                                 
 bidirectional (Bidirectiona  (None, None, 256)        394240    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 256)              394240    
 nal)                                                            
                                                                 
 dense_5 (Dense)             (None, 1)                 257       
                                                                 
Total params: 1,639,681
Trainable params: 1,639,681
Non-train

In [39]:
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics = ["accuracy"])

In [40]:
model.fit(X_train, y_train, batch_size=32, epochs=2, validation_data=(X_test, y_test))

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x269e866e6c8>

In [41]:
modelXGB.save_model("Modelo1.json")

In [43]:
model.save("modelKeras")



INFO:tensorflow:Assets written to: modelKeras\assets


INFO:tensorflow:Assets written to: modelKeras\assets
