In [1]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
df=pd.read_csv('data.csv')

In [3]:
df.head()

Unnamed: 0,ID,Gene,Variation,Class,TEXT
0,0,FAM58A,Truncating Mutations,1,cyclin dependent kinases cdks regulate variety...
1,1,CBL,W802*,2,abstract background non small cell lung cancer...
2,2,CBL,Q249E,2,abstract background non small cell lung cancer...
3,3,CBL,N454D,3,recent evidence demonstrated acquired uniparen...
4,4,CBL,L399V,4,oncogenic mutations monomeric casitas b lineag...


In [4]:
df['Gene'].unique().shape

(264,)

In [5]:
X=df.copy()

In [6]:
X=X.drop(['Class'],axis=1)

In [7]:
X=X.drop(['ID'],axis=1)

In [8]:
Y=df['Class']

In [9]:
df.TEXT.astype('str')

0       cyclin dependent kinases cdks regulate variety...
1       abstract background non small cell lung cancer...
2       abstract background non small cell lung cancer...
3       recent evidence demonstrated acquired uniparen...
4       oncogenic mutations monomeric casitas b lineag...
                              ...                        
3316    introduction myelodysplastic syndromes mds het...
3317    introduction myelodysplastic syndromes mds het...
3318    runt related transcription factor gene runx al...
3319    runx aml gene frequent target chromosomal tran...
3320    frequent mutations associated leukemia recurre...
Name: TEXT, Length: 3321, dtype: object

In [10]:
df = df.reset_index(drop=True)
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    text = text.replace('x', '')
#    text = re.sub(r'\W+', '', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text
df['TEXT'] = df['TEXT'].astype('str').apply(clean_text)
df['TEXT'] = df['TEXT'].astype('str').str.replace('\d+', '')

In [11]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 2000
# This is fixed.
EMBEDDING_DIM = 100
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['TEXT'].astype('str').values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 91934 unique tokens.


In [12]:
X_Text = tokenizer.texts_to_sequences(df['TEXT'].astype('str').values)
X_Text = pad_sequences(X_Text, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X_Text.shape)

Shape of data tensor: (3321, 250)


In [13]:
import pickle

In [14]:
pickle.dump(tokenizer, open('tokenizer.pkl', 'wb'))

In [15]:
# load the model from disk
filename = 'tokenizer.pkl'
to = pickle.load(open(filename, 'rb'))

In [16]:
txt = to.texts_to_sequences(['Abstract Background  Non-small cell lung cancer (NSCLC) is a heterogeneous group of disorders with a number of genetic and proteomic alterations. c-CBL is an'])
txt  = pad_sequences(txt, maxlen=MAX_SEQUENCE_LENGTH)
txt.shape

(1, 250)

In [17]:
Y_Text = pd.get_dummies(df['Class']).values
print('Shape of label tensor:', Y_Text.shape)

Shape of label tensor: (3321, 9)


In [18]:
df.groupby(df['Class']).count()

Unnamed: 0_level_0,ID,Gene,Variation,TEXT
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,568,568,568,568
2,452,452,452,452
3,89,89,89,89
4,686,686,686,686
5,242,242,242,242
6,275,275,275,275
7,953,953,953,953
8,19,19,19,19
9,37,37,37,37


In [19]:
X_train_text, X_test_text, Y_train_text, Y_test_text = train_test_split(X_Text,Y_Text, test_size = 0.1, random_state = 42)
print(X_train_text.shape,Y_train_text.shape)
print(X_test_text.shape,Y_test_text.shape)

(2988, 250) (2988, 9)
(333, 250) (333, 9)


In [20]:
from tensorflow.python.keras.layers import LSTM, Dense,Embedding,SpatialDropout1D,Conv1D, MaxPooling1D, Dropout
from tensorflow.python.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
#from keras.callbacks import ModelCheckpoint, EarlyStopping

In [21]:
# Convolution
kernel_size = 70
filters = 128
pool_size = 4

model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X_Text.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
model.add(MaxPooling1D(pool_size=pool_size))
#model.add(Dropout(0.5))
model.add(LSTM(50, dropout=0.2, recurrent_dropout=0.2))
# model.add(Dropout(0.5))
# model.add(Dense(100,activation='relu'))
model.add(Dense(9, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 30
batch_size = 64

history = model.fit(X_train_text, Y_train_text, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Train on 2689 samples, validate on 299 samples
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30


In [22]:
accr = model.evaluate(X_test_text,Y_test_text)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))
accr = model.evaluate(X_train_text,Y_train_text)
print('Train set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 1.199
  Accuracy: 0.631
Train set
  Loss: 0.534
  Accuracy: 0.805


In [23]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 250, 100)          5000000   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 250, 100)          0         
_________________________________________________________________
conv1d (Conv1D)              (None, 181, 128)          896128    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 45, 128)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 50)                35800     
_________________________________________________________________
dense (Dense)                (None, 9)                 459       
Total params: 5,932,387
Trainable params: 5,932,387
Non-trainable params: 0
______________________________________________

In [24]:
"""import json
model_json = model.to_json()
with open("model_63_80.json", "w") as json_file:
    json_file.dump(model_json)
# serialize weights to HDF5
model.save_weights("model_63_80.h5")"""

'import json\nmodel_json = model.to_json()\nwith open("model_63_80.json", "w") as json_file:\n    json_file.dump(model_json)\n# serialize weights to HDF5\nmodel.save_weights("model_63_80.h5")'

In [25]:
import json

# lets assume `model` is main model 
model_json = model.to_json()
with open("model_63_80.json", "w") as json_file:
    json.dump(model_json, json_file)
    
# serialize weights to HDF5
model.save_weights("model_63_80.h5")

In [26]:
from keras.models import model_from_json
# serialize model to JSON
model_json = model.to_json()
with open("model_63_80.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model_63_80.h5")
print("Saved model to disk")
 
# later...
 
# load json and create model
json_file = open('model_63_80.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model_63_80.h5")
print("Loaded model from disk")

Saved model to disk


ValueError: Unknown initializer: GlorotUniform

In [None]:
# MLP for Pima Indians Dataset serialize to YAML and HDF5
from keras.models import Sequential
from keras.layers import Dense
from keras.models import model_from_yaml
import numpy
import os

# serialize model to YAML
model_yaml = model.to_yaml()
with open("model_63_80.yaml", "w") as yaml_file:
    yaml_file.write(model_yaml)
# serialize weights to HDF5
model.save_weights("model_63_80.h5")
print("Saved model to disk")

In [None]:
# load YAML and create model
yaml_file = open('model_63_80.yaml', 'r')
loaded_model_yaml = yaml_file.read()
yaml_file.close()


with CustomObjectScope({'GlorotUniform': glorot_uniform()}):
        model = load_model('model_63_80.h5')

In [None]:
model.summary()

In [None]:
import tensorflow as tf

In [None]:
tf.__version__

In [None]:
from keras import backend as K

inp = model.input                                           # input placeholder
outputs = [layer.output for layer in model.layers]          # all layer outputs
functor = K.function([inp, K.learning_phase()], outputs )   # evaluation function

# Testing
input_shape=X_Text.shape[1]
# test = np.random.random(input_shape)[np.newaxis,...]
test=X_train_text[0][np.newaxis,...]
layer_outs = functor([test, 1.])
print(layer_outs[4].shape)

In [None]:
np.random.random(input_shape)[np.newaxis,...].shape

## Text representations from lstm model

In [None]:
d={}
for i in range(50):
    d[i]=[]

In [None]:
# d={'text_representations_63_80':[]}
for tokens in X_Text:
    test=tokens[np.newaxis,...]
    layer_outs = functor([test, 1.])
    l=layer_outs[4].flatten().tolist()
    for i,rep in enumerate(l):
        d[i].append(l[i])

In [None]:
pd.DataFrame(d).to_csv('text_rep_lstm.csv',index=False)

In [None]:
X=X.drop(['TEXT'],axis=1)

In [None]:
X_dummies=pd.get_dummies(X)

In [None]:
X_dummies.shape

In [None]:
df_new=pd.concat([X_dummies,pd.DataFrame(d)],axis=1)

In [None]:
df_new.shape

In [None]:
df_new.to_csv('data/transormed_1.csv',index=False)

In [None]:
Y.to_csv('data/labels.csv',index=False)

## SVD for decomposition of whole components

In [None]:
import numpy as np
from sklearn.decomposition import TruncatedSVD

# A = np.array([[-1, 2, 0], [2, 0, -2], [0, -2, 1]])
# print("Original Matrix:")
# print(A)

svd =  TruncatedSVD(n_components = 1000)
A_transf = svd.fit_transform(df_new)

# print("Singular values:")
# print(svd.singular_values_)

print("Transformed Matrix after reducing to 2 features:")
print(A_transf.shape)

In [None]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(df_new,Y,test_size=0.2,random_state=0)

In [None]:
x_train.shape

In [None]:
rf

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier(min_samples_split=5,min_impurity_split=.2,n_estimators=100,max_leaf_nodes=2500,
                          max_depth=15)
rf.fit(x_train,y_train)

In [None]:
print("Train Accuracy:",rf.score(x_train,y_train))
print("Test Accuracy:",rf.score(x_test,y_test))

In [None]:
!pip install xgboost

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
from xgboost import XGBClassifier
xgb=XGBClassifier(n_estimators=500,learning_rate=0.4)
xgb.fit(x_train,y_train)

In [None]:
print("Train accuracy of xgb:",accuracy_score(xgb.predict(x_train),y_train))
print("Test accuracy of xgb:",accuracy_score(xgb.predict(x_test),y_test))

In [None]:
import pickle
pickle.dump(rf,open('model/rf_text_representation_lstm_50_78_99','wb'))

In [None]:
rf

## word2vec 

In [None]:
from gensim.models import Word2Vec

In [None]:
# define training data
sentences = [['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'],
			['this', 'is', 'the', 'second', 'sentence'],
			['yet', 'another', 'sentence'],
			['one', 'more', 'sentence'],
			['and', 'the', 'final', 'sentence']]
# train model
model = Word2Vec(l, min_count=1,size=1)
# summarize the loaded model
# print(model)
# summarize vocabulary
words = list(model.wv.vocab)
# print(words)
# access vector for one word
# print(model['sentence'])
# save model
model.save('model.bin')
# load model
new_model = Word2Vec.load('model.bin')
print(new_model)

In [None]:
l

In [None]:
model.wv[l[0]].shape

## Word2Vec pretrained model

In [None]:
from gensim.models import Word2Vec

In [None]:
from nltk.tokenize import word_tokenize
word_tokenize('hi how are you?. I am fine.')

In [None]:
sentence=[['Neeraj','Boy'],['Sarwan','is'],['good','boy']]
model = Word2Vec(sentence, min_count=1,size=300,workers=4)
print(model.similarity('good', 'boy'))

In [None]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [None]:
s='Hello everyone. Welcome to GeeksforGeeks. You are studying NLP article'

In [None]:
word_tokenize(sent_tokenize(df['TEXT'].astype('str').values[0]))

In [None]:
l=[]
for i in range(df.shape[0]):
    l.append(word_tokenize(df['TEXT'].astype('str').values[i]))
    print(i)

In [None]:
len(l)

In [None]:
len(l)