In [1]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
df=pd.read_csv('data.csv')

In [3]:
df.head()

Unnamed: 0,ID,Gene,Variation,Class,TEXT
0,0,FAM58A,Truncating Mutations,1,cyclin dependent kinases cdks regulate variety...
1,1,CBL,W802*,2,abstract background non small cell lung cancer...
2,2,CBL,Q249E,2,abstract background non small cell lung cancer...
3,3,CBL,N454D,3,recent evidence demonstrated acquired uniparen...
4,4,CBL,L399V,4,oncogenic mutations monomeric casitas b lineag...


In [4]:
df['Gene'].unique().shape

(264,)

In [5]:
X=df.copy()

In [6]:
X=X.drop(['Class'],axis=1)

In [7]:
X=X.drop(['ID'],axis=1)

In [8]:
Y=df['Class']

In [9]:
df.TEXT.astype('str')

0       cyclin dependent kinases cdks regulate variety...
1       abstract background non small cell lung cancer...
2       abstract background non small cell lung cancer...
3       recent evidence demonstrated acquired uniparen...
4       oncogenic mutations monomeric casitas b lineag...
                              ...                        
3316    introduction myelodysplastic syndromes mds het...
3317    introduction myelodysplastic syndromes mds het...
3318    runt related transcription factor gene runx al...
3319    runx aml gene frequent target chromosomal tran...
3320    frequent mutations associated leukemia recurre...
Name: TEXT, Length: 3321, dtype: object

In [10]:
df = df.reset_index(drop=True)
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    text = text.replace('x', '')
#    text = re.sub(r'\W+', '', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text
df['TEXT'] = df['TEXT'].astype('str').apply(clean_text)
df['TEXT'] = df['TEXT'].astype('str').str.replace('\d+', '')

In [11]:
df['TEXT'].values

array(['cyclin dependent kinases cdks regulate variety fundamental cellular processes cdk stands one last orphan cdks activating cyclin identified kinase activity revealed previous work shown cdk silencing increases ets v ets erythroblastosis virus e oncogene homolog driven activation mapk pathway confers tamoifen resistance breast cancer cells precise mechanisms cdk modulates ets activity generally functions cdk remain elusive demonstrate cdk cyclin dependent kinase identifying cyclin activating cyclin cyclin orphan cyclin product fama whose mutations cause star syndrome human developmental anomaly whose features include toe syndactyly telecanthus anogenital renal malformations show star syndrome associated cyclin mutants unable interact cdk cyclin silencing phenocopies cdk silencing increasing c raf conferring tamoifen resistance breast cancer cells cdk cyclin phosphorylates ets vitro cells positively controls ets degradation proteasome ets protein levels increased cells derived star

## Creating the Tokenizer object and fitting it with the vocabularity of text corpus

In [12]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 100
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['TEXT'].astype('str').values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 91934 unique tokens.


## Saving the Tokenizer and loading it

In [13]:
"""import pickle

# saving
with open('model/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)"""

"import pickle\n\n# saving\nwith open('model/tokenizer.pickle', 'wb') as handle:\n    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)"

In [14]:
"""del tokenizer"""

'del tokenizer'

In [15]:
"""# loading
with open('model/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)"""

"# loading\nwith open('model/tokenizer.pickle', 'rb') as handle:\n    tokenizer = pickle.load(handle)"

In [16]:
X_Text = tokenizer.texts_to_sequences(df['TEXT'].astype('str').values)
X_Text = pad_sequences(X_Text, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X_Text.shape)

Shape of data tensor: (3321, 250)


In [17]:
X_Text

array([[9737,  168,  251, ..., 2693, 3799, 2628],
       [ 255,    4,  477, ..., 4394, 1621,  746],
       [ 255,    4,  477, ..., 4394, 1621,  746],
       ...,
       [ 341, 2256,  186, ...,  180,   38,   24],
       [ 726,  335,   50, ...,   52,  369, 1146],
       [ 726,  335,   50, ...,   52,  369, 1146]])

In [18]:
df['TEXT'].astype('str').values.shape

(3321,)

In [19]:
Y_Text = pd.get_dummies(df['Class']).values
print('Shape of label tensor:', Y_Text.shape)

Shape of label tensor: (3321, 9)


In [20]:
df.groupby(df['Class']).count()

Unnamed: 0_level_0,ID,Gene,Variation,TEXT
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,568,568,568,568
2,452,452,452,452
3,89,89,89,89
4,686,686,686,686
5,242,242,242,242
6,275,275,275,275
7,953,953,953,953
8,19,19,19,19
9,37,37,37,37


In [21]:
X_train_text, X_test_text, Y_train_text, Y_test_text = train_test_split(X_Text,Y_Text, test_size = 0.1, random_state = 42)
print(X_train_text.shape,Y_train_text.shape)
print(X_test_text.shape,Y_test_text.shape)

(2988, 250) (2988, 9)
(333, 250) (333, 9)


In [22]:
from tensorflow.python.keras.layers import LSTM, Dense,Embedding,SpatialDropout1D,Conv1D, MaxPooling1D, Dropout
from tensorflow.python.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
#from keras.callbacks import ModelCheckpoint, EarlyStopping

In [23]:
# Convolution
kernel_size = 50
filters = 64
pool_size = 4

model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X_Text.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
model.add(MaxPooling1D(pool_size=pool_size))
#model.add(Dropout(0.5))
model.add(LSTM(50, dropout=0.4, recurrent_dropout=0.2))
# model.add(Dropout(0.5))
# model.add(Dense(100,activation='relu'))
model.add(Dense(9, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 30
batch_size = 64

history = model.fit(X_train_text, Y_train_text, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Train on 2689 samples, validate on 299 samples
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30


In [27]:
model_json = model.to_json()
with open("model_62_80.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model_62_80.h5")

In [28]:
from tensorflow.python.keras.models import model_from_json
# from tensorflow import keras
# load json and create model
json_file = open('model_62_80.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
model = model_from_json(loaded_model_json)
# load weights into new model
model.load_weights("model_62_80.h5")
print("Loaded model from disk")
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Loaded model from disk


In [29]:
accr = model.evaluate(X_test_text,Y_test_text)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))
accr = model.evaluate(X_train_text,Y_train_text)
print('Train set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 1.216
  Accuracy: 0.637
Train set
  Loss: 0.557
  Accuracy: 0.801


In [30]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 250, 100)          5000000   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 250, 100)          0         
_________________________________________________________________
conv1d (Conv1D)              (None, 201, 64)           320064    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 50, 64)            0         
_________________________________________________________________
lstm (LSTM)                  (None, 50)                23000     
_________________________________________________________________
dense (Dense)                (None, 9)                 459       
Total params: 5,343,523
Trainable params: 5,343,523
Non-trainable params: 0
______________________________________________

In [None]:
np.random.random(input_shape)[np.newaxis,...].shape

In [31]:
from tensorflow.python.keras.models import model_from_json
from keras import backend as K

#Loading the model
json_file = open('model_62_80.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
model = model_from_json(loaded_model_json)
# load weights into new model
model.load_weights("model_62_80.h5")
print("Loaded model from disk")
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
#Trying to get the output of lstm layer
inp = model.input                                           # input placeholder
outputs = [layer.output for layer in model.layers]          # all layer outputs
functor = K.function([inp, K.learning_phase()], outputs )   # evaluation function

# Testing
input_shape=X_Text.shape[1]
# test = np.random.random(input_shape)[np.newaxis,...]
test=X_train_text[0][np.newaxis,...]
layer_outs = functor([test, 1.])
print(layer_outs[4].shape)
new_text=layer_outs[4]

Loaded model from disk

(1, 50)


In [32]:
X_Text.shape

(3321, 250)

In [34]:
pd.read_csv('transormed_1.csv').columns[-72:]

Index(['Variation_Y772_A775dup', 'Variation_Y791F', 'Variation_Y801H',
       'Variation_Y803N', 'Variation_Y806C', 'Variation_Y823D',
       'Variation_Y835F', 'Variation_Y842C', 'Variation_Y846C',
       'Variation_Y849C', 'Variation_Y849S', 'Variation_Y87C',
       'Variation_Y87N', 'Variation_Y901C', 'Variation_Y931C',
       'Variation_Y98H', 'Variation_Y98N', 'Variation_YAP1-FAM118B Fusion',
       'Variation_YAP1-MAMLD1 Fusion', 'Variation_ZC3H7B-BCOR Fusion',
       'Variation_ZNF198-FGFR1 Fusion', 'Variation_p61BRAF', '0', '1', '2',
       '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15',
       '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27',
       '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39',
       '40', '41', '42', '43', '44', '45', '46', '47', '48', '49'],
      dtype='object')

In [35]:
from keras import backend as K

inp = model.input                                           # input placeholder
outputs = [layer.output for layer in model.layers]          # all layer outputs
functor = K.function([inp, K.learning_phase()], outputs )   # evaluation function

# Testing
input_shape=X_Text.shape[1]
# test = np.random.random(input_shape)[np.newaxis,...]
test=X_train_text[0][np.newaxis,...]
layer_outs = functor([test, 1.])
print(layer_outs[4].shape)

(1, 50)


In [36]:
layer_outs

[array([[[ 0.00583301, -0.05062416,  0.00659524, ..., -0.00127503,
          -0.04430197, -0.0349465 ],
         [ 0.00125504,  0.03705791,  0.03220299, ...,  0.04336581,
          -0.07892461,  0.00221441],
         [-0.001146  , -0.02411515,  0.06484778, ...,  0.02242417,
          -0.00965122, -0.02020527],
         ...,
         [ 0.04670335,  0.01182109,  0.13895577, ..., -0.01640785,
           0.00200837,  0.03418484],
         [ 0.06936964,  0.02474812, -0.0600392 , ..., -0.00681352,
           0.02312789,  0.02421081],
         [-0.04274713, -0.00165123, -0.00705071, ...,  0.01860771,
          -0.04025328,  0.01690738]]], dtype=float32),
 array([[[ 0.00729126, -0.06328019,  0.00824405, ..., -0.00159379,
          -0.05537746, -0.04368313],
         [ 0.0015688 ,  0.04632238,  0.04025373, ...,  0.05420726,
          -0.09865576,  0.00276801],
         [-0.0014325 , -0.03014393,  0.08105972, ...,  0.02803021,
          -0.01206403, -0.02525659],
         ...,
         [ 0.05837

## Text representations from lstm model

In [37]:
d={}
for i in range(50):
    d[i]=[]

In [38]:
X_Text[0]

array([ 9737,   168,   251,   526,     5,   159, 14322, 17910,     2,
        6189,   256, 12180, 18981,   956,    45,  9230,  1678,   202,
        2310,   357,    81,  6338,  3958, 40624, 40625,   420,   656,
       18981,     1,   678,   238,  1059,  1760, 18981, 18981,   271,
         814, 10345,   833,   114,    64, 18981,   256,    54, 10345,
        1206, 18981, 18981,    78,     9,  1212,  1172,   886, 10345,
       18981,  3412,  1604,    94, 10345,     1,   631,  6312,   420,
          53,  4183,    81,    35,  6322,  1303,    67,    81,  4054,
         255,  2606,   511,  6322,  4887,   256,  2513,   112,   190,
         159,  1128,   268,  3902,  1053,  6312,   420,  1656,  2267,
         678,   853, 10345,   395,   555,   159,    72,   381,  1081,
           2,    67,    73, 10345,    86,  2725,  2771,   520,   167,
         800, 10345,   395,    16,   159,   150,     2,    78,   219,
          78,     9,  1212,  1172,  2491, 10345,     5,   159,    54,
         111,    56,

In [39]:
# d={'text_representations_63_80':[]}
for tokens in X_Text:
    test=tokens[np.newaxis,...]
    layer_outs = functor([test, 1.])
    l=layer_outs[4].flatten().tolist()
    for i,rep in enumerate(l):
        d[i].append(l[i])

In [40]:
pd.DataFrame(d).to_csv('text_rep_lstm.csv',index=False)

In [41]:
X=X.drop(['TEXT'],axis=1)

In [42]:
X_dummies=pd.get_dummies(X)

In [43]:
X_dummies.shape

(3321, 3260)

In [44]:
df_new=pd.concat([X_dummies,pd.DataFrame(d)],axis=1)

In [55]:
df_new.shape

(3321, 3310)

In [56]:
df_new.to_csv('transormed_1.csv',index=False)

In [57]:
Y.to_csv('labels.csv',index=False)

## Loading saved data

In [58]:
import pandas as pd
df_new=pd.read_csv('transormed_1.csv')
Y=pd.read_csv('labels.csv')

In [65]:
df_new.shape

(3321, 3310)

## SVD for decomposition of whole components

In [None]:
import numpy as np
from sklearn.decomposition import TruncatedSVD

# A = np.array([[-1, 2, 0], [2, 0, -2], [0, -2, 1]])
# print("Original Matrix:")
# print(A)

svd =  TruncatedSVD(n_components = 1000)
A_transf = svd.fit_transform(df_new)

# print("Singular values:")
# print(svd.singular_values_)

print("Transformed Matrix after reducing to 2 features:")
print(A_transf.shape)

In [64]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(df_new,Y,test_size=0.2,random_state=0)

In [66]:
x_train.shape

(2656, 3310)

In [67]:
df_new.shape

(3321, 3310)

In [74]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier(min_samples_split=10,min_impurity_split=.3,n_estimators=201,max_leaf_nodes=2500,
                          max_depth=15)
rf.fit(x_train,y_train)

  after removing the cwd from sys.path.
















RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=15, max_features='auto',
                       max_leaf_nodes=2500, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=0.3,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=201,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [75]:
print("Train Accuracy:",rf.score(x_train,y_train))
print("Test Accuracy:",rf.score(x_test,y_test))

Train Accuracy: 0.8475150602409639
Test Accuracy: 0.7593984962406015


In [76]:
rf=RandomForestClassifier(n_estimators=201)
rf.fit(x_train,y_train)

print("Train Accuracy:",rf.score(x_train,y_train))
print("Test Accuracy:",rf.score(x_test,y_test))

  


Train Accuracy: 1.0
Test Accuracy: 0.7518796992481203


In [77]:
import pickle
pickle.dump(rf,open('rf_model_89_76.pickle','wb'))

In [None]:
!pip install xgboost

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
from xgboost import XGBClassifier
xgb=XGBClassifier(n_estimators=500,learning_rate=0.01,max_depth=3,n_jobs=5,subsample=0.8,colsample_bytree=1,gamma=.5)
xgb.fit(x_train,y_train)

In [None]:
print("Train accuracy of xgb:",accuracy_score(xgb.predict(x_train),y_train))
print("Test accuracy of xgb:",accuracy_score(xgb.predict(x_test),y_test))

In [None]:
import pickle
pickle.dump(rf,open('model/rf_text_representation_lstm_50_76_90.pickle','wb'))

In [None]:
rf

## word2vec 

In [None]:
from gensim.models import Word2Vec

In [None]:
# define training data
sentences = [['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'],
			['this', 'is', 'the', 'second', 'sentence'],
			['yet', 'another', 'sentence'],
			['one', 'more', 'sentence'],
			['and', 'the', 'final', 'sentence']]
# train model
model = Word2Vec(l, min_count=1,size=1)
# summarize the loaded model
# print(model)
# summarize vocabulary
words = list(model.wv.vocab)
# print(words)
# access vector for one word
# print(model['sentence'])
# save model
model.save('model.bin')
# load model
new_model = Word2Vec.load('model.bin')
print(new_model)

In [None]:
l

In [None]:
model.wv[l[0]].shape

## Word2Vec pretrained model

In [None]:
from gensim.models import Word2Vec

In [None]:
from nltk.tokenize import word_tokenize
word_tokenize('hi how are you?. I am fine.')

In [None]:
sentence=[['Neeraj','Boy'],['Sarwan','is'],['good','boy']]
model = Word2Vec(sentence, min_count=1,size=300,workers=4)
print(model.similarity('good', 'boy'))

In [None]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [None]:
s='Hello everyone. Welcome to GeeksforGeeks. You are studying NLP article'

In [None]:
word_tokenize(sent_tokenize(df['TEXT'].astype('str').values[0]))

In [None]:
l=[]
for i in range(df.shape[0]):
    l.append(word_tokenize(df['TEXT'].astype('str').values[i]))
    print(i)

## Production

In [53]:
import tensorflow as tf
tf.__version__

'1.14.0'

In [61]:
a=np.array([0 for i in range(len(df_new.columns[:-50]))])
len(a)

3260

In [62]:
pd.DataFrame(a.reshape(1,len(a)),columns=df_new.columns[:-50]).to_csv('df_new.csv',index=False)

In [None]:
df['TEXT'].astype('str')[3]

In [None]:
X_Text[0].shape

## Complete code for production

In [None]:
df.iloc[0]

In [63]:
from tensorflow.python.keras.models import model_from_json
from keras import backend as K
import pickle
from nltk.tokenize import word_tokenize

gene=df['Gene'].values
variation=df['Variation'].values
text=df['TEXT'].astype('str').values
#Text must be transformed to dataframe and then need to obtain its value to get in proper shape as that of training data
text=pd.Series(text).astype('str').values

#loading the tokenizer object from memory
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
# Transforming the text using tokenizer object
text = tokenizer.texts_to_sequences(text)
text = pad_sequences(text, maxlen=MAX_SEQUENCE_LENGTH)
text=text.reshape(text.shape[1],)

#Loading the LSTM model for obtaining better text representation
json_file = open('model_62_80.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
model = model_from_json(loaded_model_json)
# load weights into new model
model.load_weights("model_62_80.h5")
print("Loaded model from disk")
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
#Trying to get the output of lstm layer
inp = model.input                                           # input placeholder
outputs = [layer.output for layer in model.layers]          # all layer outputs
functor = K.function([inp, K.learning_phase()], outputs )   # evaluation function

# Testing
df_test=pd.read_csv('model/df_new.csv')
input_shape=250
# test = np.random.random(input_shape)[np.newaxis,...]
test=text[np.newaxis,...]
layer_outs = functor([test, 1.])
print(layer_outs[4].shape)
text=layer_outs[4]
text=pd.DataFrame(text,columns=df_test.columns[-50:])

#Modifying gene and variation appropriately according to column names in dataframe
gene='Gene_'+gene
variation='Variation_'+variation
df_test[gene]=1
df_test[variation]=1
x=pd.concat([df_test,text],axis=1)
with open('rf_text_representation_lstm_50_76_90.pickle', 'rb') as handle:
    rf = pickle.load(handle)

# with open('model/rf_model.pkl','rb') as handle:
#     rf = pickle.load(handle)

res=rf.predict_proba(x)
print(res)
print("Class predicted is:",np.argmax(res)+1,' with confidence:',np.round(res[0][np.argmax(res)]*100,2),'%')
print("Actual Class:",Y[index])

FileNotFoundError: [Errno 2] No such file or directory: 'model/tokenizer.pickle'

In [None]:
s=0
for 

In [None]:
s

In [None]:
df.head()

In [None]:
res[1][1]

In [None]:
df_new.columns.shape

In [None]:
df_new.columns[:-50]

In [None]:
np.array(df_new.columns)

In [None]:
df_test=pd.DataFrame(a.reshape(1,len(a)),columns=df_new.columns[:-50])
df_test[gene]=1
df_test[variation]=1

In [None]:
with open('model/rf_text_representation_lstm_50_76_90.pickle', 'rb') as handle:
    rf = pickle.load(handle)

In [None]:
rf

In [None]:
Y

In [None]:
for col in df_test.columns:
    if df_test[col][0]==1:
        print(col)

In [None]:
a=np.array([0 for i in range(len(df_new.columns[:-50]))])

In [None]:
## Need to create a data frame where only given gene and variation are 1 others are all zeros