In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from itertools import repeat
df = pd.read_csv("CombinedSequences_ppr.csv")

In [2]:
seq = df['Sequence'].to_numpy()

def kmerise(line,k):  # String and k are inputs
    kmerised_array = []; # Final kmerised list
    for i in line:
        temp = []
        for j in range(0,len(i),k): 
            temp.append(i[j:j+k]) #append substring in each iter 
        kmerised_array.append(temp)
    
    return kmerised_array

In [3]:
df['AccessionID'] = pd.factorize(df['AccessionID'])[0]
df.head()

Unnamed: 0,AccessionID,Sequence
0,0,ATGAGAGTTCAAAGACCACCCACTCTCTTGTTAGTGTTCTCACTCT...
1,0,ATGATACACTCAGTGTTTCTACTGATGTTCTTGTTAACACCTACAG...
2,0,TATGCCTAACATGTGTAGGATTTTCGCGTCTCTGATTTTGGCACGC...
3,0,TATGCCTAACATGTGTAGGATTTTTGCATCTCTGATTTTGGCACGC...
4,0,TATGCCTAACATGTGTAGGATTTTCGCGTCTCTGATTTTGGCACGC...


In [4]:
df.tail()

Unnamed: 0,AccessionID,Sequence
2519,2,GACTAAAGATAAAAATTATATACGTATAATTTTTGTCTCTCTAGCT...
2520,2,GACAAAGGTGAAAATTAATATATATATATTAATTTTACTCCTCCTC...
2521,2,GTCATTTGGTAAAATATAATATATATTTTATCACTCTAGCTTCGCT...
2522,2,GACAAAGCTCAAAAATATATATATATATTTTTGTTGCTCCTAGCTT...
2523,2,GACTAAAGATAAAAATTATAGCATTAGTCTATAATTTTATCTCCCT...


In [5]:
k = 15
kmers_list =  kmerise(seq,k)

In [6]:
baseDict = {'A': 1, 'C': 2, 'G': 3, 'T': 4, 'R': 5, 'Y': 6, 'S': 7, 'W': 8, 'K': 9, 'M': 10, 'B': 11, 'D': 12, 'H': 13, 'V': 14,'N': 15}

In [7]:
def vectorizedList(line):  # String and k are inputs
    vectorized_array = []; # Final vectorised list
    for i in line:
        temp = ""
        for j in range(len(i)):
            temp += str(baseDict[i[j]])
        vectorized_array.append(int(temp))
    
    return vectorized_array

In [8]:
Vec = []
len_vec = []
for i in range(len(kmers_list)):
    Vec.append(vectorizedList(kmers_list[i]))
    len_vec.append(len(kmers_list[i]))
#Vec

In [9]:
f = [[float(i) for i in j] for j in Vec]
Vec = [[i / sum(j) for i in j] for j in f]

In [10]:
def padding(seq, len_vec):
    anotherseq = seq.copy()
    max_kmerlen = max(len_vec)
    for i in anotherseq:
        diff = max_kmerlen - len(i)
        i.extend(repeat(0, diff))
    
    return anotherseq    

In [11]:
pad = padding(Vec, len_vec)

In [12]:
import keras
from keras.layers import Dense, Embedding, Bidirectional, LSTM
from keras.layers import Input, Dropout, Activation
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split

In [13]:
X = np.asarray(pad)#.astype(np.float32)
y = df['AccessionID'].to_numpy()#.astype(np.float32)

In [14]:
X.shape

(2524, 2118)

In [54]:
x_train, x_test, y_train, Y_test = train_test_split(X, y, test_size=0.33)

In [16]:
x_train = x_train.astype('float32') 
x_test = x_test.astype('float32') 

In [18]:
y_train = keras.utils.to_categorical(y_train) 
y_test = keras.utils.to_categorical(y_test) 

In [19]:
print(x_train.shape)
print(x_test.shape)

(1691, 2118)
(833, 2118)


In [20]:
print(y_train.shape)
print(y_test.shape)

(1691, 3)
(833, 3)


In [21]:
from keras.utils.np_utils import to_categorical
from keras.models import Model ,Sequential
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold

In [47]:
# Input for variable-length sequences of integers
model = Sequential()

# Embed each integer in a 256-dimensional vector
model.add(Embedding(x_train.shape[1], 128))

# Add 2 bidirectional LSTMs
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Bidirectional(LSTM(16)))

# Add a classifier
model.add(Dense(3, activation="sigmoid"))

In [48]:
model.compile(optimizer='adam', loss = "mean_squared_error", metrics = ['accuracy']) 

In [49]:
model.fit(x_train, y_train, epochs = 8)
model.summary()

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, None, 128)         271104    
_________________________________________________________________
bidirectional_20 (Bidirectio (None, None, 128)         98816     
_________________________________________________________________
bidirectional_21 (Bidirectio (None, 64)                41216     
_________________________________________________________________
dense_8 (Dense)              (None, 3)                 195       
Total params: 411,331
Trainable params: 411,331
Non-trainable params: 0
_________________________________________________________________


In [51]:
score = model.evaluate(x_test, y_test, verbose=0) 
print('loss=', score[0])
print('accuracy=', score[1])

loss= 0.14713913202285767
accuracy= 0.6614645719528198


In [52]:
y_pred = model.predict(x_test)
Y_pred_classes = np.argmax(y_pred,axis=1)
y_act = np.argmax(y_test,axis=1)

In [55]:
conf_matrix = np.zeros((3,3))
for i in range(len(y_act)):
    conf_matrix[Y_pred_classes[i]][y_act[i]] += 1
conf_matrix

array([[  0.,   0.,   0.],
       [272., 551.,  10.],
       [  0.,   0.,   0.]])

In [1]:
precision = precision_score(Y_test, Y_pred_classes, average='micro')
recall = recall_score(Y_test, Y_pred_classes, average='micro')
f1score = f1_score(Y_test, Y_pred_classes, average='micro')
print("Precision: ", precision)
print("Recall: ", recall)
print("F1-score: ", f1score)

NameError: name 'precision_score' is not defined