In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import pickle
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
import plotly.express as px

In [3]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/molecular-biology/promoter-gene-sequences/promoters.data'
names = ['Class', 'id', 'Sequence']
df = pd.read_csv(url, names = names)

In [4]:
df.head(7)

Unnamed: 0,Class,id,Sequence
0,+,S10,\t\ttactagcaatacgcttgcgttcggtggttaagtatgtataat...
1,+,AMPC,\t\ttgctatcctgacagttgtcacgctgattggtgtcgttacaat...
2,+,AROH,\t\tgtactagagaactagtgcattagcttatttttttgttatcat...
3,+,DEOP2,\taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaa...
4,+,LEU1_TRNA,\ttcgataattaactattgacgaaaagctgaaaaccactagaatgc...
5,+,MALEFG,\taggggcaaggaggatggaaagaggttgccgtataaagaaactag...
6,+,MALK,\t\tcagggggtggaggatttaagccatctcctgatgacgcatagt...


In [5]:
df.shape

(106, 3)

In [6]:
df["Sequence"][0]

'\t\ttactagcaatacgcttgcgttcggtggttaagtatgtataatgcgcgggcttgtcgt'

In [7]:
df.columns

Index(['Class', 'id', 'Sequence'], dtype='object')

In [8]:
df.dtypes

Class       object
id          object
Sequence    object
dtype: object

In [9]:
classes = df.loc[:,  'Class']
classes.tail()

101    -
102    -
103    -
104    -
105    -
Name: Class, dtype: object

In [10]:
classes.value_counts()

Class
+    53
-    53
Name: count, dtype: int64

In [11]:
# Generate list of DNA sequence
sequence = list(df.loc[:, "Sequence"])
sequence[8:13]

['\t\tcgacttaatatactgcgacaggacgtccgttctgtgtaaatcgcaatgaaatggttt',
 '\tttttaaatttcctcttgtcaggccggaataactccctataatgcgccaccactgaca',
 '\tgcaaaaataaatgcttgactctgtagcgggaaggcgtattatgcacaccccgcgccg',
 '\tcctgaaattcagggttgactctgaaagaggaaagcgtaatatacgccacctcgcgac',
 '\tgatcaaaaaaatacttgtgcaaaaaattgggatccctataatgcgcctccgttgaga']

In [12]:
# Remove tab from each sequence and creating a dictionary of data
dic = {}
for k, seq in enumerate(sequence):
  nucleotides = list(seq)
  #print(nucleotides)
  nucleotides = [char for char in nucleotides if char != '\t' ]
  #print(nucleotides)
  nucleotides.append(classes[k])
  print(nucleotides)
  dic[k] = nucleotides

dic[1]

['t', 'a', 'c', 't', 'a', 'g', 'c', 'a', 'a', 't', 'a', 'c', 'g', 'c', 't', 't', 'g', 'c', 'g', 't', 't', 'c', 'g', 'g', 't', 'g', 'g', 't', 't', 'a', 'a', 'g', 't', 'a', 't', 'g', 't', 'a', 't', 'a', 'a', 't', 'g', 'c', 'g', 'c', 'g', 'g', 'g', 'c', 't', 't', 'g', 't', 'c', 'g', 't', '+']
['t', 'g', 'c', 't', 'a', 't', 'c', 'c', 't', 'g', 'a', 'c', 'a', 'g', 't', 't', 'g', 't', 'c', 'a', 'c', 'g', 'c', 't', 'g', 'a', 't', 't', 'g', 'g', 't', 'g', 't', 'c', 'g', 't', 't', 'a', 'c', 'a', 'a', 't', 'c', 't', 'a', 'a', 'c', 'g', 'c', 'a', 't', 'c', 'g', 'c', 'c', 'a', 'a', '+']
['g', 't', 'a', 'c', 't', 'a', 'g', 'a', 'g', 'a', 'a', 'c', 't', 'a', 'g', 't', 'g', 'c', 'a', 't', 't', 'a', 'g', 'c', 't', 't', 'a', 't', 't', 't', 't', 't', 't', 't', 'g', 't', 't', 'a', 't', 'c', 'a', 't', 'g', 'c', 't', 'a', 'a', 'c', 'c', 'a', 'c', 'c', 'c', 'g', 'g', 'c', 'g', '+']
['a', 'a', 't', 't', 'g', 't', 'g', 'a', 't', 'g', 't', 'g', 't', 'a', 't', 'c', 'g', 'a', 'a', 'g', 't', 'g', 't', 'g', 't', '

['t',
 'g',
 'c',
 't',
 'a',
 't',
 'c',
 'c',
 't',
 'g',
 'a',
 'c',
 'a',
 'g',
 't',
 't',
 'g',
 't',
 'c',
 'a',
 'c',
 'g',
 'c',
 't',
 'g',
 'a',
 't',
 't',
 'g',
 'g',
 't',
 'g',
 't',
 'c',
 'g',
 't',
 't',
 'a',
 'c',
 'a',
 'a',
 't',
 'c',
 't',
 'a',
 'a',
 'c',
 'g',
 'c',
 'a',
 't',
 'c',
 'g',
 'c',
 'c',
 'a',
 'a',
 '+']

In [13]:
# Convert dictionary object into dataframe
df_dna = pd.DataFrame(dic)
df_dna.head(8)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,96,97,98,99,100,101,102,103,104,105
0,t,t,g,a,t,a,c,t,c,t,...,c,c,t,a,g,c,g,c,c,t
1,a,g,t,a,c,g,a,t,g,t,...,c,g,a,g,a,c,t,g,t,a
2,c,c,a,t,g,g,g,t,a,t,...,g,c,t,a,g,t,a,c,c,a
3,t,t,c,t,a,g,g,c,c,t,...,a,t,g,g,a,c,t,g,g,c
4,a,a,t,g,t,g,g,t,t,a,...,g,a,a,g,g,a,t,a,t,a
5,g,t,a,t,a,c,g,a,t,a,...,t,g,c,g,c,a,c,c,c,t
6,c,c,g,g,a,a,g,c,a,a,...,a,g,c,t,a,t,t,t,c,t
7,a,c,a,a,t,a,t,a,a,t,...,g,a,g,g,t,g,c,a,t,a


In [14]:
df_dna[0]

0     t
1     a
2     c
3     t
4     a
5     g
6     c
7     a
8     a
9     t
10    a
11    c
12    g
13    c
14    t
15    t
16    g
17    c
18    g
19    t
20    t
21    c
22    g
23    g
24    t
25    g
26    g
27    t
28    t
29    a
30    a
31    g
32    t
33    a
34    t
35    g
36    t
37    a
38    t
39    a
40    a
41    t
42    g
43    c
44    g
45    c
46    g
47    g
48    g
49    c
50    t
51    t
52    g
53    t
54    c
55    g
56    t
57    +
Name: 0, dtype: object

In [15]:
# Transpose dataframe into correct format
df_dna = df_dna.transpose()
df_dna.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
101,c,c,t,c,a,a,t,g,g,c,...,g,a,a,c,t,a,t,a,t,-
102,g,t,a,t,t,c,t,c,a,a,...,t,c,a,a,c,a,t,t,g,-
103,c,g,c,g,a,c,t,a,c,g,...,a,a,g,g,c,t,t,c,c,-
104,c,t,c,g,t,c,c,t,c,a,...,a,g,g,a,g,g,a,a,c,-
105,t,a,a,c,a,t,t,a,a,t,...,t,c,a,a,g,a,a,c,t,-


In [16]:
# Rename the 57th column as Class
df_dna.rename(columns = {57: 'Class'}, inplace = True)

In [17]:
df_dna.head(7)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,Class
0,t,a,c,t,a,g,c,a,a,t,...,g,c,t,t,g,t,c,g,t,+
1,t,g,c,t,a,t,c,c,t,g,...,c,a,t,c,g,c,c,a,a,+
2,g,t,a,c,t,a,g,a,g,a,...,c,a,c,c,c,g,g,c,g,+
3,a,a,t,t,g,t,g,a,t,g,...,a,a,c,a,a,a,c,t,c,+
4,t,c,g,a,t,a,a,t,t,a,...,c,c,g,t,g,g,t,a,g,+
5,a,g,g,g,g,c,a,a,g,g,...,c,g,t,t,t,a,g,g,t,+
6,c,a,g,g,g,g,g,t,g,g,...,a,t,c,a,t,g,a,a,t,+


In [18]:
temp = df_dna.copy(deep=True)
temp = temp.drop(['Class'], axis = 1)
temp.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,47,48,49,50,51,52,53,54,55,56
101,c,c,t,c,a,a,t,g,g,c,...,g,g,a,a,c,t,a,t,a,t
102,g,t,a,t,t,c,t,c,a,a,...,t,t,c,a,a,c,a,t,t,g
103,c,g,c,g,a,c,t,a,c,g,...,c,a,a,g,g,c,t,t,c,c
104,c,t,c,g,t,c,c,t,c,a,...,a,a,g,g,a,g,g,a,a,c
105,t,a,a,c,a,t,t,a,a,t,...,a,t,c,a,a,g,a,a,c,t


In [19]:
# One hot encoding the data
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(temp)
print(enc.categories_)
df_dna_1 = enc.transform(temp).toarray()
del temp

[array(['a', 'c', 'g', 't'], dtype=object), array(['a', 'c', 'g', 't'], dtype=object), array(['a', 'c', 'g', 't'], dtype=object), array(['a', 'c', 'g', 't'], dtype=object), array(['a', 'c', 'g', 't'], dtype=object), array(['a', 'c', 'g', 't'], dtype=object), array(['a', 'c', 'g', 't'], dtype=object), array(['a', 'c', 'g', 't'], dtype=object), array(['a', 'c', 'g', 't'], dtype=object), array(['a', 'c', 'g', 't'], dtype=object), array(['a', 'c', 'g', 't'], dtype=object), array(['a', 'c', 'g', 't'], dtype=object), array(['a', 'c', 'g', 't'], dtype=object), array(['a', 'c', 'g', 't'], dtype=object), array(['a', 'c', 'g', 't'], dtype=object), array(['a', 'c', 'g', 't'], dtype=object), array(['a', 'c', 'g', 't'], dtype=object), array(['a', 'c', 'g', 't'], dtype=object), array(['a', 'c', 'g', 't'], dtype=object), array(['a', 'c', 'g', 't'], dtype=object), array(['a', 'c', 'g', 't'], dtype=object), array(['a', 'c', 'g', 't'], dtype=object), array(['a', 'c', 'g', 't'], dtype=object), array(['a'

In [20]:
# Saving the one hot encoder
with open("/content/drive/MyDrive/DNA_E_Coli/ecoli_encoder.pickle", "wb") as file:
  pickle.dump(enc, file)

In [21]:
df_new = pd.DataFrame(df_dna_1)
df_new.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,218,219,220,221,222,223,224,225,226,227
0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [22]:
df_dna["Class"] = df_dna["Class"].replace(to_replace =["+"], value =1)
df_dna["Class"] = df_dna["Class"].replace(to_replace =["-"], value =0)
df_new["Classes"] = df_dna['Class']
df_new.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,219,220,221,222,223,224,225,226,227,Classes
0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1
4,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1


In [23]:
# Training and testing the classification algorithm

In [24]:
y = df_new['Classes'].values
X = df_new.drop(['Classes'], axis=1)

seed = 1

# Splitting data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

In [25]:
SNet = MLPClassifier(hidden_layer_sizes=(150, 100, 50), max_iter=300,  activation='relu', solver='adam', random_state=1)
SNet.fit(X_train, y_train)

In [26]:
print(SNet.score(X_train, y_train))

1.0


In [27]:
y_pred = SNet.predict(X_test)

In [28]:
print(SNet.score(X_test,y_test))

0.9259259259259259


In [29]:
# Model evaluation
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.88      0.94        17
           1       0.83      1.00      0.91        10

    accuracy                           0.93        27
   macro avg       0.92      0.94      0.92        27
weighted avg       0.94      0.93      0.93        27



In [30]:
# Importing confusion matrix
# Importing the predictions against the actual observations in y_val
cm = confusion_matrix(y_pred, y_test)
fig = px.imshow(cm, text_auto=True)
fig.show()

In [31]:
# Plotting graph for mlpclassifier
loss_values = SNet.loss_curve_
fig = px.line(loss_values )
fig.show()

In [32]:
with open('/content/drive/MyDrive/DNA_E_Coli/ecoli_net.pickle', 'wb') as file:
    pickle.dump(SNet, file)

In [33]:
genome = "ttactagcaatacgcttgcgttcggtggttaagtatgtataatgcgcgggcttgtcg"
# genome_false = "ttaacattaataaataaggaggctctaatggcactcattagccaatcaatcaagaac"
genome_list = list(genome)
print(genome_list)
df_test = pd.DataFrame(genome_list)
df_test = df_test.transpose()

['t', 't', 'a', 'c', 't', 'a', 'g', 'c', 'a', 'a', 't', 'a', 'c', 'g', 'c', 't', 't', 'g', 'c', 'g', 't', 't', 'c', 'g', 'g', 't', 'g', 'g', 't', 't', 'a', 'a', 'g', 't', 'a', 't', 'g', 't', 'a', 't', 'a', 'a', 't', 'g', 'c', 'g', 'c', 'g', 'g', 'g', 'c', 't', 't', 'g', 't', 'c', 'g']


In [34]:
with open('/content/drive/MyDrive/DNA_E_Coli/ecoli_encoder.pickle', 'rb') as file:
    encoder = pickle.load(file)

In [35]:
data_test = encoder.transform(df_test).toarray()
print(SNet.predict(data_test))

[1]
