# Classify Enzymes

In [1]:
# Import libraries
import os
import time
import string
import sklearn
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [12]:
#### Import data ####

##Define filepaths
filepath = r"C:/Users/Swata/OneDrive/Documents/Final_data/"

hydrolases_p = r"C:\Users\Swata\OneDrive\Documents\Final_data\Hydrolases.csv"
isomerases_p = r"C:\Users\Swata\OneDrive\Documents\Final_data\Isomerases.csv"
ligases_p = r"C:\Users\Swata\OneDrive\Documents\Final_data\Ligases.csv"
oxidoreductases_p = r"C:\Users\Swata\OneDrive\Documents\Final_data\Oxidoreductases.csv"
lyases_p = r"C:\Users\Swata\OneDrive\Documents\Final_data\Lyases.csv"
translocases_p = r"C:\Users\Swata\OneDrive\Documents\Final_data\Translocases.csv"
transferases_p = r"C:\Users\Swata\OneDrive\Documents\Final_data\Transferases.csv"

## Load data
oxidoreductases = pd.read_csv(oxidoreductases_p).drop('Unnamed: 0',axis=1)
transferases = pd.read_csv(transferases_p).drop('Unnamed: 0',axis=1)
hydrolases = pd.read_csv(hydrolases_p).drop('Unnamed: 0',axis=1)
lyases = pd.read_csv(lyases_p).drop('Unnamed: 0',axis=1)
isomerases = pd.read_csv(isomerases_p).drop('Unnamed: 0',axis=1)
ligases = pd.read_csv(ligases_p).drop('Unnamed: 0',axis=1)
translocases = pd.read_csv(translocases_p).drop('Unnamed: 0',axis=1)

In [13]:
# Add labels:
oxidoreductases["label"] = np.ones_like(oxidoreductases["mol_w"])
transferases["label"] = np.ones_like(transferases["mol_w"]) * 2
hydrolases["label"] = np.ones_like(hydrolases["mol_w"]) * 3
lyases["label"] = np.ones_like(lyases["mol_w"]) * 4
isomerases["label"] = np.ones_like(isomerases["mol_w"]) * 5 
ligases["label"] = np.ones_like(ligases["mol_w"]) * 6 
translocases["label"] = np.ones_like(translocases["mol_w"]) * 7

# Concat all the data
data = pd.concat([oxidoreductases,transferases,hydrolases,lyases,isomerases,ligases,translocases],axis=0)

data.describe(),data.shape

(              mol_w        Residue      avg_res_w         charge  \
 count  1.623250e+05  162325.000000  162325.000000  162325.000000   
 mean   4.857364e+04     437.743447     110.754803      -0.463271   
 std    4.435811e+04     397.336859       3.260236      12.938412   
 min    9.599000e+02      11.000000      87.264000    -744.500000   
 25%    2.805659e+04     255.000000     108.560000      -6.000000   
 50%    3.930546e+04     357.000000     110.871000      -0.500000   
 75%    5.535015e+04     499.000000     113.022000       5.000000   
 max    3.906488e+06   35213.000000     130.128000     314.000000   
 
          Iso_e_point  Mol_ext_coef  Mol_ext_coef_cys    ext_coef_mg  \
 count  162325.000000  1.623250e+05      1.623250e+05  162325.000000   
 mean        6.646936  4.757675e+04      4.791311e+04       0.946623   
 std         1.671280  5.339003e+04      5.380054e+04       0.495873   
 min         2.563800  0.000000e+00      0.000000e+00       0.000000   
 25%         5.25

In [None]:
# Export data 


In [17]:
data_OHE = pd.get_dummies(data,columns=['label'])
data_OHE,

(          mol_w  Residue  avg_res_w  charge  Iso_e_point  Mol_ext_coef  \
 0     118582.13   1063.0    111.554    49.0       9.7468      102220.0   
 1      25239.64    217.0    116.312     4.5       8.4795       40450.0   
 2     120753.42   1081.0    111.705    58.0       9.9758       99240.0   
 3      24229.78    212.0    114.291     2.5       7.4209       40450.0   
 4      42669.79    387.0    110.258   -11.0       4.7574       39420.0   
 ...         ...      ...        ...     ...          ...           ...   
 7290  173671.31   1543.0    112.554    33.0       8.8470      220590.0   
 7291  163735.24   1501.0    109.084    14.0       7.1717      213140.0   
 7292   76063.42    655.0    116.127    -9.0       5.6740       45380.0   
 7293   76145.73    655.0    116.253    -9.0       5.6723       39880.0   
 7294  171791.71   1531.0    112.209    14.0       7.7038      238010.0   
 
       Mol_ext_coef_cys  ext_coef_mg  ext_coef_mg_cys  \
 0             103220.0        0.862     

In [62]:
data_numpy = data_OHE.to_numpy()
x = data_numpy[:,0:106]
y = data_numpy[:,106:]
x.shape, y.shape

((162325, 106), (162325, 7))

In [65]:
x_norm = np.zeros_like(x)
for i in range(x.shape[1]):
    tmp_arr = x[:,i]
    mean = np.mean(tmp_arr)
    tmp_arr2 = (tmp_arr - mean) / 162325
    x_norm[:,i] = tmp_arr2
x_norm, x_norm.shape

(array([[ 4.31285966e-01,  3.85188081e-03,  4.92343998e-06, ...,
          3.24764561e-05,  5.05568899e-04,  3.09407051e-06],
        [-1.43748625e-01, -1.35988570e-03,  3.42350063e-05, ...,
          1.42475942e-05, -1.65923477e-04,  9.25581372e-07],
        [ 4.44662155e-01,  3.96276946e-03,  5.85367253e-06, ...,
          3.45525380e-05,  4.99408419e-04,  1.20896348e-06],
        ...,
        [ 1.69350281e-01,  1.33840476e-03,  3.30953174e-05, ...,
          3.81132958e-05,  4.62445536e-04,  4.56075466e-05],
        [ 1.69857350e-01,  1.33840476e-03,  3.38715379e-05, ...,
          3.71707423e-05,  4.56285055e-04,  4.46649930e-05],
        [ 7.59082547e-01,  6.73498570e-03,  8.95855472e-06, ...,
         -1.06530680e-05,  6.10297068e-04, -1.41614354e-05]]),
 (162325, 106))

In [75]:
X_train, X_test, y_train, y_test = train_test_split(x_norm, y, test_size=0.30)
# X_train = tf.convert_to_tensor(X_train)
# X_test =  tf.convert_to_tensor(X_test)
# y_train = tf.convert_to_tensor(y_train)
# y_test = tf.convert_to_tensor(y_test)

In [70]:
# data_numpy = data.to_numpy()
# data = data_numpy
# Convert to numpy
# for i in range(data.shape[0]):
#     if data[i,106] == 1:
#         data[i,106] = np.array([1,0,0,0,0,0,0])
#     elif data[i,106] == 2:
#         data[i,106] = np.array([0,1,0,0,0,0,0])
#     elif data[i,106] == 3:
#         data[i,106] = np.array([0,0,1,0,0,0,0])
#     elif data[i,106] == 4:
#         data[i,106] = np.array([0,0,0,1,0,0,0])
#     elif data[i,106] == 5:
#         data[i,106] = np.array([0,0,0,0,1,0,0])
#     elif data[i,106] == 6:
#         data[i,106] = np.array([0,0,0,0,0,1,0])
#     elif data[i,106] == 7:
#         data[i,106] = np.array([0,0,0,0,0,0,1])



In [77]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(106),
    tf.keras.layers.Dense(200, activation='relu'),
    tf.keras.layers.Dense(200, activation='relu'),
    tf.keras.layers.Dense(7,activation='softmax')
])
model.compile(optimizer='adam',
              loss=tf.keras.losses.CategoricalCrossentropy(),
              metrics=['accuracy'])

In [79]:
model.fit(X_train,y_train,epochs=15)

Epoch 1/15
Epoch 2/15

In [None]:
test_loss, test_acc = model.evaluate(X_test,  y_test, verbose=2)

print('\nTest accuracy:', test_acc)

554/554 - 3s - loss: 0.6279 - accuracy: 0.6467 - 3s/epoch - 5ms/step

Test accuracy: 0.6466572880744934
