In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
property_dict = {
    "A": ["NonPolar", 'Neutral', 'Hydrophobic', 'NonAromatic', 'NonIonizable'],
    "R": ["Polar", "Positive", 'Hydrophilic', 'NonAromatic', 'Ionizable'],
    "N": ["Polar", 'Neutral', 'Hydrophilic', 'NonAromatic', 'NonIonizable'],
    "D": ["Polar", "Negative", 'Hydrophilic', 'NonAromatic', 'Ionizable'],
    "C": ["Polar", 'Neutral', 'Hydrophilic', 'NonAromatic', 'Ionizable'],
    "Q": ["Polar", 'Neutral', 'Hydrophilic', 'NonAromatic', 'NonIonizable'],
    "E": ["Polar", "Negative", 'Hydrophilic', 'NonAromatic', 'Ionizable'],
    "G": ["NonPolar", 'Neutral', 'Hydrophobic', 'NonAromatic', 'NonIonizable'],
    "H": ["Polar", "Positive", 'Hydrophilic', 'NonAromatic', 'Ionizable'],
    "I": ["NonPolar", 'Neutral', 'Hydrophobic', 'NonAromatic', 'NonIonizable'],
    "L": ["NonPolar", 'Neutral', 'Hydrophobic', 'NonAromatic', 'NonIonizable'],
    "K": ["Polar", "Positive", 'Hydrophilic', 'NonAromatic', 'Ionizable'],
    "M": ["NonPolar", 'Neutral', 'Hydrophobic', 'NonAromatic', 'NonIonizable'],
    "F": ["NonPolar", 'Neutral', 'Hydrophobic', "Aromatic", 'NonIonizable'],
    "P": ["NonPolar", 'Neutral', 'Hydrophobic', 'NonAromatic', 'NonIonizable'],
    "S": ["Polar", 'Neutral', 'Hydrophilic', 'NonAromatic', 'NonIonizable'],
    "T": ["Polar", 'Neutral', 'Hydrophilic', 'NonAromatic', 'NonIonizable'],
    "W": ["NonPolar", 'Neutral', 'Hydrophobic', "Aromatic", 'NonIonizable'],
    "Y": ["Polar", 'Neutral', 'Hydrophobic', "Aromatic", 'Ionizable'],
    "V": ["NonPolar", 'Neutral', 'Hydrophobic', 'NonAromatic', 'NonIonizable']
}

mapping = {'Hydrophobic':0, 'Hydrophilic':1, 'Neutral':0, 'Positive':1, 'Negative':-1, 'Polar':1, 'NonPolar':0, 'Aromatic':1, 'NonAromatic':0, 'Ionizable':1, 'NonIonizable':0}



In [4]:
data_path = "data"

In [5]:
data = pd.read_csv(os.path.join(data_path, "data_struct_features.csv"))

In [6]:
data.head()

Unnamed: 0.1,Unnamed: 0,our_id,metadata,position,amino_acid,data,labels
0,0,0,1AHW_ED C 4,0,t,"[[0.5, 0.5, 0.5, 0.5, 0.5], [0.5, 0.5, 0.5, 0....",0
1,1,0,1AHW_ED C 4,1,n,"[[0.5, 0.5, 0.5, 0.5, 0.5], [0.5, 0.5, 0.5, 0....",0
2,2,0,1AHW_ED C 4,2,t,"[[0.5, 0.5, 0.5, 0.5, 0.5], [0.5, 0.5, 0.5, 0....",0
3,3,0,1AHW_ED C 4,3,v,"[[0.5, 0.5, 0.5, 0.5, 0.5], [1.0, 0.0, 1.0, 0....",0
4,4,0,1AHW_ED C 4,4,a,"[[1.0, 0.0, 1.0, 0.0, 0.0], [1.0, 0.0, 1.0, 0....",0


In [9]:
def get_train_test(df, train_size, test_size, val_size):
    # first select train and test samples
    inds = np.random.permutation(len(np.unique(df["our_id"])))
    train_inds = inds[:int(train_size*len(inds))]
    test_inds = inds[int(train_size*len(inds)): int(train_size*len(inds))+int(test_size*len(inds))]
    val_inds = inds[int(train_size*len(inds))+int(test_size*len(inds)):]
    
    train_data = df[df['our_id'].isin(train_inds)]
    test_data = df[df['our_id'].isin(test_inds)]
    val_data = df[df['our_id'].isin(val_inds)]

    x_train = np.array([d for d in train_data["data"].values])
    y_train = np.array(train_data["labels"])
    x_test = np.array([d for d in test_data["data"].values])
    y_test = np.array(test_data["labels"])
    x_val = np.array([d for d in val_data["data"].values])
    y_val = np.array(val_data["labels"])

    print("number of protein sequences:", len(train_inds), len(test_inds), len(val_inds))
    print("train:", x_train.shape, y_train.shape,"val:", x_val.shape, y_val.shape, "test:", x_test.shape, y_test.shape)
    return x_train, y_train, x_val, y_val, x_test, y_test

In [10]:
train_size = 0.7
test_size = 0.2
val_size = 0.1
x_train, y_train, x_val, y_val, x_test, y_test = get_train_test(data, train_size, test_size, val_size)

number of protein sequences: 181 51 27
train: (45560,) (45560,) val: (5847,) (5847,) test: (10019,) (10019,)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers.convolutional import Conv1D
from tensorflow.keras.layers.convolutional import MaxPooling1D

In [11]:
!ls

README.md         environment.yml   train_model.ipynb
[1m[36mdata[m[m              prepareData.ipynb


In [None]:
window_size = 9
n_features = 5

epochs = 10
batch_size = 16

In [None]:
# START CNN
model = Sequential()
model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(window_size,n_features)))
model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
# model.add(Dropout(0.5))
# model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])

In [None]:
model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size)

In [None]:
# evaluate model
_, accuracy = model.evaluate(x_val, y_val, batch_size=batch_size, verbose=0)
print("accuracy", accuracy)