In [2]:
import torch
import torch.nn as nn
import os
import torch.optim as optim

In [3]:
class Protein_Encoder(nn.Module):
    def __init__(self, input_feature_size, hidden_layers_size, num_hidden_layers, embedding_size):
        super(Protein_Encoder, self).__init__()
        self.input_feature_size = input_feature_size
        self.hidden_layers_size = hidden_layers_size
        self.num_hidden_layers = num_hidden_layers
        self.embedding_size = embedding_size

        # input layer
        self.linear_start = nn.Linear(self.input_feature_size, self.hidden_layers_size)
        self.relu_start = nn.ReLU()
        # hidden layers and relu activation
        self.hidden_layers = nn.ModuleList([nn.Linear(self.hidden_layers_size, self.hidden_layers_size) for i in range(self.num_hidden_layers)])
        self.hidden_layers_activation = nn.ModuleList([nn.ReLU() for i in range(self.num_hidden_layers)])
        # output layer
        self.linear_end = nn.Linear(self.hidden_layers_size, self.embedding_size)

    def forward(self, x):
        x = self.linear_start(x)
        x = self.relu_start(x)
        for i in range(self.num_hidden_layers):
            x = self.hidden_layers[i](x)
            x = self.hidden_layers_activation[i](x)
        x = self.linear_end(x)
        return x
    

# PPI Predictor takes two protein embeddings and predicts the probability of interaction
class PPI_Predictor(nn.Module):
    def __init__(self, embedding_size, hidden_layers_size, num_hidden_layers):
        super(PPI_Predictor, self).__init__()
        self.embedding_size = embedding_size
        self.hidden_layers_size = hidden_layers_size
        self.num_hidden_layers = num_hidden_layers

        # input layer
        self.linear_start = nn.Linear(2*self.embedding_size, self.hidden_layers_size)
        self.relu_start = nn.ReLU()
        # hidden layers and relu activation
        self.hidden_layers = nn.ModuleList([nn.Linear(self.hidden_layers_size, self.hidden_layers_size) for i in range(self.num_hidden_layers)])
        self.hidden_layers_activation = nn.ModuleList([nn.ReLU() for i in range(self.num_hidden_layers)])
        # output layer
        self.linear_end = nn.Linear(self.hidden_layers_size, 1)
        self.sigmoid_end = nn.Sigmoid()

    def forward(self, x):
        x = self.linear_start(x)
        x = self.relu_start(x)
        for i in range(self.num_hidden_layers):
            x = self.hidden_layers[i](x)
            x = self.hidden_layers_activation[i](x)
        x = self.linear_end(x)
        x = self.sigmoid_end(x)
        return x
    

# PPI Model combines the Protein Encoder and PPI Predictor
class PPI_Model(nn.Module):
    def __init__(self, input_feature_size, hidden_layers_size, num_hidden_layers, embedding_size):
        super(PPI_Model, self).__init__()
        self.protein_encoder = Protein_Encoder(input_feature_size, hidden_layers_size, num_hidden_layers, embedding_size)
        self.ppi_predictor = PPI_Predictor(embedding_size, hidden_layers_size, num_hidden_layers)

    def forward(self, x):
        x1 = x[0]
        x2 = x[1]
        x1 = self.protein_encoder(x1)
        x2 = self.protein_encoder(x2)
        x = torch.cat((x1, x2), 1)
        x = self.ppi_predictor(x)
        return x
    
    def save_model(self, path):
        torch.save(self.state_dict(), path)
    
    def load_model(self, path):
        self.load_state_dict(torch.load(path))

In [4]:
import  json

# Data Loader
with open("../protein_properties/protein_props.json", "r") as f:
    protein_props = json.load(f)

In [5]:
druggable_data = "../drugbank/druggable_proteins.txt"
with open(druggable_data) as f:
  druggable_proteins = [line.strip() for line in f]
print(len(druggable_proteins))

approved_drugs = "../drugbank/approved_druggable_proteins.txt"
with open(approved_drugs) as g:
  approved_proteins = [line.strip() for line in g]
print(len(approved_proteins))

3345
2652


In [6]:
protein_props_data = {}
for protein in protein_props:
    feature_vector = []
    feature_vector.append(protein_props[protein]["Molecular Weight"])
    feature_vector.append(protein_props[protein]["Sequence Length"])
    feature_vector.append(protein_props[protein]["GRAVY"])
    for k, v in protein_props[protein]["Amino Acid Percent"].items():
        feature_vector.append(v)
    feature_vector.append(protein_props[protein]["Molar Extinction Coefficient"][0])
    feature_vector.append(protein_props[protein]["Molar Extinction Coefficient"][1])
    feature_vector.append(protein_props[protein]["Isoelectric Point"])
    feature_vector.append(protein_props[protein]["Aromaticity"])
    feature_vector.append(protein_props[protein]["Instability Index"])
    for i in protein_props[protein]["Secondary Structure"]:
        feature_vector.append(i)
    protein_props_data[protein] = feature_vector

len(protein_props_data), len(protein_props_data["P05067"])

(20434, 31)

In [7]:
# PPI data
interaction_data = {}
for protein in protein_props_data:
    with open(f"../PPIs/PPI_indiv/{protein}.json", "r") as f:
        interactions = json.load(f)
    for interaction in interactions[protein]:
        protein1 = interaction["entry1"]
        protein2 = interaction["entry2_id"]
        if (protein1, protein2) in interaction_data:
            assert(interaction_data[(protein1, protein2)] == interaction["interaction_type"])
        elif (protein2, protein1) in interaction_data:
            assert(interaction_data[(protein2, protein1)] == interaction["interaction_type"])
        else:
            interaction_data[(protein1, protein2)] = interaction["interaction_type"]
        
len(interaction_data)

90606

In [8]:
# number of xeno interactions and binary interactions
xeno_interactions = 0
binary_interactions = 0
for interaction in interaction_data:
    if interaction_data[interaction] == "xeno":
        xeno_interactions += 1
    elif interaction_data[interaction] == "binary":
        binary_interactions += 1

xeno_interactions, binary_interactions

(3405, 87201)

In [9]:
# Data cleaning
# if any of the proteins in not in protein_props_data, remove the interaction
final_interaction_data = {}
for interaction in interaction_data:
    if interaction[0] in protein_props_data and interaction[1] in protein_props_data:
        final_interaction_data[interaction] = interaction_data[interaction]

interaction_data = final_interaction_data.copy()
len(interaction_data)

60614

In [10]:
# Dataset is PCFs of protein pairs, target is interaction type (3 classes -- 3 neurons in output layer) -- binary, xeno, none
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np

class PPI_Dataset(Dataset):
    def __init__(self, protein_props_data, interaction_data, druggable_proteins, approved_proteins):
        self.protein_props_data = protein_props_data
        self.interaction_data = interaction_data
        self.druggable_proteins = druggable_proteins
        self.proteins = list(protein_props_data.keys())
        self.non_druggable_proteins = [protein for protein in self.proteins if protein not in druggable_proteins]

        self.protein_pairs = list(self.interaction_data.keys())
        self.targets = [self.interaction_data[protein_pair] for protein_pair in self.protein_pairs]
        self.target_dict = {"binary": 0, "xeno": 1, "none": 2}
    
    def __len__(self):
        return len(self.protein_pairs)
    
    def __getitem__(self, idx):
        protein1, protein2 = self.protein_pairs[idx]
        protein1_data = self.protein_props_data[protein1]
        protein2_data = self.protein_props_data[protein2]
        target = self.target_dict[self.targets[idx]]
        # tensorize
        protein1_data = torch.tensor(protein1_data)
        protein2_data = torch.tensor(protein2_data)
        target = torch.tensor(target)
        return (protein1_data, protein2_data), target

In [11]:
# Train, Test split stratify and shuffle
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import numpy as np

protein_pairs = list(interaction_data.keys())
targets = [interaction_data[protein_pair] for protein_pair in protein_pairs]
protein_pairs, targets = shuffle(protein_pairs, targets, random_state=42)

train_protein_pairs, test_protein_pairs, train_targets, test_targets = train_test_split(protein_pairs, targets, test_size=0.2, stratify=targets, random_state=42)

len(train_protein_pairs), len(test_protein_pairs)

(48491, 12123)

In [12]:
train_data = PPI_Dataset(protein_props_data, dict(zip(train_protein_pairs, train_targets)), druggable_proteins, approved_proteins)
test_data = PPI_Dataset(protein_props_data, dict(zip(test_protein_pairs, test_targets)), druggable_proteins, approved_proteins)

train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

In [13]:
# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

model = PPI_Model(31, 80, 3, 20)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 2

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for i, data in enumerate(train_loader):
        inputs, labels = data
        inputs = [inp.to(device) for inp in inputs]
        labels = labels.to(device)
        optimizer.zero_grad()
        exit()
        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), labels.float())
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {running_loss}")
    # validation loss
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for i, data in enumerate(test_loader):
            inputs, labels = data
            inputs = [inp.to(device) for inp in inputs]
            labels = labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), labels.float())
            val_loss += loss.item()
    print(f"Validation Loss: {val_loss}")

model.save_model("PPI_model.pth")



cpu
Epoch 1, Loss: 0.0
Validation Loss: 0.0
Epoch 2, Loss: 0.0
Validation Loss: 0.0


In [82]:

model.eval()

with torch.no_grad():
    output = model([torch.tensor(protein_props_data[random_protein_pair[0]]).to(device), torch.tensor(protein_props_data[random_protein_pair[1]]).to(device)])
    print(output)
    print(torch.argmax(output))

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)