In [3]:
import json

data_file_path = "../PCFs/files_for_ml/protein_props/protein_props.json"

# Read the whole JSON document and parse it into `protein_data`.
with open(data_file_path, 'r') as handle:
    protein_data = json.loads(handle.read())

# The number of top-level entries is reported as the protein count.
print("Total number of uniprot human verified proteins:", len(protein_data))


Total number of uniprot human verified proteins: 20434


In [4]:
# Length of every stored sequence, in the iteration order of protein_data.
seq_lengths = [len(entry['Sequence']) for entry in protein_data.values()]

# The defaults are the sentinel values that result when there are no proteins.
max_seq_length = max(seq_lengths, default=-1)
min_seq_length = min(seq_lengths, default=1e10)

print("Maximum Sequence Length:", max_seq_length)
print("Minimum Sequence Length:", min_seq_length)

Maximum Sequence Length: 34350
Minimum Sequence Length: 2


In [5]:
# Gather every distinct character that occurs in any stored sequence.
unique_amino_acids = {
  residue
  for entry in protein_data.values()
  for residue in entry['Sequence']
}

print("Number of Unique Amino Acids:", len(unique_amino_acids))

Number of Unique Amino Acids: 20


In [6]:
# Give each residue letter an integer index, numbered in sorted order starting at 0.
amino_acid_to_num_aa = dict(
  zip(sorted(unique_amino_acids), range(len(unique_amino_acids)))
)
amino_acid_to_num_aa

{'A': 0,
 'C': 1,
 'D': 2,
 'E': 3,
 'F': 4,
 'G': 5,
 'H': 6,
 'I': 7,
 'K': 8,
 'L': 9,
 'M': 10,
 'N': 11,
 'P': 12,
 'Q': 13,
 'R': 14,
 'S': 15,
 'T': 16,
 'V': 17,
 'W': 18,
 'Y': 19}

In [8]:
# Fixed per-protein vector length. When sequence_encoding is built, sequences longer
# than this are skipped and shorter ones are repeated and cut to exactly this length.
# It is also the default input/output width of ReconstructionNetwork.
MAX_SEQ_LENGTH = 3000

In [10]:
# Map every protein whose sequence fits into MAX_SEQ_LENGTH to a list of exactly
# MAX_SEQ_LENGTH integer residue indices. Shorter sequences are padded by repeating
# the sequence cyclically and cutting the result at MAX_SEQ_LENGTH.
sequence_encoding = {}

for protein in protein_data:
  seq = protein_data[protein]['Sequence']
  # Over-long sequences are left out. Empty sequences are left out as well:
  # repeating an empty list can never reach MAX_SEQ_LENGTH, so padding would never end.
  if not seq or len(seq) > MAX_SEQ_LENGTH:
    continue
  encoded_seq = [amino_acid_to_num_aa[aa] for aa in seq]
  # Ceil division: the smallest number of copies that covers MAX_SEQ_LENGTH positions.
  repeats = -(-MAX_SEQ_LENGTH // len(encoded_seq))
  # Build a fresh list so the stored value never aliases encoded_seq.
  sequence_encoding[protein] = (encoded_seq * repeats)[:MAX_SEQ_LENGTH]

In [13]:
# Build a neural reconstruction network: the input is a protein sequence encoded as
# MAX_SEQ_LENGTH values, the output has the same size, and the latent layer has dimension 20.

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Dataset class (wrapped by a DataLoader further down)
class MyDataset(Dataset):
  """Map-style dataset over a dict of encoded sequences.

  Args:
    data: mapping from an identifier to a sequence of numbers accepted by
      ``torch.tensor``. Items are served in the mapping's key order as it
      was when the dataset was constructed.

  Each item is a ``(input, target)`` pair of two independent tensors built
  from the same stored sequence.
  """

  def __init__(self, data):
    self.data = data
    # Snapshot the keys once. Rebuilding list(data.keys()) on every
    # __getitem__ call costs O(n) per item, i.e. O(n^2) per epoch.
    self._keys = list(data.keys())

  def __len__(self):
    # Use the snapshot so every index below len() is guaranteed to be valid.
    return len(self._keys)

  def __getitem__(self, idx):
    seq = self.data[self._keys[idx]]
    return torch.tensor(seq), torch.tensor(seq)

# Reconstruction Network
class ReconstructionNetwork(nn.Module):
  """Fully connected encoder/decoder that reconstructs its input through a bottleneck.

  Encoder: input_dim -> 128 -> 64 -> latent_layer_dim (ReLU between layers).
  Decoder: latent_layer_dim -> 64 -> 128 -> input_dim (ReLU between layers).

  Args:
    latent_layer_dim: width of the bottleneck layer.
    input_dim: width of the input and of the reconstructed output. When
      omitted, the notebook-level MAX_SEQ_LENGTH is used, which matches the
      behaviour before this parameter existed.
  """

  def __init__(self, latent_layer_dim, input_dim=None):
    super(ReconstructionNetwork, self).__init__()
    # Resolve the default at call time so the global is only needed when
    # the caller does not pass an explicit width.
    if input_dim is None:
      input_dim = MAX_SEQ_LENGTH
    self.latent_layer_dim = latent_layer_dim
    self.input_dim = input_dim
    self.encoder = nn.Sequential(
        nn.Linear(input_dim, 128),
        nn.ReLU(),
        nn.Linear(128, 64),
        nn.ReLU(),
        nn.Linear(64, latent_layer_dim)
    )
    self.decoder = nn.Sequential(
        nn.Linear(latent_layer_dim, 64),
        nn.ReLU(),
        nn.Linear(64, 128),
        nn.ReLU(),
        nn.Linear(128, input_dim)
    )

  def forward(self, x):
    """Return ``(latent, reconstructed)`` for a float tensor whose last dimension is input_dim."""
    latent = self.encoder(x)
    reconstructed = self.decoder(latent)
    return latent, reconstructed

In [16]:
def train_network(model, train_loader, optimizer, criterion, num_epochs):
  """Train `model` to reconstruct its own input and report the loss per epoch.

  Each batch from `train_loader` is a pair whose first element is cast to
  float and used as both the model input and the reconstruction target; the
  second element is ignored. `model` must return a 2-tuple whose second
  entry is the reconstruction.

  Returns:
    (train_losses, model): one entry per epoch, each being the mean of the
    per-batch loss values for that epoch, and the same model object.

  NOTE(review): if the batches hold categorical indices, a regression
  criterion on the raw index values treats them as ordinal -- confirm that
  this is intended.
  """
  train_losses = []
  for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for batch, _ignored_target in train_loader:
      # Cast once; the same float tensor is both the input and the target.
      inputs = batch.float()
      optimizer.zero_grad()
      _latent, reconstructed = model(inputs)
      batch_loss = criterion(reconstructed, inputs)
      batch_loss.backward()
      optimizer.step()
      train_loss += batch_loss.item()

    # Mean over batches (not weighted by batch size).
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss/len(train_loader)}")
    train_losses.append(train_loss/len(train_loader))

  return train_losses, model


In [18]:
# Seed torch's global RNG so that weight initialisation and the DataLoader's
# shuffle order are repeatable after a kernel restart.
torch.manual_seed(0)

# Batches of 500 padded sequences, reshuffled every epoch.
trainloader = DataLoader(MyDataset(sequence_encoding), batch_size=500, shuffle=True)
# 20-dimensional latent layer.
model = ReconstructionNetwork(20)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()
num_epochs = 10

trainlosses, model = train_network(model, trainloader, optimizer, criterion, num_epochs)

Epoch 1/10, Train Loss: 48.13909739982791
Epoch 3/10, Train Loss: 30.32687052284799
Epoch 4/10, Train Loss: 30.32943972145639
Epoch 5/10, Train Loss: 30.33499299026117
Epoch 6/10, Train Loss: 30.3258369259718
Epoch 7/10, Train Loss: 30.34180194575612
Epoch 8/10, Train Loss: 30.324753970634646
Epoch 9/10, Train Loss: 30.323198411522842
Epoch 10/10, Train Loss: 30.33403377998166


In [19]:
# Collect latent values
# NOTE: this replaces each padded sequence in `sequence_encoding` with its latent
# vector, so the cell cannot be re-run without rebuilding `sequence_encoding` first.
model.eval()
# Inference only: skip autograd bookkeeping for the forward passes.
with torch.no_grad():
  for protein in sequence_encoding:
    latent, _ = model(torch.tensor(sequence_encoding[protein]).float())
    sequence_encoding[protein] = latent.numpy()

In [22]:
# Replace each stored vector with a dict that labels its entries
# encoding_0, encoding_1, ... in positional order.
for protein, latent_vector in sequence_encoding.items():
  sequence_encoding[protein] = {
      f"encoding_{i}": value for i, value in enumerate(latent_vector)
  }


In [23]:
# Spot-check: display the labelled encoding stored under the key "P05067".
sequence_encoding["P05067"]

{'encoding_0': -17.60629,
 'encoding_1': -41.13669,
 'encoding_2': 42.415375,
 'encoding_3': -10.006809,
 'encoding_4': 44.50327,
 'encoding_5': 9.062079,
 'encoding_6': -11.936504,
 'encoding_7': -20.683659,
 'encoding_8': 3.984549,
 'encoding_9': 37.785206,
 'encoding_10': 12.472336,
 'encoding_11': -55.68788,
 'encoding_12': -19.370184,
 'encoding_13': -11.168778,
 'encoding_14': -13.122353,
 'encoding_15': 10.536164,
 'encoding_16': -36.11607,
 'encoding_17': -63.010822,
 'encoding_18': 20.33886,
 'encoding_19': -19.18953}

In [26]:
import pandas as pd

# Save as CSV.
# orient="columns" (the default) turns every protein key into a column and
# every encoding_i label into a row.
# NOTE(review): index=False drops those encoding_i row labels from the file, so
# rows are identifiable only by position -- confirm downstream readers expect this.
df = pd.DataFrame.from_dict(sequence_encoding, orient="columns")
df.to_csv(path_or_buf="files_for_ml/latent_values.csv", index=False)