In [1]:
#Import necessary packages
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch_geometric.nn import GCNConv
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertModel, BertTokenizer
from sklearn.model_selection import train_test_split 

In [2]:
# Read the CSV file into a DataFrame
df = pd.read_csv('csv_files/finalized_complete_NLS_sequence_table.csv')

In [7]:
df

Unnamed: 0,UniProt ID,Sequence_full,Name,Begin,End,Sequence_nls,Length,Evidence,ECO code
0,Q14738,MPYKLKKEKEPPKVAKCTAKPSSSGKDGGGENTEEAQPQPQPQPQP...,Serine/threonine-protein phosphatase 2A 56 kDa...,548,565,KRTVETEAVQMLKDIKKE,18,Sequence Analysis,ECO:0000255
1,Q13362,MLTCNKAGSRMVVDAANSNGPFQPVVLLHIRDVPPADQEKLFIQKL...,Serine/threonine-protein phosphatase 2A 56 kDa...,416,422,KLKEKLK,7,Sequence Analysis,ECO:0000255
2,Q9NRA8,MDRRSMGETESGDAFLDLKKPPASKCPHRYTKEELLDIKELPHSKQ...,Eukaryotic translation initiation factor 4E tr...,195,211,RREFGDSKRVFGERRRN,17,,
3,P42684,MGQQVGRVGEAPGLQQPQPRGIRGSSAARPSGRRRDPAGRTTETGF...,Abelson tyrosine-protein kinase 2,658,660,KKR,3,Sequence Analysis,ECO:0000255
4,Q4JIM5,MGQQVGRVGEAPGLQQPQPRGIRGSSAARPSGRRRDPAGRTADAGF...,Abelson tyrosine-protein kinase 2,659,661,KKR,3,Sequence Analysis,ECO:0000255
...,...,...,...,...,...,...,...,...,...
1358,Q96CK0,MAERALEPEAEAEAEAGAGGEAAAEEGAAGRKARGRPRLTESDRAR...,Zinc finger protein 653,107,118,PKKPKRKKRRRR,12,Sequence Analysis,ECO:0000255
1359,Q96CK0,MAERALEPEAEAEAEAGAGGEAAAEEGAAGRKARGRPRLTESDRAR...,Zinc finger protein 653,445,451,EPEKRRR,7,Sequence Analysis,ECO:0000255
1360,Q24JY4,MVEKKTSVRSQDPGQRRVLDRAARQRRINRQLEALENDNFQDDPHA...,Zinc finger HIT domain-containing protein 1,38,47,DNFQDDPHAG,10,By similarity,ECO:0000250
1361,O43257,MVEKKTSVRSQDPGQRRVLDRAARQRRINRQLEALENDNFQDDPHA...,Zinc finger HIT domain-containing protein 1,38,47,DNFQDDPHAG,10,,


In [3]:
# Define a function to generate motif features
def generate_motif_features(sequences, motifs):
    motif_features = []
    for seq in sequences:
        seq_features = []
        for i in range(len(seq)):
            # Check if position i is within any of the motifs
            motif_present = any(start <= i < end for start, end in motifs)
            seq_features.append(int(motif_present))
        motif_features.append(seq_features)
    return motif_features

In [4]:
# Load pre-trained BERT model and tokenizer
bert_model = BertModel.from_pretrained('Rostlab/prot_bert')
tokenizer = BertTokenizer.from_pretrained('Rostlab/prot_bert')

In [5]:
# Preprocessing function
def preprocess_sequences(sequences, tokenizer):
    tokenized_sequences = tokenizer.batch_encode_plus(sequences, padding=True, truncation=True, max_length=5890, return_tensors='pt')
    return tokenized_sequences['input_ids'], tokenized_sequences['attention_mask']


In [6]:
# Define a simple GCN model
class GCNModel(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(GCNModel, self).__init__()
        self.conv1 = GCNConv(input_size, hidden_size)
        self.conv2 = GCNConv(hidden_size, hidden_size)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = torch.relu(x)
        x = self.conv2(x, edge_index)
        return x

In [9]:
# Define the combined BERT-GCN model
class BERT_GCN_Model(nn.Module):
    def __init__(self, bert_model, gcn_input_size, gcn_hidden_dim, num_classes):
        super(BERT_GCN_Model, self).__init__()
        self.bert = bert_model
        self.gcn = GCNModel(gcn_input_size, gcn_hidden_dim)
        self.linear = nn.Linear(gcn_hidden_dim, num_classes)

    def forward(self, input_ids, attention_mask, edge_index, motif_features):
        # BERT encoding
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)[0]
        # Concatenate BERT embeddings with motif features
        combined_features = torch.cat((bert_output, motif_features), dim=-1)
        # GCN layer
        gcn_output = self.gcn(combined_features, edge_index)
        # Linear layer
        output = self.linear(gcn_output)
        return output

In [10]:
def sequence_based_edge_index(sequence_length):
    edge_index = [[i, i + 1] for i in range(sequence_length - 1)]
    edge_index = np.array(edge_index).T
    return edge_index

In [25]:
# Split data into train, validation, and test sets
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [13]:
# Generate motif features for training sequences based off of columns in the CSV file
#train_motifs = list(zip(train_df['Begin'], train_df['End']))
#train_sequences = train_df['Sequence_full']
#train_motif_features = generate_motif_features(train_sequences, train_motifs)


In [11]:
# Split data into train, validation, and test sets
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Generate motif features for training sequences based off of columns in the CSV file
train_motifs = list(zip(train_df['Begin'], train_df['End']))
train_sequences = train_df['Sequence_full']
train_motif_features = generate_motif_features(train_sequences, train_motifs)

# Generate motif features for validation sequences based off of columns in the CSV file
val_motifs = list(zip(val_df['Begin'], val_df['End']))
val_sequences = val_df['Sequence_full']
val_motif_features = generate_motif_features(val_sequences, val_motifs)

# Generate motif features for test sequences based off of columns in the CSV file
test_motifs = list(zip(test_df['Begin'], test_df['End']))
test_sequences = test_df['Sequence_full']
test_motif_features = generate_motif_features(test_sequences, test_motifs)

# Tokenization and Preprocessing with fixed length
train_input_ids, train_attention_mask = preprocess_sequences(train_sequences, tokenizer)
val_input_ids, val_attention_mask = preprocess_sequences(val_sequences, tokenizer)
test_input_ids, test_attention_mask = preprocess_sequences(test_sequences, tokenizer)

# Calculate sequence lengths explicitly from the raw sequence strings
train_sequence_lengths = [len(seq) for seq in train_sequences]
val_sequence_lengths = [len(seq) for seq in val_sequences]
test_sequence_lengths = [len(seq) for seq in test_sequences]

In [47]:
# Generate GCN edge index for training sequences
train_edge_index = [sequence_based_edge_index(len(seq)) for seq in train_sequences]
print("Train sequence lengths:", [len(seq) for seq in train_sequences])
print("Train edge indices:", train_edge_index)
print("Train motif features shape:", torch.tensor(train_motif_features).shape)

# Generate GCN edge index for validation sequences
val_edge_index = [sequence_based_edge_index(len(seq)) for seq in val_sequences]
print("Validation sequence lengths:", [len(seq) for seq in val_sequences])
print("Validation edge indices:", val_edge_index)
print("Validation motif features shape:", torch.tensor(val_motif_features).shape)

# Generate GCN edge index for test sequences
test_edge_index = [sequence_based_edge_index(len(seq)) for seq in test_sequences]
print("Test sequence lengths:", [len(seq) for seq in test_sequences])
print("Test edge indices:", test_edge_index)
print("Test motif features shape:", torch.tensor(test_motif_features).shape)

Train sequence lengths: [1203, 526, 364, 890, 893, 588, 1013, 325, 242, 341, 419, 503, 1302, 789, 563, 961, 294, 795, 458, 994, 892, 777, 198, 165, 596, 910, 2002, 724, 754, 533, 355, 5890, 1435, 788, 194, 737, 962, 322, 718, 926, 934, 175, 657, 1046, 588, 484, 342, 1103, 384, 933, 893, 822, 755, 1416, 257, 823, 490, 747, 177, 785, 845, 377, 1203, 315, 240, 484, 729, 657, 406, 727, 727, 140, 782, 201, 1234, 489, 536, 490, 837, 760, 484, 640, 694, 567, 370, 902, 469, 165, 191, 651, 563, 365, 397, 316, 567, 1216, 626, 283, 377, 655, 1014, 1075, 851, 332, 288, 1069, 331, 317, 194, 364, 619, 910, 469, 166, 1001, 166, 225, 165, 978, 473, 343, 933, 640, 529, 397, 294, 1622, 327, 393, 269, 370, 521, 394, 301, 341, 355, 1889, 433, 1287, 568, 565, 639, 132, 642, 727, 613, 1070, 961, 82, 1593, 166, 690, 587, 913, 317, 1016, 1611, 170, 356, 339, 317, 227, 549, 596, 460, 337, 465, 329, 588, 653, 410, 487, 193, 165, 394, 653, 165, 530, 737, 394, 394, 846, 770, 332, 717, 727, 370, 651, 610, 105, 671

ValueError: expected sequence of length 1203 at dim 1 (got 526)

In [41]:
batch_size= 8
# Define DataLoader for training, validation, and test sets
train_dataset = TensorDataset(train_input_ids, train_attention_mask, torch.tensor(train_edge_index), torch.tensor(train_motif_features[0]))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

AssertionError: Size mismatch between tensors

In [30]:
val_dataset = TensorDataset(val_input_ids, val_attention_mask, torch.tensor(val_edge_index), torch.tensor(val_motif_features[0]))
val_loader = DataLoader(val_dataset, batch_size=batch_size)

test_dataset = TensorDataset(test_input_ids, test_attention_mask, torch.tensor(test_edge_index), torch.tensor(test_motif_features[0]))
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Define labels for training, validation, and test sets
train_labels = torch.ones(len(train_df), dtype=torch.float32)
val_labels = torch.ones(len(val_df), dtype=torch.float32)
test_labels = torch.ones(len(test_df), dtype=torch.float32)

ValueError: expected sequence of length 1202 at dim 2 (got 525)

In [1]:
# Define the model, set number of classes to 2 for a binary classification
model = BERT_GCN_Model(bert_model, bert_model.config.hidden_size + len(train_motif_features[0]), gcn_hidden_dim=64, num_classes=2)

# Define loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Define num_classes
num_classes = 2  # Use binary classification

# Training loop
# Training loop
for epoch in range(num_epochs):
    model.train()
    epoch_train_loss = 0.0
    for batch in train_loader:
        input_ids, attention_mask, edge_index, motif_features = batch
        outputs = model(input_ids, attention_mask, edge_index, motif_features)
        training_losses = criterion(outputs, train_labels)
        training_losses.backward()
        optimizer.step()
        epoch_train_loss += training_losses.item() * input_ids.size(0)  # Accumulate the loss


  # Validation loop
    model.eval()
    epoch_val_loss = 0.0
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, edge_index, motif_features = batch
            val_outputs = model(input_ids, attention_mask, edge_index, motif_features)
            val_losses = criterion(val_outputs, val_labels)
            epoch_val_loss += val_losses.item() * input_ids.size(0)  # Accumulate the loss
            # Calculate validation metrics

# Test loop
model.eval()
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, edge_index, motif_features = batch
        test_outputs = model(input_ids, attention_mask, edge_index, motif_features)
        # Calculate test metrics
        test_predictions = torch.sigmoid(test_outputs) > 0.5  # Threshold for binary predictions

# Save model parameters
torch.save(model.state_dict(), 'model.pth')

# Save tokenizer
tokenizer.save_pretrained('tokenizer')

# Save predictions to CSV file
predictions_df = pd.DataFrame(test_predictions.numpy(), columns=['predicted_label_{}'.format(i) for i in range(num_classes)])
predictions_df.to_csv('NLS_predictions.csv', index=False)

# Save training and validation losses
# Assuming training_losses and validation_losses are calculated during training
with open('NLS_training_losses.txt', 'w') as f:
    for loss in training_losses:
        f.write('{}\n'.format(loss))

with open('NLS_validation_losses.txt', 'w') as f:
    for loss in val_losses:
        f.write('{}\n'.format(loss))


NameError: name 'pd' is not defined