---
### Imports

---

In [2]:
import pandas as pd

from tqdm.notebook import tqdm,trange

from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, Dataset

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AdamW


import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix , classification_report 

from tqdm.notebook import tqdm,trange

import random
import numpy as np
import os



import pandas as pd 
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from torch.utils.data import Dataset


In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(seed = 12)

---
### Preprocessing

---

In [5]:
input_features = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K','L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
data = pd.read_csv("/kaggle/input/deep-learning-for-msc-202324/labels_train.csv")
data.head()

Unnamed: 0,PDB_ID,SEC_STRUCT
0,1VBK_1_A,CCEEEEEECCCCCCCCCCHHHHHHHHHHHHHHHHHHCCCCCCEEEE...
1,1QGV_1_A,CCCCCCECCCHHHHHHHHHCCCCCEEEEEEECCCCHHHHHHHHHHH...
2,1KX6_1_A,CCCCCCCCCCCCCCCCCCCCCCHHHHCCC
3,1V88_1_A,CCCCCCCCCEEEEEEECCCCCCCEEEEEEEECCEEEEECCCCCCCC...
4,1ZY8_2_K,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...


In [6]:
data = pd.read_csv("/kaggle/input/deep-learning-for-msc-202324/labels_train.csv")
# average length of SEC_STRUCT entries
total_length = data['SEC_STRUCT'].apply(lambda x: len(x)).sum()
average_length = data['SEC_STRUCT'].apply(lambda x: len(x)).mean()

print(f"Average Number of Characters in SEC_STRUCT: {average_length}")
print(f"Total Number of Characters in SEC_STRUCT: {total_length}")

Average Number of Characters in SEC_STRUCT: 223.48713286713286
Total Number of Characters in SEC_STRUCT: 1597933


In [7]:
target_map = {
    "C": 0,
    "E": 1,
    "H": 2,
}

In [8]:
from tqdm import trange
from tqdm.notebook import tqdm

# Assuming `data`, `input_features`, `target_map`, and `seq_path` are predefined
seq_path = "/kaggle/input/deep-learning-for-msc-202324/train"

# Initialize an empty list for collecting DataFrames
dfs = []
# Using trange for a progress bar in a Jupyter Notebook environment
for i in trange(len(data["PDB_ID"].unique()), desc='Processing PDB files'):
    pdb_id = data["PDB_ID"].unique()[i]
    pdb_path = f"{seq_path}/{pdb_id}_train.csv"
    pdb_data = pd.read_csv(pdb_path)
    
    # Filter to the desired input features
    pdb_data = pdb_data[input_features].copy()
    
    # Extracting the SEC_STRUCT for the current pdb_id
    # Ensuring the selection is safe by resetting index and accessing the first row
    sec_struct = data.loc[data["PDB_ID"] == pdb_id, "SEC_STRUCT"].reset_index(drop=True)[0]
    
    # Mapping SEC_STRUCT characters to target values
    targets = [target_map[sec] for sec in list(sec_struct)]
    
    # Assigning targets and PDB_ID to the DataFrame
    pdb_data["targets"] = targets
    pdb_data["PDB_ID"] = pdb_id
    
    # Append the modified DataFrame to the list
    dfs.append(pdb_data)

# Concatenate all collected DataFrames into complete_df
complete_df = pd.DataFrame()
complete_df = pd.concat(dfs, ignore_index=True)


Processing PDB files: 100%|██████████| 7150/7150 [01:21<00:00, 88.10it/s] 


In [9]:
# Corrected version of the first snippet
test_data = []  # Renamed to avoid confusion with testfile
testfile = pd.read_csv("/kaggle/input/deep-learning-for-msc-202324/seqs_test.csv")
testfile = testfile.rename(columns={0: "PDB_ID", 1: "SEQUENCE"})

for i in trange(len(testfile)):
    row = testfile.iloc[i]
    path = f"/kaggle/input/deep-learning-for-msc-202324/test/{row['PDB_ID']}_test.csv"
    file_read = pd.read_csv(path)
    features = file_read[input_features].copy()  
    features["PDB_ID"] = row["PDB_ID"]
    features["ID"] = [f"{row['PDB_ID']}_{num}" for num in file_read["RES_NUM"]]

    for x, val in features.iterrows(): 
        test_data.append(val.to_dict())

test_df = pd.DataFrame(test_data)


100%|██████████| 205/205 [00:05<00:00, 37.10it/s]


In [10]:
len(complete_df)

1597933

In [11]:
complete_df.iloc[300:310]

Unnamed: 0,A,C,D,E,F,G,H,I,K,L,...,P,Q,R,S,T,V,W,Y,targets,PDB_ID
300,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14191,0.0,0.523431,...,0.0,0.0,0.0,0.0,0.0,0.061008,0.0,0.0,2,1VBK_1_A
301,0.006024,0.0,0.0,0.040448,0.0,0.0,0.109294,0.0,0.233219,0.0,...,0.0,0.126506,0.277108,0.034423,0.011188,0.0,0.0,0.0,2,1VBK_1_A
302,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,1VBK_1_A
303,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,...,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,2,1VBK_1_A
304,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,1VBK_1_A
305,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1VBK_1_A
306,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1VBK_1_A
307,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1QGV_1_A
308,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.932635,0.0,0.0,0.0,0.067365,0,1QGV_1_A
309,0.017991,0.0,0.0,0.0,0.11994,0.0,0.034483,0.02099,0.0,0.035982,...,0.0,0.0,0.0,0.0,0.0,0.148426,0.0,0.622189,0,1QGV_1_A


In [12]:
#train_main_df , test_main_df = train_test_split(main_df, test_size = 0.2, random_state = 12)
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(complete_df, test_size=0.2, stratify=complete_df['targets'], random_state=12)

In [13]:
train_df.head()

Unnamed: 0,A,C,D,E,F,G,H,I,K,L,...,P,Q,R,S,T,V,W,Y,targets,PDB_ID
1135981,0.086554,0.006013,0.02642,0.026025,0.002563,0.016167,0.033222,0.005718,0.02711,0.017843,...,0.491916,0.003155,0.024645,0.059937,0.119282,0.003647,0.001577,0.001282,1,1FFV_3_C
784193,0.002703,0.000676,0.163722,0.098175,0.000483,0.222705,0.031856,0.0,0.048074,0.003089,...,0.035718,0.009943,0.072304,0.027126,0.0,0.0,0.0,0.16295,0,1S5J_1_A
1196898,0.021587,0.005864,0.0,0.0,0.066759,0.005994,0.00291,0.312383,0.000782,0.290275,...,0.001477,0.000608,0.000174,0.000261,0.007123,0.209008,0.02719,0.010555,2,1R2J_1_A
165708,0.066205,0.0,0.062102,0.002971,0.398076,0.021644,0.0,0.0,0.011176,0.057717,...,0.0,0.0174,0.03678,0.087283,0.086434,0.004385,0.0,0.002263,0,1Y6K_2_R
109075,0.006175,0.0013,0.005633,0.023616,0.258044,0.004767,0.010183,0.037157,0.004983,0.118189,...,0.007041,0.00455,0.010616,0.045391,0.020583,0.036616,0.271693,0.06879,1,2AD1_1_A


In [14]:
val_df.head()

Unnamed: 0,A,C,D,E,F,G,H,I,K,L,...,P,Q,R,S,T,V,W,Y,targets,PDB_ID
300836,0.035529,0.003089,0.010308,0.0,0.163223,0.001144,0.002103,0.068415,0.00026,0.599176,...,0.001842,0.000223,6.5e-05,0.0016,0.001582,0.03765,0.011192,0.010568,2,1BN7_1_A
1422196,0.524815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006336,0.0,...,0.0,0.009504,0.0,0.459345,0.0,0.0,0.0,0.0,0,1OKG_1_A
415530,0.103286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.788732,0.0,0.0,0.0,0.0,0.107981,0.0,0.0,0,1U2X_1_A
855391,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,1IJ6_1_A
381915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.864407,0.135593,0.0,0.0,0.0,0,1L3A_1_A


In [15]:
import torch
from torch.utils.data import Dataset
import pandas as pd

class Final_Dataset(Dataset):
    def __init__(self, df, input_features= input_features, is_test=False, fixed_length=1200):
        super(Final_Dataset, self).__init__()
        self.df = df.reset_index(drop=True).groupby("PDB_ID")
        self.group_keys = list(self.df.groups.keys())
        self.input_features = input_features
        self.is_test = is_test
        self.fixed_length = fixed_length

    def truncate_pad(self, data):
        length = len(data)
        pdb_id = data['PDB_ID'].iloc[0] if not data.empty else "Empty Dataset"
        
        # Pad the data if it's shorter than the fixed length
        if length < self.fixed_length:
            pad_rows = self.fixed_length - length
            # Create a DataFrame for padding with -1, and replicate pdb_id for each row
            pad_df = pd.DataFrame(-1, index=range(pad_rows), columns=self.input_features)
            pad_df['PDB_ID'] = pdb_id
            # add targets if it's not test data
            if not self.is_test: 
                pad_df['targets'] = -1
            # Concatenate original data with the padding DataFrame
            data = pd.concat([data, pad_df], ignore_index=True)
        
        # Truncate the data if it's longer than the fixed length
        elif length > self.fixed_length:
            data = data.iloc[:self.fixed_length]
        
        return data

    def __getitem__(self, idx):
        group_key = self.group_keys[int(idx)]
        data = self.df.get_group(group_key)

        data = self.truncate_pad(data)  # Use the integrated method
        inputs = torch.tensor(data[self.input_features].values, dtype=torch.float32)
        
        if not self.is_test:
            targets = torch.tensor(data["targets"].values, dtype=torch.long)
            return inputs, targets
        else:
            return inputs

    def __len__(self):
        return len(self.group_keys)


In [16]:
train_ds = Final_Dataset(train_df)
val_ds = Final_Dataset(val_df)
test_ds = Final_Dataset(test_df, is_test = True)

train_loader = DataLoader(train_ds, 
                          batch_size=64, 
                          shuffle=True, 
                          num_workers=4,
                          pin_memory=False, 
                          drop_last=True)
val_loader = DataLoader(val_ds, 
                          batch_size=64, 
                          shuffle=True, 
                          num_workers=4,
                          pin_memory=False, 
                          drop_last=False)
test_loader = DataLoader(test_ds, 
                        batch_size= 64,
                        shuffle=False, 
                        num_workers=4,
                        pin_memory=False, 
                        drop_last=False)

---
### Model Structure
---

In [17]:
torch.cuda.empty_cache()

In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SeqProteinUNet(nn.Module):
    def __init__(self):
        super(SeqProteinUNet, self).__init__()
        # first layer of convolution
        self.layer_conv1 = nn.Conv2d(1, 64, kernel_size=(5, 5), stride=1, padding=2)
        # Second convolution layer
        self.layer_conv2 = nn.Conv2d(64, 128, kernel_size=(5, 5), stride=1, padding=2)
        # Third convolution layer
        self.layer_conv3 = nn.Conv2d(128, 256, kernel_size=(5, 5), stride=1, padding=2)
        
        # Bottleneck features with ReLU
        self.core_bottleneck = nn.Sequential(
            nn.Conv2d(256, 256, kernel_size=(3, 3), stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=(3, 3), stride=1, padding=1),
            nn.ReLU(inplace=True)
        )

        # Upsampling layers
        self.layer_upconv1 = nn.ConvTranspose2d(256, 128, kernel_size=(5, 5), stride=1, padding=2)
        self.layer_upconv2 = nn.ConvTranspose2d(128, 64, kernel_size=(5, 5), stride=1, padding=2)
        self.final_layer = nn.ConvTranspose2d(64, 3, kernel_size=(1, 1), stride=1)

    def forward(self, input_tensor):
        # Add a channel dimension
        input_tensor = input_tensor.unsqueeze(1)

        # Encoding path
        encode_path1 = F.relu(self.layer_conv1(input_tensor))
        encode_path2 = F.relu(self.layer_conv2(encode_path1))
        encode_path3 = F.relu(self.layer_conv3(encode_path2))
        
        # Process through the bottleneck
        bottleneck_output = self.core_bottleneck(encode_path3)

        # Decoding path with skip connections
        decode_path1 = F.relu(self.layer_upconv1(bottleneck_output)) + encode_path2  
        decode_path2 = F.relu(self.layer_upconv2(decode_path1)) + encode_path1  
        decoded_output = self.final_layer(decode_path2)  

        # Average pooling at the end
        final_output = torch.mean(decoded_output, dim=-1)

        return final_output


In [20]:

import numpy as np
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F

def load_dataset(dataset):
#    Loads dataset into memory,GPU if available.
    if torch.cuda.is_available():
        return (dataset[0].to("cuda"), dataset[1].to("cuda"))
    else:
        return dataset

def calculate_accuracy(predictions, labels):
    valid_labels_mask = labels != -1
    predicted_labels = torch.max(predictions, 1)[1]
    correct_predictions = (predicted_labels == labels) & valid_labels_mask
    accuracy = correct_predictions.sum().float() / valid_labels_mask.sum().float()
    return accuracy.item()

def train(model, data_loader, validation_loader):
    model_parameters = list(model.named_parameters())
    params_without_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimized_parameters = []

    # Parameters with weight decay
    params_with_decay = []
    for name, param in model_parameters:
        if not any(nd in name for nd in params_without_decay):
            params_with_decay.append(param)

    # Parameters without weight decay
    params_no_decay = []
    for name, param in model_parameters:
        if any(nd in name for nd in params_without_decay):
            params_no_decay.append(param)

    optimized_parameters.append({'params': params_with_decay, 'weight_decay': 0.01})
    optimized_parameters.append({'params': params_no_decay, 'weight_decay': 0.0})
    
    
    
    
    
    criterion = nn.CrossEntropyLoss(ignore_index=-1)
    optimizer = torch.optim.AdamW(optimized_parameters, lr=0.002)

    training_losses = []
    training_accuracies = [] 
    validation_losses = [] 
    validation_accuracies = []
    predicted_outcomes = []
    actual_labels = []

    for epoch in range(15):
        model.train()
        cumulative_loss = 0.0
        correct_predictions = 0 
        total_samples = 0
        
        with tqdm(data_loader, desc=f'Epoch {epoch+1}/10', unit='batch') as progress_bar:
            for batch_index, batch_data in enumerate(progress_bar):
                inputs, labels = load_dataset(batch_data)
                optimizer.zero_grad()
                outputs = model(inputs)
                batch_loss = criterion(outputs, labels)
                batch_loss.backward()
                optimizer.step()

                cumulative_loss += batch_loss.item()
                batch_accuracy = calculate_accuracy(outputs, labels)
                correct_predictions += batch_accuracy * labels[labels != -1].size(0)
                total_samples += labels[labels != -1].size(0)
                
                progress_bar.set_postfix(loss=f'{(cumulative_loss/(batch_index+1)):.2f}', accuracy=f'{(correct_predictions / total_samples) * 100:.2f}%')

        avg_training_loss = cumulative_loss / len(data_loader)
        avg_training_accuracy = correct_predictions / total_samples
        training_losses.append(avg_training_loss)
        training_accuracies.append(avg_training_accuracy)

        model.eval()
        validation_loss = 0.0
        validation_corrects = 0
        validation_total =  0
        with torch.no_grad():
            for batch_data in validation_loader:
                inputs, labels = load_dataset(batch_data)
                outputs = model(inputs)
                batch_loss = criterion(outputs, labels)
                validation_loss += batch_loss.item()

                batch_accuracy = calculate_accuracy(outputs, labels)
                validation_corrects += batch_accuracy * labels[labels != -1].size(0)
                validation_total += labels[labels != -1].size(0)

                if epoch == 1:  # Collecting predictions and labels for further analysis
                    predicted_outcomes.extend(outputs.cpu().detach().numpy())
                    actual_labels.extend(labels.cpu().detach().numpy())

        avg_validation_loss = validation_loss / len(validation_loader)
        avg_validation_accuracy = validation_corrects / validation_total
        validation_losses.append(avg_validation_loss)
        validation_accuracies.append(avg_validation_accuracy)

        print(f'End of Epoch {epoch+1}: Training Loss: {avg_training_loss:.2f}, Training Accuracy: {avg_training_accuracy * 100:.2f}%, '
              f'Validation Loss: {avg_validation_loss:.2f}, Validation Accuracy: {avg_validation_accuracy * 100:.2f}%')

    training_metrics = [training_losses, training_accuracies, validation_losses, validation_accuracies]
    return model, training_metrics, np.array(predicted_outcomes), np.array(actual_labels)


In [21]:
model = SeqProteinUNet()

if torch.cuda.is_available():
    model = model.to("cuda")

model, stats, y_preds, y_labels = train(model, train_loader, val_loader)

Epoch 1/10: 100%|██████████| 111/111 [04:30<00:00,  2.44s/batch, accuracy=40.76%, loss=1.62]


End of Epoch 1: Training Loss: 1.62, Training Accuracy: 40.76%, Validation Loss: 1.06, Validation Accuracy: 44.06%


In [25]:
def process_labels(label, prediction):
    # Flatten the arrays
    label = label.flatten()
    

    if prediction.ndim == 2:
        prediction = prediction.argmax(1).flatten()
    else:
        prediction = prediction.flatten()
    
    mask = label != -1
    
    label_unmasked = label[mask]
    prediction_unmasked = prediction[mask]
    
    return label_unmasked, prediction_unmasked




In [28]:
reverse_map = {
    0: "C",
    1: "E",
    2: "H",
}

---
# Output

---

In [32]:
def final_test(model, test_loader):
    combined_out = []
    model.eval()
    with torch.no_grad():
        for inputs in test_loader:
            if torch.cuda.is_available():
                inputs = inputs.to("cuda")
            outputs = model(inputs)

            combined_out.append(outputs.argmax(1))
    return torch.concat(combined_out).cpu().numpy()

def generate_submission(test_df, test_preds, sample_sub_path = "/kaggle/input/comp-data/sample.csv"):
    # by keys getting the PBD_ID's and 
    groupby_df = test_df.groupby("PDB_ID")
    ids = []
    targets = []
    for (idx, key) in enumerate(list(groupby_df.groups.keys())):
        df = groupby_df.get_group(key)
        ids.extend(f"{key}_{i}" for i in range(1, 1 + len(df)  ))
        targets.extend([test_preds[idx][:len(df)].tolist()][0])

    target_dict = {idd:target for idd,target in zip(ids, targets)}
    sample = pd.read_csv(sample_sub_path)
    #reversemapping the structures for prediction
    sample["STRUCTURE"] = sample["ID"].map(target_dict).map(reverse_map)
    return sample


test_preds = final_test(model, test_loader)
submission = generate_submission(test_df, test_preds,"/kaggle/input/deep-learning-for-msc-202324/sample.csv")
submission.to_csv("submission.csv", index = False)
submission.head()

Unnamed: 0,ID,STRUCTURE
0,2AIO_1_A_1,C
1,2AIO_1_A_2,C
2,2AIO_1_A_3,C
3,2AIO_1_A_4,C
4,2AIO_1_A_5,C
