# 🚀 Installation Instructions

**Please run the following cells first**, before any other step.  
They download the pretrained classifier checkpoint, install the required packages, and load the models used by the predictor.


In [None]:
!wget https://github.com/SabrinaCesaroni/2nd-Hack-Nation-Global-AI-Hackathon/raw/refs/heads/main/protein_ss_classifier.pth

--2025-08-10 12:31:54--  https://github.com/SabrinaCesaroni/2nd-Hack-Nation-Global-AI-Hackathon/raw/refs/heads/main/protein_ss_classifier.pth
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/SabrinaCesaroni/2nd-Hack-Nation-Global-AI-Hackathon/refs/heads/main/protein_ss_classifier.pth [following]
--2025-08-10 12:31:55--  https://raw.githubusercontent.com/SabrinaCesaroni/2nd-Hack-Nation-Global-AI-Hackathon/refs/heads/main/protein_ss_classifier.pth
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5267338 (5.0M) [application/octet-stream]
Saving to: ‘protein_ss_classifier.pth’


2025-08-10 12:31:56 (4

In [None]:
! pip install esm -q
! pip install fair-esm -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.4/54.4 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m106.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m81.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m48.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import esm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
from tqdm import tqdm
import warnings
from IPython.display import display, Markdown
warnings.filterwarnings('ignore')

# Pick the compute device (CUDA when PyTorch can see a GPU, otherwise CPU),
# then report the runtime in one print call.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(
    "✅ Libraries imported successfully",
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    f"Using device: {device}",
    sep="\n",
)

In [None]:
# Load the pretrained ESM-2 model together with its alphabet, switch it to
# inference mode and place it on the selected device.
model_esm, alphabet = esm.pretrained.esm2_t6_8M_UR50D()
model_esm = model_esm.eval().to(device)

# The batch converter turns (label, sequence) pairs into token tensors for the model.
batch_converter = alphabet.get_batch_converter()

# Width of the token embedding table of the loaded model.
embedding_dim = model_esm.embed_tokens.embedding_dim


In [None]:
def q8_to_q3(q8_onehot):
    """Collapse 8-state (+NoSeq) one-hot labels into 3-state (+NoSeq) labels.

    Args:
        q8_onehot: 3-D array indexed as (N, L, channel); channels 0-8 are read.

    Returns:
        Float array of shape (N, L, 4) whose channels are, in order,
        H (Helix), E (Strand), C (Coil) and NoSeq. Each output channel is the
        sum of the input channels assigned to it.
    """
    num_seqs, seq_len, _ = q8_onehot.shape

    # Output channel index -> the Q8 channels merged into it.
    channel_groups = (
        (3, 4, 5),  # H (Helix)
        (1, 2),     # E (Strand)
        (0, 6, 7),  # C (Coil)
        (8,),       # NoSeq
    )

    q3 = np.zeros((num_seqs, seq_len, 4))
    for q3_channel, q8_channels in enumerate(channel_groups):
        first, *rest = q8_channels
        merged = q8_onehot[:, :, first]
        for q8_channel in rest:
            merged = merged + q8_onehot[:, :, q8_channel]
        q3[:, :, q3_channel] = merged

    return q3
# Amino-acid channel order: 21 one-letter codes followed by the 'NoSeq' padding label.
aa_order = list('ACEDGFIHKMLNQPSRTWVYX') + ['NoSeq']
# Secondary-structure channel order: H, E, C, then the 'NoSeq' padding label.
ss_order = list('HEC') + ['NoSeq']

def onehot_to_string_with_padding(onehot_array, order, noseq_index, pad_char):
    """Decode one-hot rows into strings, writing ``pad_char`` for the NoSeq class.

    Args:
        onehot_array: 3-D array; the arg-max over axis 2 selects the class per position.
        order: Sequence mapping a class index to its character.
        noseq_index: Class index that marks padding.
        pad_char: Character emitted wherever the class equals ``noseq_index``.

    Returns:
        A list with one decoded string per entry along the first axis.
    """
    class_ids = np.argmax(onehot_array, axis=2)
    num_rows, _ = class_ids.shape
    return [
        ''.join(
            pad_char if class_id == noseq_index else order[class_id]
            for class_id in class_ids[row]
        )
        for row in range(num_rows)
    ]



In [None]:
# Sanity check: show the embedding width read from model_esm.embed_tokens.
print(embedding_dim)

In [None]:
class ProteinSSClassifier(nn.Module):
    """Per-position classifier: a bidirectional LSTM followed by a two-layer MLP head.

    Args:
        esm_dim: Feature size of each input embedding vector.
        hidden_dim: LSTM hidden size per direction; also the width of the MLP hidden layer.
        num_classes: Number of logits produced for every position.
        dropout: Dropout probability used inside the MLP head.
    """

    def __init__(self, esm_dim=320, hidden_dim=256, num_classes=3, dropout=0.3):
        super().__init__()
        # Forward and backward hidden states are concatenated by the LSTM.
        lstm_out_dim = hidden_dim * 2
        self.bilstm = nn.LSTM(
            input_size=esm_dim,
            hidden_size=hidden_dim,
            batch_first=True,
            bidirectional=True,
        )
        head_layers = [
            nn.Linear(lstm_out_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, esm_embeddings):
        """Map embeddings (batch, seq_len, esm_dim) to logits (batch, seq_len, num_classes)."""
        lstm_out, _state = self.bilstm(esm_embeddings)   # (batch, seq_len, hidden_dim*2)
        logits = self.classifier(lstm_out)
        return logits

print("✅ Classifier model defined")
print("Architecture: ESM embeddings -> Hidden layers -> 3 classes (H/E/C)")


In [None]:
# Rebuild the trained classifier from the downloaded checkpoint.
# Requires ProteinSSClassifier and `device` to be defined by the cells above.

# 1. Load checkpoint
# SECURITY NOTE: weights_only=False unpickles arbitrary Python objects, which can run code
# embedded in the file. Only load checkpoints from a source you trust. If this checkpoint
# holds nothing but tensors and plain Python containers, prefer weights_only=True.
checkpoint = torch.load("protein_ss_classifier.pth", weights_only=False, map_location="cpu")

# 2. Extract saved config
cfg = checkpoint["model_config"]

# 3. Rebuild model using saved hyperparameters
classifier = ProteinSSClassifier(
    esm_dim=cfg["esm_dim"],
    hidden_dim=cfg["hidden_dim"],
    num_classes=cfg["num_classes"],
    dropout=cfg["dropout"]
)

# 4. Load weights, then move the model to the device used by the ESM model.
# The weights are loaded on the CPU (map_location="cpu"); without the .to(device) call the
# classifier would stay there while the ESM embeddings live on `device`, and prediction
# would fail with a device-mismatch error on a GPU runtime.
classifier.load_state_dict(checkpoint["model_state_dict"])
classifier = classifier.to(device)
classifier.eval()

print("✅ Model loaded successfully")
print("Best validation accuracy:", checkpoint["training_history"]["best_val_acc"])


# Predictor

This software provides two ways to input protein sequences for secondary structure prediction:

1. **Simple String Input:**  
   Enter the amino acid sequence directly as a string of letters representing the protein chain.

2. **FASTA File Input:**  
   Upload a FASTA file: a header line starting with `>` followed by the amino acid sequence on one or more lines. One sequence per file is expected.

Choose the input method that best fits your data format.


In [None]:
# @title ##run **ESM-Secondary Structure Predictor** from sequence
# @markdown Insert the sequence here, then run the cells:
def predict_secondary_structure(sequence, model_esm, classifier, batch_converter, device):
    """Predict secondary structure for a single protein sequence.

    Args:
        sequence: Amino-acid sequence as a string. Whitespace is removed and letters
            are upper-cased before tokenization, so pasted or FASTA-style input works.
        model_esm: Embedding model; called as ``model_esm(tokens, repr_layers=[...])``
            and expected to expose ``num_layers``.
        classifier: Module mapping embeddings to per-residue logits over the
            three classes H/E/C (indices 0/1/2).
        batch_converter: Callable turning ``[(label, sequence)]`` into
            ``(labels, strings, tokens)``.
        device: Device the token tensor is moved to (the device ``model_esm`` lives on).

    Returns:
        ``(predicted_ss, confidence)``: a string with one of ``H``/``E``/``C`` per
        residue, and the mean over residues of the highest softmax probability.

    Raises:
        ValueError: If the sequence is empty after whitespace removal, or if the
            number of embedded positions does not match the number of residues.
    """
    # Normalize the input: drop spaces/newlines and use upper-case residue letters.
    sequence = "".join(sequence.split()).upper()
    if not sequence:
        raise ValueError("sequence is empty; provide at least one amino-acid residue")

    classifier.eval()

    # Prepare sequence
    data = [("input_seq", sequence)]
    _labels, _strs, batch_tokens = batch_converter(data)
    batch_tokens = batch_tokens.to(device)

    with torch.no_grad():
        # Get ESM embeddings from the last layer
        final_layer = model_esm.num_layers
        results = model_esm(batch_tokens, repr_layers=[final_layer])
        embeddings = results["representations"][final_layer]
        embeddings = embeddings[:, 1:-1, :]  # Remove special tokens (first and last position)

        # Guard against silent misalignment between residues and predictions.
        if embeddings.shape[1] != len(sequence):
            raise ValueError(
                f"tokenizer produced {embeddings.shape[1]} positions for "
                f"{len(sequence)} residues; the sequence probably contains "
                "characters that are not valid amino-acid letters"
            )

        # Run the classifier on whatever device its weights live on, so a classifier
        # left on the CPU still works when the ESM model runs on the GPU.
        clf_param = next(classifier.parameters(), None)
        if clf_param is not None:
            embeddings = embeddings.to(clf_param.device)

        # Get predictions
        outputs = classifier(embeddings)
        predictions = torch.argmax(outputs, dim=-1)

        # Convert to letters
        ss_map = {0: 'H', 1: 'E', 2: 'C'}
        predicted_ss = ''.join(ss_map[int(pred)] for pred in predictions[0])

        # Get confidence scores
        probs = torch.softmax(outputs, dim=-1)
        confidence = torch.max(probs, dim=-1)[0].mean().item()

    return predicted_ss, confidence

# Example prediction
print("🧬 Example Prediction", "=" * 30, sep="\n")

test_sequence = "GWSTELEKHREELKEFLKKEGITNVEIRIDNGRLEVRVEGGTERLKRFLEELRQKLEKKGYTVDIKIE" #@param {type:"string"}

# Run the ESM + classifier pipeline on the sequence entered in the form.
predicted_ss, confidence = predict_secondary_structure(
    sequence=test_sequence,
    model_esm=model_esm,
    classifier=classifier,
    batch_converter=batch_converter,
    device=device,
)

# Show the residues, the per-residue H/E/C call, and the mean confidence.
print(
    f"Sequence:  {test_sequence}",
    f"Predicted: {predicted_ss}",
    f"Confidence: {confidence:.4f}",
    sep="\n",
)

# # Calculate accuracy for this example
# example_acc = sum(1 for t, p in zip(true_ss, predicted_ss) if t == p) / len(true_ss)
# print(f"Example accuracy: {example_acc:.4f}")



🧬 Example Prediction
Sequence:  GWSTELEKHREELKEFLKKEGITNVEIRIDNGRLEVRVEGGTERLKRFLEELRQKLEKKGYTVDIKIE
Predicted: CCHHHHHHHHHHHHHHHHHCCCCEEEEEECCCEEEEEECCCHHHHHHHHHHHHHHHHHCCCEEEEEEE
Confidence: 0.9186


In [None]:
# @title ##run **ESM-Secondary Structure Predictor** from FASTA file
# @markdown Run the cells, then upload the FASTA file
from google.colab import files

# Upload FASTA file interactively
uploaded = files.upload()

# The result is iterated by file name below; it is empty when the upload dialog is
# cancelled, which would otherwise surface as a bare StopIteration.
if not uploaded:
    raise RuntimeError("No file was uploaded; re-run this cell and choose a FASTA file.")

# Only the first uploaded file is used.
fasta_filename = next(iter(uploaded))

def parse_fasta(filename):
    """Read the first sequence record of a FASTA file and return it as one string.

    Header lines (starting with ``>``) and blank lines are skipped. Reading stops
    at the header of a second record, so the chains of a multi-record file are not
    concatenated into one artificial sequence. Whitespace inside sequence lines is
    removed and residues are upper-cased.

    Args:
        filename: Path of the FASTA file to read.

    Returns:
        The residues of the first record as a single upper-case string.

    Raises:
        ValueError: If the file contains no sequence residues.
    """
    sequence_lines = []
    with open(filename, 'r') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith('>'):
                if sequence_lines:
                    # A new header after sequence data marks the next record: stop here.
                    break
                # Skip the header of the first record
                continue
            sequence_lines.append(''.join(line.split()))

    # Join all sequence lines of the record into one string
    sequence = ''.join(sequence_lines).upper()
    if not sequence:
        raise ValueError(f"No sequence data found in FASTA file: {filename}")
    return sequence

# Read the residues from the uploaded FASTA file
test_sequence = parse_fasta(fasta_filename)

print(f"Parsed AA sequence from {fasta_filename}:", test_sequence, sep="\n")

# Run the predictor on the parsed sequence
predicted_ss, confidence = predict_secondary_structure(
    sequence=test_sequence,
    model_esm=model_esm,
    classifier=classifier,
    batch_converter=batch_converter,
    device=device,
)

# Show the residues, the per-residue H/E/C call, and the mean confidence
print(
    f"\nSequence:  {test_sequence}",
    f"Predicted: {predicted_ss}",
    f"Confidence: {confidence:.4f}",
    sep="\n",
)


Saving rcsb_pdb_1MBN.fasta to rcsb_pdb_1MBN.fasta
Parsed AA sequence from rcsb_pdb_1MBN.fasta:
VLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRFKHLKTEAEMKASEDLKKHGVTVLTALGAILKKKGHHEAELKPLAQSHATKHKIPIKYLEFISEAIIHVLHSRHPGDFGADAQGAMNKALELFRKDIAAKYKELGYQG

Sequence:  VLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRFKHLKTEAEMKASEDLKKHGVTVLTALGAILKKKGHHEAELKPLAQSHATKHKIPIKYLEFISEAIIHVLHSRHPGDFGADAQGAMNKALELFRKDIAAKYKELGYQG
Predicted: CCCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHCCHHHHHHHHHHHHCCCHHHHHHHHHHHHHCCHHHHHHHHHHHHCCCCHHHHHHHHHHHHHHCCCCHHHHHHHHHHHHHHHHCCCCCCCCHHHHHHHHHHHHHHHHHHHHHHHHCCCCC
Confidence: 0.8859
