# Library Import

In [None]:
#library import
import pandas as pd
import numpy as np
from tqdm import tqdm
import math

In [None]:
#part 2
import time
import matplotlib.pyplot as plt
import os
import gc

In [None]:
#part 3a basic
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, random_split

In [None]:
#part 3b
from torch.optim.lr_scheduler import ExponentialLR, CosineAnnealingLR, ReduceLROnPlateau
from torchmetrics.classification import MultilabelF1Score
import torch.optim as optim

In [None]:
#part 4 adjustment
from torch.optim.lr_scheduler import _LRScheduler
from torch.utils.data import Dataset #dataset
import torch.nn.functional as F

In [None]:
#extra modification
from torch.optim.lr_scheduler import LambdaLR

In [None]:
# Set random seeds for reproducibility.
# The same seed (36) is applied to NumPy's global RNG and to PyTorch; the CUDA
# RNG is seeded explicitly only when a GPU is available.
np.random.seed(36)
torch.manual_seed(36)
if torch.cuda.is_available():
    torch.cuda.manual_seed(36)

# Dataset maker and loading

## Configuration

In [None]:
# Run a garbage-collection pass before the configuration and data-loading cells.
gc.collect()

In [None]:
# Root input directory on Kaggle; every dataset path below is built from it.
main_path = "/kaggle/input"
# Directory of the main competition data (CAFA 6 protein function prediction).
main_df_path = f"{main_path}/cafa-6-protein-function-prediction" # competition data root

In [None]:
#data declaration in configuration
class Config:
    """Central settings object: data paths, hyper-parameters and embedding metadata.

    All values are fixed at construction time. ``main_dir`` is taken from the
    module-level ``main_df_path``; every competition file path is derived
    from it.

    NOTE(review): within the code visible in this notebook, ``dropout_rate``,
    ``patience``, ``use_dynamic_lr``, ``lr_scheduler_type`` and ``lr_params``
    are stored here but not read anywhere -- confirm whether they are still
    needed.
    """
    def __init__(self):
        # ---- Competition files ----
        self.main_dir = main_df_path
        self.train_sequences_path = f"{self.main_dir}/Train/train_sequences.fasta"
        self.train_labels_path = f"{self.main_dir}/Train/train_terms.tsv"
        self.test_sequences_path = f"{self.main_dir}/Test/testsuperset.fasta"
        self.ia_path = f"{self.main_dir}/IA.tsv"

        # ---- Model / training hyper-parameters ----
        self.num_labels = 500      # number of most frequent GO terms to predict
        self.n_epochs = 96
        self.batch_size = 96
        self.dropout_rate = 0.3
        self.patience = 15

        # ---- Learning-rate bounds ----
        self.lr = 4.75e-5
        self.min_lr = 9.95e-8

        # ---- Dynamic learning-rate options ----
        # lr_scheduler_type is one of 'exponential', 'cosine' or 'plateau'.
        self.use_dynamic_lr = True
        self.lr_scheduler_type = "exponential"
        scheduler_options = {
            "decay_rate": 0.991,
            "decay_steps": 2,
            "min_lr": self.min_lr,
            "T_max": 10,    # used by the cosine schedule only
            "factor": 0.5,  # used by the plateau schedule only
        }
        self.lr_params = scheduler_options

        # ---- Compute device: GPU when available, otherwise CPU ----
        if torch.cuda.is_available():
            device_name = 'cuda'
        else:
            device_name = 'cpu'
        self.device = torch.device(device_name)

        # ---- Pre-computed embeddings ----
        # Embedding source name -> Kaggle dataset directory holding its files.
        dataset_dirs = {
            "ESM2": "cafa-5-ems-2-embeddings-numpy",
            "ProtBERT": "protbert-embeddings-for-cafa5",
            "T5": "t5embeds"
        }
        self.embeds_map = dataset_dirs

        # Embedding source name -> dimensionality of one embedding vector.
        vector_sizes = {
            "ESM2": 1280,
            "ProtBERT": 1024,
            "T5": 1024
        }
        self.embeds_dim = vector_sizes

In [None]:
# Create the single shared configuration object and report the selected device.
config = Config()
print(f"Using device: {config.device}")

## Dataset loader

In [None]:
#dataset builder
class ProteinDataset(Dataset):
    """Dataset of pre-computed protein embeddings and, for training data, GO label vectors.

    Args:
        datatype: File-name prefix of the embedding files (e.g. ``"train"`` or
            ``"test"``). Label vectors are loaded only when it is ``"train"``.
        embeddings_source: Key of ``config.embeds_map`` selecting the embedding
            dataset (``"ESM2"``, ``"ProtBERT"`` or ``"T5"``).
        config: Settings object; ``embeds_map``, ``num_labels`` and
            ``train_labels_path`` are read from it.

    Attributes:
        df: DataFrame with one row per protein and the columns ``EntryID``,
            ``embed`` and (training data only) ``labels_vect``.
        top_terms: The ``num_labels`` most frequent GO terms; set only when the
            labels are built from the TSV file.

    Each item is ``(embedding, labels)`` for training data and
    ``(embedding, EntryID)`` otherwise.
    """

    # Middle part of the embedding file name used by each embedding dataset:
    # the file is "<datatype>_<suffix>.npy".
    _EMBED_FILE_SUFFIX = {
        "ESM2": "embeddings",
        "ProtBERT": "embeddings",
        "T5": "embeds",
    }

    def __init__(self, datatype, embeddings_source, config):
        super().__init__()
        self.datatype = datatype
        self.config = config
        self.embeddings_source = embeddings_source

        # Load embeddings
        self._load_embeddings()

        # Load labels for training data
        if self.datatype == "train":
            self._load_labels()

    def _load_embeddings(self):
        """Load the pre-computed embeddings and their protein ids into ``self.df``.

        Raises:
            KeyError: If the source is missing from ``config.embeds_map``.
            ValueError: If the source has no known file layout, or the id and
                embedding arrays differ in length.
        """
        embed_dir = f"{main_path}/{self.config.embeds_map[self.embeddings_source]}"

        suffix = self._EMBED_FILE_SUFFIX.get(self.embeddings_source)
        if suffix is None:
            # Fail with a clear message instead of hitting unbound locals below.
            raise ValueError(
                f"Unsupported embeddings source: {self.embeddings_source!r}"
            )

        embeds = np.load(f"{embed_dir}/{self.datatype}_{suffix}.npy")
        ids = np.load(f"{embed_dir}/{self.datatype}_ids.npy")
        if len(ids) != len(embeds):
            raise ValueError(
                f"Got {len(ids)} ids but {len(embeds)} embeddings in {embed_dir}"
            )

        # One row per protein; each "embed" cell holds that protein's vector.
        self.df = pd.DataFrame({
            "EntryID": ids,
            "embed": list(embeds)
        })

    def _load_labels(self):
        """Attach a ``labels_vect`` column to ``self.df``.

        A pre-computed label matrix is used when its file exists; otherwise the
        vectors are built from the training-terms TSV.

        Raises:
            ValueError: If the pre-computed matrix does not have one row per
                embedding.
        """
        # Load pre-processed top labels if available
        label_file = f"/kaggle/input/train-targets-top{self.config.num_labels}/train_targets_top{self.config.num_labels}.npy"

        if os.path.exists(label_file):
            np_labels = np.load(label_file)
            # The matrix carries no ids, so rows are matched to the embeddings
            # purely by position (as the original construction already did).
            if len(np_labels) != len(self.df):
                raise ValueError(
                    f"Label matrix has {len(np_labels)} rows but there are "
                    f"{len(self.df)} embeddings"
                )
            # Assign directly: merging the frame with itself on EntryID would
            # duplicate rows whenever an id occurs more than once.
            self.df["labels_vect"] = list(np_labels)
        else:
            # Process labels from scratch
            self._process_labels_from_tsv()

    def _process_labels_from_tsv(self):
        """Build multi-hot label vectors over the most frequent GO terms.

        Sets ``self.top_terms`` (most frequent first) and the ``labels_vect``
        column, where position ``i`` is 1 when the protein is annotated with
        ``self.top_terms[i]`` and 0 otherwise. Proteins without any annotation
        in the file get an all-zero vector.
        """
        labels_df = pd.read_csv(self.config.train_labels_path, sep="\t", names=["EntryID", "term", "aspect"])

        # Get top terms
        term_counts = labels_df.groupby("term")["EntryID"].count().sort_values(ascending=False)
        self.top_terms = term_counts[:self.config.num_labels].index.tolist()

        # Index the annotations once (EntryID -> set of its top terms) instead
        # of filtering the whole annotation table again for every protein.
        top_rows = labels_df[labels_df["term"].isin(self.top_terms)]
        terms_by_entry = {}
        for entry_id, term in zip(top_rows["EntryID"], top_rows["term"]):
            terms_by_entry.setdefault(entry_id, set()).add(term)

        # Create label vectors
        no_terms = frozenset()
        label_vectors = []
        for entry_id in self.df['EntryID']:
            entry_terms = terms_by_entry.get(entry_id, no_terms)
            label_vectors.append([1 if term in entry_terms else 0 for term in self.top_terms])

        self.df['labels_vect'] = label_vectors

    def __len__(self):
        """Return the number of proteins."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(embedding, labels)`` for training data, else ``(embedding, EntryID)``."""
        row = self.df.iloc[idx]
        embed = torch.tensor(row["embed"], dtype=torch.float32)

        if self.datatype == "train":
            labels = torch.tensor(row["labels_vect"], dtype=torch.float32)
            return embed, labels
        return embed, row["EntryID"]


# Model Building and scheduler

## The LoRA schedule

In [None]:
# 1. Building Low-Rank Adapter Layer (LoRA)
class LoRALinear(nn.Module):
    """Linear layer with an additive low-rank (LoRA) correction.

    The output is ``linear(x) + scaling * B(A(dropout(x)))`` where ``A`` has
    shape ``(r, in_features)``, ``B`` has shape ``(out_features, r)`` and
    ``scaling = alpha / max(1, r)``. ``B`` starts at zero, so a freshly built
    layer behaves exactly like its inner ``nn.Linear``. With ``r <= 0`` no
    adapter is created and the layer is a plain linear layer.

    Args:
        in_features: Size of each input vector.
        out_features: Size of each output vector.
        r: Rank of the adapter; ``0`` (or less) disables it.
        alpha: Numerator of the scaling factor; ``None`` means "use ``r``".
        dropout: Dropout probability applied to the adapter input only.
        bias: Whether the inner linear layer has a bias term.
    """
    def __init__(self, in_features, out_features, r=6, alpha=6, dropout=0.035, bias=True):
        super().__init__()
        self.r = r
        self.alpha = r if alpha is None else alpha
        self.scaling = self.alpha / max(1, self.r)
        self.linear = nn.Linear(in_features, out_features, bias=bias)

        if dropout > 0:
            self.dropout = nn.Dropout(dropout)
        else:
            self.dropout = nn.Identity()

        if r > 0:
            # Low-rank factors: A gets small random values, B is all zeros.
            self.lora_A = nn.Parameter(0.01 * torch.randn(r, in_features))
            self.lora_B = nn.Parameter(torch.zeros(out_features, r))
        else:
            # Keep the attribute names present, but with no parameter behind them.
            for factor_name in ('lora_A', 'lora_B'):
                self.register_parameter(factor_name, None)

    def forward(self, x):
        """Return the linear output plus the scaled adapter output (if enabled)."""
        dense_out = self.linear(x)
        if not self.r > 0:
            return dense_out

        # x -> dropout -> A -> B; the result has the same shape as dense_out.
        adapter_out = F.linear(F.linear(self.dropout(x), self.lora_A), self.lora_B)
        return dense_out + adapter_out * self.scaling

In [None]:
# 2. LoRA-Enhanced Hybrid Model
class HybridModel(nn.Module):
    """Two-branch classifier (1-D CNN + bidirectional LSTM) with LoRA-wrapped dense layers.

    The input is a batch of embedding vectors of shape ``(B, input_dim)``.
    Both branches receive it with an extra axis inserted at position 1:
    the CNN treats that axis as a single channel over ``input_dim``
    positions, while the LSTM treats it as a sequence of length one whose
    features are the ``input_dim`` values. The two feature vectors are
    concatenated and passed through three ``LoRALinear`` layers; the model
    returns raw logits of shape ``(B, num_classes)``.

    Args:
        input_dim: Length of one embedding vector.
        num_classes: Number of output logits.
        dropout_rate: Dropout used between the LSTM layers and between the
            dense layers.
        lora_r, lora_alpha, lora_dropout: Passed to every ``LoRALinear``.
    """

    def __init__(self, input_dim, num_classes, dropout_rate=0.3, lora_r=6, lora_alpha=6, lora_dropout=0.035):
        super().__init__()
        adapter_cfg = dict(r=lora_r, alpha=lora_alpha, dropout=lora_dropout)

        # CNN branch: two convolutions, then pooling to a fixed length of 128.
        self.conv1 = nn.Conv1d(1, 32, kernel_size=5, padding=2)
        self.conv2 = nn.Conv1d(32, 64, kernel_size=3, padding=1)
        self.pool = nn.AdaptiveMaxPool1d(128)

        # LSTM branch: 2 layers, bidirectional, 256 hidden units per direction.
        self.lstm = nn.LSTM(
            input_dim,
            256,
            num_layers=2,
            bidirectional=True,
            dropout=dropout_rate,
            batch_first=True,
        )

        # Projection of the 512-wide LSTM output (2 directions x 256).
        self.lstm_proj = LoRALinear(512, 512, **adapter_cfg)

        # Fusion head: CNN features (64 channels x 128) + LSTM features (512).
        self.fc1 = LoRALinear(64 * 128 + 512, 512, **adapter_cfg)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = LoRALinear(512, 256, **adapter_cfg)
        self.fc3 = LoRALinear(256, num_classes, **adapter_cfg)

    def forward(self, x):
        """Map a ``(B, input_dim)`` batch to ``(B, num_classes)`` logits."""
        n_samples = x.shape[0]
        expanded = x.unsqueeze(1)  # (B, 1, input_dim), shared by both branches

        # CNN branch -> (B, 64 * 128)
        conv_feats = F.relu(self.conv1(expanded))
        conv_feats = self.pool(F.relu(self.conv2(conv_feats)))
        conv_feats = conv_feats.view(n_samples, -1)

        # LSTM branch -> (B, 512); the sequence has one step, so the last
        # step is also the only one.
        recurrent_out, _ = self.lstm(expanded)
        recurrent_feats = self.lstm_proj(recurrent_out[:, -1, :])

        # Fuse both branches and classify.
        fused = torch.cat([conv_feats, recurrent_feats], dim=1)
        hidden = self.dropout(F.relu(self.fc1(fused)))
        hidden = self.dropout(F.relu(self.fc2(hidden)))
        return self.fc3(hidden)

In [None]:
#initiating 'smooth loss'
class SmoothBCEWithLogitsLoss(nn.Module):
    """Binary cross-entropy on logits with label smoothing.

    Targets are pulled towards 0.5 before the loss is computed:
    ``t * (1 - smoothing) + 0.5 * smoothing``, i.e. a target of 1 becomes
    ``1 - smoothing / 2`` and a target of 0 becomes ``smoothing / 2``.

    Args:
        smoothing: Smoothing strength; ``0`` gives plain ``BCEWithLogitsLoss``.
    """
    def __init__(self, smoothing=0.0045):
        super().__init__()
        self.smoothing = smoothing
        self.loss = nn.BCEWithLogitsLoss()

    def forward(self, pred, target):
        """Return the BCE-with-logits loss of ``pred`` against the smoothed ``target``."""
        eps = self.smoothing
        softened = target * (1 - eps) + 0.5 * eps
        return self.loss(pred, softened)

In [None]:
#optional modificaiton in LoRA training
def mark_only_lora_trainable(model):
    """Freeze every parameter of ``model`` except the LoRA factors.

    All parameters first get ``requires_grad = False``; afterwards the
    ``lora_A`` / ``lora_B`` factors of every ``LoRALinear`` sub-module that
    has them are switched back on. The model is modified in place.
    """
    for weight in model.parameters():
        weight.requires_grad = False

    adapters = (layer for layer in model.modules() if isinstance(layer, LoRALinear))
    for adapter in adapters:
        # A factor is None when the layer was built with the adapter disabled.
        for factor in (adapter.lora_A, adapter.lora_B):
            if factor is not None:
                factor.requires_grad = True

In [None]:
#Gaining LoRA parameters
def get_lora_params(model):
    """Return a list of the parameters of ``model`` that currently require gradients.

    After ``mark_only_lora_trainable`` has been applied these are exactly the
    LoRA factors; the function itself only looks at ``requires_grad``.
    """
    return list(filter(lambda weight: weight.requires_grad, model.parameters()))

## Extra adjustments in schedule

In [None]:
# Making scheduler before training
def get_scheduler(optimizer, config):
    """Build a warm-up + cosine-decay learning-rate schedule.

    The returned ``LambdaLR`` scales the optimizer's base learning rate by a
    factor that depends on the number of ``scheduler.step()`` calls so far:

    * steps ``0 .. 7`` (warm-up): the factor rises linearly from ``1/8`` to
      ``1``. It starts at ``1/8`` rather than ``0`` because the factor for
      step 0 is what the optimizer uses until the first ``step()`` call; a
      factor of 0 would make that whole first period a no-op.
    * afterwards: cosine decay from ``1`` down to a floor of ``0.1``, which is
      reached after ``config.n_epochs`` steps and then held.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        config: Object providing ``n_epochs`` (total number of scheduler steps).

    Returns:
        torch.optim.lr_scheduler.LambdaLR: The configured scheduler.
    """
    warmup_steps = 8
    min_factor = 0.1
    # Guard against n_epochs <= warmup_steps (avoids division by zero).
    decay_steps = max(1, config.n_epochs - warmup_steps)

    def lr_lambda(current_step):
        if current_step < warmup_steps:
            return float(current_step + 1) / float(warmup_steps)
        # Clamp so the cosine does not rise again if stepped past n_epochs.
        progress = min(1.0, float(current_step - warmup_steps) / float(decay_steps))
        return max(min_factor, 0.5 * (1.0 + float(np.cos(np.pi * progress))))

    return LambdaLR(optimizer, lr_lambda)

# The **'Running phase'** in training and validation

In [None]:
# Run a garbage-collection pass before the training functions are defined and run.
gc.collect()

## running phase

In [None]:
#training phase
def train_epoch(model, dataloader, criterion, optimizer, device):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Module to train; it is switched to training mode.
        dataloader: Iterable of ``(embeddings, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batches are moved to.

    Returns:
        The unweighted mean of the per-batch loss values.
    """
    model.train()
    batch_losses = []

    for inputs, targets in tqdm(dataloader, desc="Training"):
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())

    return np.mean(batch_losses)

In [None]:
#validation
def validate_epoch(model, dataloader, criterion, metric, device):
    """Evaluate ``model`` on ``dataloader`` and return ``(mean_loss, score)``.

    Args:
        model: Module to evaluate; it is switched to eval mode.
        dataloader: Iterable of ``(embeddings, labels)`` batches.
        criterion: Loss called as ``criterion(logits, labels)``.
        metric: Either a stateful metric exposing ``reset()``, ``update()`` and
            ``compute()`` (e.g. a torchmetrics metric), or a plain callable
            ``metric(probabilities, int_labels)`` returning a scalar.
        device: Device the batches are moved to.

    Returns:
        tuple: The unweighted mean of the per-batch losses and the metric
        value. For a stateful metric the value is computed once over the
        whole loader, so a micro-averaged score is not distorted by a smaller
        final batch; for a plain callable it is the mean of the per-batch
        values. Both are ``nan`` when the loader yields no batch.
    """
    model.eval()
    losses = []
    batch_scores = []

    stateful = all(
        callable(getattr(metric, attr, None))
        for attr in ("reset", "update", "compute")
    )
    if stateful:
        # Drop anything accumulated by earlier calls (e.g. previous epochs).
        metric.reset()

    with torch.no_grad():
        for embeddings, labels in dataloader:
            embeddings, labels = embeddings.to(device), labels.to(device)

            outputs = model(embeddings)
            losses.append(criterion(outputs, labels).item())

            # The metric receives probabilities and integer targets.
            probs = torch.sigmoid(outputs)
            if stateful:
                metric.update(probs, labels.int())
            else:
                batch_scores.append(float(metric(probs, labels.int())))

    if not losses:
        # Empty loader: nothing to average.
        return float("nan"), float("nan")

    score = float(metric.compute()) if stateful else float(np.mean(batch_scores))
    return np.mean(losses), score

In [None]:
#running the training function with stabilizer
def train_model(embeddings_source="ESM2", model_type="hybrid", train_ratio= 0.8):
    """Train a ``HybridModel`` on pre-computed embeddings and return it with its history.

    The training set is split randomly into train/validation parts, all
    parameters except the LoRA factors are frozen, and the model is trained
    for ``config.n_epochs`` epochs with AdamW, gradient clipping and the
    warm-up/cosine schedule from ``get_scheduler``.

    Args:
        embeddings_source: Key of ``config.embeds_dim`` / ``config.embeds_map``.
        model_type: Only used in the progress message; a ``HybridModel`` is
            always built.
        train_ratio: Fraction of the dataset used for training; the rest is
            used for validation.

    Returns:
        tuple: ``(model, history)``. ``model`` carries the weights of the epoch
        with the highest validation F1 (the final weights if no epoch scored
        above 0). ``history`` has the per-epoch lists ``train_losses``,
        ``val_losses``, ``val_scores`` and ``lrs`` (the learning rate each
        epoch was trained with), plus ``best_score``.

    NOTE(review): the base weights are randomly initialised here and then
    frozen, so only the low-rank factors are ever trained -- confirm this is
    intended, since no pre-trained weights are loaded in this function.
    """
    print(f"\nTraining {model_type} model with {embeddings_source} embeddings...")

    # Dataset split
    dataset = ProteinDataset("train", embeddings_source, config)
    train_size = int(len(dataset) * train_ratio)
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_dataset, batch_size=config.batch_size, shuffle=False, num_workers=2)

    # Model
    input_dim = config.embeds_dim[embeddings_source]
    model = HybridModel(input_dim, config.num_labels, dropout_rate= 0.275, lora_r= 6, lora_alpha= 6, lora_dropout=0.1).to(config.device)

    # Freeze all except LoRA
    mark_only_lora_trainable(model)
    lora_params = get_lora_params(model)

    # Optimizer + Scheduler
    criterion = SmoothBCEWithLogitsLoss(smoothing= 0.0045)
    optimizer = torch.optim.AdamW(lora_params, lr=config.lr * 0.5, weight_decay= 4.4e-4)
    scheduler = get_scheduler(optimizer, config)
    metric = MultilabelF1Score(num_labels=config.num_labels, average='micro').to(config.device)

    best_val_score = 0
    best_model_state = None  # stays None if no epoch ever scores above 0
    train_losses, val_losses, val_scores, lrs = [], [], [], []

    # ---------- Training Loop ----------
    for epoch in range(config.n_epochs):
        model.train()
        epoch_losses = []
        grad_means = []

        # Read the LR before the scheduler advances, so the logged value is
        # the one this epoch is actually trained with.
        current_lr = optimizer.param_groups[0]["lr"]

        for embeddings, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{config.n_epochs}"):
            embeddings, labels = embeddings.to(config.device), labels.to(config.device)
            optimizer.zero_grad()
            outputs = model(embeddings)
            loss = criterion(outputs, labels)
            loss.backward()

            # Gradient clipping for stability
            torch.nn.utils.clip_grad_norm_(lora_params, max_norm=1.0)

            optimizer.step()
            epoch_losses.append(loss.item())

            # Track mean gradient magnitude (diagnostic, measured after clipping)
            with torch.no_grad():
                grads = [p.grad.abs().mean().item() for p in lora_params if p.grad is not None]
                if grads:
                    grad_means.append(np.mean(grads))

        train_loss = np.mean(epoch_losses)
        mean_grad = np.mean(grad_means) if grad_means else 0.0

        # ---------- Validation ----------
        val_loss, val_score = validate_epoch(model, val_loader, criterion, metric, config.device)

        # ---------- Scheduler step ----------
        scheduler.step()
        lrs.append(current_lr)

        # ---------- Logging ----------
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        val_scores.append(val_score)

        print(f"Epoch {epoch+1}/{config.n_epochs} | "
              f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | "
              f"Val F1: {val_score:.4f} | LR: {current_lr:.6e} | Mean Grad: {mean_grad:.6f}")

        # Save best model. The tensors must be cloned: state_dict() hands out
        # references to the live parameters, so a plain dict copy would keep
        # changing as training continues.
        if val_score > best_val_score:
            best_val_score = val_score
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }

    # Restore the best checkpoint so the returned model matches 'best_score'.
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    # ---------- Return ----------
    return model, {
        'train_losses': train_losses,
        'val_losses': val_losses,
        'val_scores': val_scores,
        'lrs': lrs,
        'best_score': best_val_score
    }

In [None]:
# Run a garbage-collection pass right before training starts.
gc.collect()

In [None]:
# Train the hybrid model on the ESM2 embeddings; keeps the trained model and
# the per-epoch history (losses, validation F1, learning rates, best score).
model, history = train_model(
    embeddings_source="ESM2",
    model_type="hybrid"
)

## training evaluation

In [None]:
#plot in training
def plot_training_curves(history):
    """Show the loss curves and the validation F1 curve side by side.

    Args:
        history: Mapping with the per-epoch lists ``train_losses``,
            ``val_losses`` and ``val_scores``. Epochs are numbered from 1
            on the x axis.
    """
    n_epochs = len(history['train_losses'])
    epochs = range(1, n_epochs + 1)

    plt.figure(figsize=(12, 5))

    # Left panel: training vs. validation loss.
    plt.subplot(1, 2, 1)
    loss_series = (
        ('train_losses', 'Train Loss', 'black'),
        ('val_losses', 'Validation Loss', 'red'),
    )
    for key, legend_text, line_color in loss_series:
        plt.plot(epochs, history[key], label=legend_text, color=line_color)
    plt.title('Training & Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    # Right panel: validation F1.
    plt.subplot(1, 2, 2)
    f1_values = history['val_scores']
    plt.plot(epochs, f1_values, label='Validation F1', color='green')
    plt.title('Validation F1 Score')
    plt.xlabel('Epoch')
    plt.ylabel('F1 Score')
    plt.legend()

    plt.show()

In [None]:
#plot in learning curve
def plot_learning_rate(history):
    """Show the recorded learning rates on a logarithmic y axis.

    Args:
        history: Mapping whose ``lrs`` entry holds one learning rate per epoch;
            the x axis is the position in that list (starting at 0).
    """
    lr_values = history['lrs']

    plt.figure(figsize=(6,4))
    plt.plot(lr_values, marker='o', color='purple')

    # Title and axis labels.
    plt.title('Learning Rate over Epochs')
    plt.xlabel('Epoch')
    plt.ylabel('LR')

    plt.yscale('log')
    plt.show()

In [None]:
# Plot the training/validation loss and the validation F1 of the finished run.
plot_training_curves(history)

In [None]:
# Plot the learning rate recorded for each epoch of the finished run.
plot_learning_rate(history)

In [None]:
# Report the best validation F1 reached during training.
print(f"Best Validation F1 Score: {history['best_score']:.4f}")

## Prediction on the test set

In [None]:
#function to predict actual results
def predict(model, embeddings_source="ESM2", batch_size=None, threshold=0.05, max_confidence=0.95):
    """Generate GO-term predictions for the test set.

    Args:
        model: Trained model mapping a batch of embeddings to one logit per
            label.
        embeddings_source: Embedding dataset to load the test embeddings from.
        batch_size: Number of proteins scored per forward pass; defaults to
            ``config.batch_size``.
        threshold: Only probabilities strictly above this value are reported.
        max_confidence: Upper cap applied to the reported confidences.

    Returns:
        pandas.DataFrame: Columns ``Id``, ``GO term`` and ``Confidence``, one
        row per (protein, term) pair above the threshold, ordered by protein
        and then by label index. The columns are present even when no
        prediction passes the threshold.

    NOTE(review): the label names are recomputed here from the training TSV
    and are assumed to be in the same order as the label columns used during
    training -- verify this when the labels came from a pre-computed matrix.
    """

    print("\nGenerating predictions...")

    # Load test dataset
    test_dataset = ProteinDataset("test", embeddings_source, config)
    if batch_size is None:
        batch_size = config.batch_size
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Get label names
    labels_df = pd.read_csv(config.train_labels_path, sep="\t", names=["EntryID", "term", "aspect"])
    top_terms = labels_df.groupby("term")["EntryID"].count().sort_values(ascending=False)
    label_names = np.asarray(top_terms[:config.num_labels].index.tolist(), dtype=object)

    # Generate predictions
    model.eval()
    id_chunks, term_chunks, conf_chunks = [], [], []

    with torch.no_grad():
        for embeddings, protein_ids in tqdm(test_loader, desc="Predicting"):
            embeddings = embeddings.to(config.device)
            probs = torch.sigmoid(model(embeddings)).cpu().numpy()
            # Always work on a (batch, labels) matrix, also for a batch of one.
            probs = probs.reshape(len(protein_ids), -1)

            # Positions above the threshold, in row-major order
            # (protein first, then label index).
            rows, cols = np.nonzero(probs > threshold)
            if rows.size == 0:
                continue

            batch_ids = np.asarray(protein_ids, dtype=object)
            id_chunks.append(batch_ids[rows])
            term_chunks.append(label_names[cols])
            # Convert to float64 before capping so capped values are exactly
            # max_confidence.
            conf_chunks.append(
                np.minimum(probs[rows, cols].astype(np.float64), max_confidence)
            )

    columns = ['Id', 'GO term', 'Confidence']
    if not id_chunks:
        # Keep the expected columns so later merges on them still work.
        return pd.DataFrame(columns=columns)

    return pd.DataFrame({
        'Id': np.concatenate(id_chunks),
        'GO term': np.concatenate(term_chunks),
        'Confidence': np.concatenate(conf_chunks),
    }, columns=columns)

In [None]:
# Score the test set with the trained model (ESM2 embeddings).
preds = predict(model, embeddings_source="ESM2")

# Data Submission

In [None]:
# Path of an existing submission file that the new predictions are combined with.
existing_pred_path = f"{main_path}/blast-quick-sprof-zero-pred/submission.tsv"

In [None]:
# Load the existing submission (tab-separated, no header row) with the same
# column names as the new predictions.
existing = pd.read_csv(existing_pred_path, sep='\t', header=None, names=['Id', 'GO term', 'Confidence'])

In [None]:
# Outer-join both prediction sets on (Id, GO term) so pairs present in only one
# of them are kept; the two confidence columns get the suffixes below.
merged = pd.merge(existing, preds, on=['Id', 'GO term'], how='outer', suffixes=('_existing', '_new'))

In [None]:
# Run a garbage-collection pass after the merge.
gc.collect()

In [None]:
# Combine confidences: take the row-wise maximum of the existing and the new value.
merged['Confidence'] = merged[['Confidence_existing', 'Confidence_new']].max(axis=1)

In [None]:
# Keep only the three submission columns (as an independent copy).
final_predictions = merged[['Id', 'GO term', 'Confidence']].copy()

In [None]:
# Saving submission
print("\nSaving submission file...")
final_predictions.to_csv('submission.tsv', sep='\t', header=False, index=False)
print(f"Submission saved with {len(final_predictions)} predictions")