In [None]:
# ============================================================
# 0. Install libraries
# ============================================================
# 1) remove pre-installed packages
!pip uninstall -y transformers accelerate tensorflow tensorflow-text tf-keras

# 2) reinstall packages
!pip install -q "numpy==1.26.4" "torch==2.3.1" \
  "transformers==4.44.0" "accelerate==0.33.0" \
  "peft==0.11.1" "datasets==2.21.0" "scikit-learn"

In [None]:
# ============================================================
# 1. Imports & basic setup
# ============================================================
import os
import random
import numpy as np
from tqdm.auto import tqdm

# matplot
import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

from peft import LoraConfig, get_peft_model, TaskType

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split

import pandas as pd

In [None]:
# Set random seeds for reproducibility
def set_seed(seed: int = 42):
    """Seed Python's `random`, NumPy and PyTorch (CPU and all CUDA devices).

    Args:
        seed: Integer seed handed unchanged to every generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    # Same order as always: stdlib, NumPy, torch CPU, torch CUDA.
    for apply_seed in seeders:
        apply_seed(seed)

# Seed every RNG once up front, then choose the compute device.
set_seed(42)

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Device: {device}")

In [None]:
# ============================================================
# 2. Upload & unzip VF_sequences.zip (train/test FASTA files)
#    Expected filenames:
#      - train_positive.fasta
#      - train_negative.fasta
#      - test_positive.fasta
#      - test_negative.fasta
# ============================================================
from google.colab import files
import zipfile

print("Please upload VF_sequences.zip (train/test positive/negative FASTA files).")
uploaded = files.upload()

# The archive is located by extension, so it does not have to be named
# exactly VF_sequences.zip. Fail with a clear message if no zip was uploaded
# instead of an opaque IndexError.
zip_candidates = [name for name in uploaded.keys() if name.endswith(".zip")]
if not zip_candidates:
    raise FileNotFoundError(
        "No .zip file was uploaded; please upload VF_sequences.zip."
    )
zip_name = zip_candidates[0]
print("Unzipping:", zip_name)

# Extract with the standard library rather than `!unzip -o $zip_name`:
# the unquoted shell interpolation breaks on file names that contain spaces
# or parentheses (e.g. "VF_sequences (1).zip"). Existing files are
# overwritten, matching `unzip -o`.
with zipfile.ZipFile(zip_name) as archive:
    archive.extractall(".")

print("Files in current directory:")
print(os.listdir("."))

In [None]:
# ============================================================
# 3. FASTA parser
# ============================================================
def read_fasta_with_label(path, label):
    """
    Parse a FASTA file into labelled records.

    Lines are stripped and blank lines ignored. A line starting with ">"
    closes the record being built; every other line is appended to the
    current sequence. Headers that are not followed by any sequence line
    produce no record.

    Args:
        path: Path of the FASTA file to read.
        label: Value stored under "label" in every returned record.

    Returns:
        A list of dicts: [{"seq": sequence_string, "label": label}, ...]
    """
    records = []
    chunks = []

    def flush():
        # Emit the sequence accumulated so far (if any) and start a new one.
        if chunks:
            records.append({"seq": "".join(chunks), "label": label})
            chunks.clear()

    with open(path, "r") as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if text == "":
                continue
            if text[0] == ">":
                flush()
            else:
                chunks.append(text)

    # The final record has no trailing header to close it.
    flush()
    return records

# Expected FASTA files, one per (split, class) pair.
train_pos_path = "train_positive.fasta"
train_neg_path = "train_negative.fasta"
test_pos_path  = "test_positive.fasta"
test_neg_path  = "test_negative.fasta"

# Fail fast, in the same order as the paths above, if a file is missing.
for fasta_path in (train_pos_path, train_neg_path, test_pos_path, test_neg_path):
    assert os.path.exists(fasta_path), f"{fasta_path} not found!"

# Positive files are labelled 1, negative files 0.
train_pos = read_fasta_with_label(train_pos_path, label=1)
train_neg = read_fasta_with_label(train_neg_path, label=0)
test_pos  = read_fasta_with_label(test_pos_path, label=1)
test_neg  = read_fasta_with_label(test_neg_path, label=0)

# Positives first, then negatives, within each split.
train_data = [*train_pos, *train_neg]
test_data  = [*test_pos, *test_neg]

print(f"#Train sequences: {len(train_data)} (pos: {len(train_pos)}, neg: {len(train_neg)})")
print(f"#Test sequences:  {len(test_data)} (pos: {len(test_pos)}, neg: {len(test_neg)})")

# Hold out 20% of the training records for validation, stratified by label.
train_labels = [example["label"] for example in train_data]
train_list, valid_list = train_test_split(
    train_data,
    test_size=0.2,
    random_state=42,
    stratify=train_labels,
)

print(f"Train: {len(train_list)}, Valid: {len(valid_list)}, Test: {len(test_data)}")


In [None]:
# ============================================================
# 4. Dataset & Tokenizer
#    Using a small ESM2 model as pre-trained protein LM
# ============================================================
# Sequences longer than this many tokens are truncated by the dataset below.
MAX_LENGTH = 512

# Small ESM2 checkpoint, chosen to stay Colab-friendly.
MODEL_NAME = "facebook/esm2_t6_8M_UR50D"

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    do_lower_case=False,
)

class FastaSequenceDataset(Dataset):
    """Map-style dataset that tokenizes one labelled sequence per item.

    Each element of `examples` is indexed with the keys "seq" and "label".
    Items come back as a dict with "input_ids", "attention_mask" and
    "labels" tensors.
    """

    def __init__(self, examples, tokenizer, max_length=512):
        self.examples, self.tokenizer, self.max_length = (
            examples,
            tokenizer,
            max_length,
        )

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        record = self.examples[idx]
        sequence, target = record["seq"], record["label"]

        # The plain sequence string is passed straight to the tokenizer,
        # truncated and padded to exactly `max_length`.
        batch = self.tokenizer(
            sequence,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        # Drop the leading batch dimension added by return_tensors="pt".
        sample = {
            key: batch[key].squeeze(0)
            for key in ("input_ids", "attention_mask")
        }
        sample["labels"] = torch.tensor(target, dtype=torch.long)
        return sample

# One dataset per split, all sharing the same tokenizer and length limit.
train_dataset, valid_dataset, test_dataset = (
    FastaSequenceDataset(split_examples, tokenizer, MAX_LENGTH)
    for split_examples in (train_list, valid_list, test_data)
)

In [None]:
# ============================================================
# 5. Metrics function (accuracy, precision, recall, F1)
# ============================================================
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 from logits and labels.

    Args:
        eval_pred: Unpacks into `(logits, labels)`; the predicted class is
            the argmax of the logits over the last axis.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
        Precision/recall/F1 use `average="binary"`.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    scores = {"accuracy": accuracy_score(labels, predicted)}
    prf_support = precision_recall_fscore_support(
        labels, predicted, average="binary"
    )
    # The fourth element (support) is not reported.
    scores.update(zip(("precision", "recall", "f1"), prf_support[:3]))
    return scores


In [None]:
# ============================================================
# 6. Helper: Train & evaluate a model (base or LoRA)
# ============================================================
def train_and_evaluate_model(
    model_name,
    output_dir,
    num_labels=2,
    lora_config: LoraConfig = None,
    num_train_epochs=3,
    learning_rate=5e-4,
):
    """
    Fine-tune a sequence-classification model and evaluate it.

    Relies on the module-level `train_dataset`, `valid_dataset`,
    `test_dataset`, `tokenizer`, `compute_metrics` and `set_seed`.

    Args:
        model_name: HF model name passed to `from_pretrained`.
        output_dir: Directory handed to `TrainingArguments` (checkpoints).
        num_labels: Number of output classes of the classification head.
        lora_config: If not None, LoRA is applied on top of the base model;
            otherwise the base model is trained as loaded.
        num_train_epochs: Number of training epochs.
        learning_rate: Learning rate passed to `TrainingArguments`.

    Returns:
        `(val_metrics, test_metrics)`: the dicts returned by
        `trainer.evaluate` on the validation and test datasets.
    """

    # Re-seed so that every call starts from the same RNG state.
    set_seed(42)

    base_model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        ignore_mismatched_sizes=True,  # needed when adding classification head
    )

    if lora_config is not None:
        model = get_peft_model(base_model, lora_config)
        print("Using LoRA. Trainable parameters:")
        model.print_trainable_parameters()
    else:
        model = base_model
        total_params = sum(p.numel() for p in model.parameters())
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f"Base model total params: {total_params:,}, trainable: {trainable_params:,}")

    training_args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=learning_rate,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=num_train_epochs,
        weight_decay=0.01,
        # `evaluation_strategy` is deprecated in the pinned transformers
        # release (4.44.0); `eval_strategy` is its replacement.
        eval_strategy="epoch",
        # Must match the evaluation cadence for load_best_model_at_end.
        save_strategy="epoch",
        logging_steps=50,
        load_best_model_at_end=True,
        # "f1" is the key produced by compute_metrics.
        metric_for_best_model="f1",
        greater_is_better=True,
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    print("Evaluating on validation set...")
    val_metrics = trainer.evaluate(eval_dataset=valid_dataset)
    print("Validation metrics:", val_metrics)

    print("Evaluating on test set...")
    test_metrics = trainer.evaluate(eval_dataset=test_dataset)
    print("Test metrics:", test_metrics)

    return val_metrics, test_metrics

In [None]:
# ============================================================
# 7. Base model (full fine-tuning) vs LoRA model (single r value)
# ============================================================

# ---- 7-1. Base model (no LoRA, full fine-tuning) ----
print("========== BASE MODEL (full fine-tuning) ==========")
# Full fine-tuning: no LoRA config, lower learning rate than the LoRA run.
base_val_metrics, base_test_metrics = train_and_evaluate_model(
    MODEL_NAME,
    "./results_base_full_ft",
    learning_rate=5e-5,
    num_train_epochs=3,
)

# ---- 7-2. LoRA model with a single r (e.g., r = 8) ----
print("\n========== LoRA MODEL (r = 8) ==========")
# Adapters are attached to the modules named query/key/value.
lora_config_r8 = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["query", "key", "value"],
    bias="none",
    task_type=TaskType.SEQ_CLS,
)

lora_val_r8, lora_test_r8 = train_and_evaluate_model(
    MODEL_NAME,
    "./results_lora_r8",
    lora_config=lora_config_r8,
    learning_rate=5e-4,
    num_train_epochs=3,
)

In [None]:
# ============================================================
# 8. LoRA rank r = 3~10 performance comparison
# ============================================================

# One summary row per LoRA rank; turned into a DataFrame for plotting later.
result_list = []

print("========== LORA MODEL ==========")

for r in range(3, 11):

    print(f"current r = {r}")

    # Same LoRA settings for every run; only the rank changes.
    lora_config_r = LoraConfig(
        r=r,
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=["query", "key", "value"],
        bias="none",
        task_type=TaskType.SEQ_CLS,
    )

    lora_val_r, lora_test_r = train_and_evaluate_model(
        MODEL_NAME,
        f"./results_lora_{r}",
        lora_config=lora_config_r,
        learning_rate=5e-4,
        num_train_epochs=3,
    )

    # Example output of trainer.evaluate():
    #   {
    #       'eval_loss': 0.1234,
    #       'eval_accuracy': 0.8578,
    #       'eval_f1': 0.8997,
    #       'eval_runtime': 1.23,
    #       'eval_samples_per_second': 45.6,
    #       'eval_steps_per_second': 5.4,
    #       'epoch': 1.0
    #   }
    # Only loss, accuracy and F1 are kept, for both validation and test.
    result_list.append({
        "LoRa_rank": r,
        **{
            f"validation_{name}": lora_val_r[f"eval_{name}"]
            for name in ("loss", "accuracy", "f1")
        },
        **{
            f"test_{name}": lora_test_r[f"eval_{name}"]
            for name in ("loss", "accuracy", "f1")
        },
    })
    

# ============================================================
# 9. Result summary
# ===========================================================

# Task 1

# Print accuracy and F1 (loss is intentionally left out) for the base model
# and the r = 8 LoRA model, each followed by a separator line.
for heading, val_metrics, test_metrics in (
    ("1. Base Model", base_val_metrics, base_test_metrics),
    ("2. LoRa, r = 8", lora_val_r8, lora_test_r8),
):
    print(heading)
    print("")

    print(f"Validation accuracy: {val_metrics['eval_accuracy']}")
    print(f"Validation f1: {val_metrics['eval_f1']}")
    print(f"Test accuracy: {test_metrics['eval_accuracy']}")
    print(f"Test f1: {test_metrics['eval_f1']}")

    print("=" * 50)

# Task 2

# One row per LoRA rank, built from the dicts collected in `result_list`.
LoRA_pd = pd.DataFrame(result_list)

# The rows store the rank under the "LoRa_rank" key, so that is the column
# to plot against; LoRA_pd['rank'] does not exist and raises KeyError.
ranks = LoRA_pd['LoRa_rank']

figure, (axis_1, axis_2) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: accuracy on validation and test versus rank.
axis_1.plot(ranks, LoRA_pd['validation_accuracy'], marker='o', label = 'validation_acc')
axis_1.plot(ranks, LoRA_pd['test_accuracy'], marker='o', label = 'test_acc')

axis_1.set_title('Accuracy vs. LoRA rank')
axis_1.set_xlabel('LoRA Rank r')
axis_1.set_ylabel('accuracy')
# Without legend() the line labels above are never shown.
axis_1.legend()

# Right panel: F1 on validation and test versus rank.
axis_2.plot(ranks, LoRA_pd['validation_f1'], marker='o', label = 'validation_f1')
axis_2.plot(ranks, LoRA_pd['test_f1'], marker='o', label = 'test_f1')

axis_2.set_title('F1 vs. LoRA rank')
axis_2.set_xlabel('LoRA Rank r')
axis_2.set_ylabel('f1_score')
axis_2.legend()

# Keep the two panels' labels from overlapping.
figure.tight_layout()
plt.show()
