<a href="https://colab.research.google.com/github/Tar-ive/protein-DL/blob/main/amino_acid_hack_nation_ai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
 import kagglehub

# Download latest version
path = kagglehub.dataset_download("googleai/pfam-seed-random-split")

print("Path to dataset files:", path)

In [None]:
import os

In [None]:
import os

# Print the top-level entries of the dataset directory and, for each entry
# that is itself a directory, the names it contains (one level deep only).
print("Contents of dataset directory:")
for entry in os.listdir(path):
    print(f"  {entry}")
    entry_path = os.path.join(path, entry)
    if not os.path.isdir(entry_path):
        continue
    print(f"    Contents of {entry}:")
    for child in os.listdir(entry_path):
        print(f"      {child}")

In [None]:
# Directory from which the 'train', 'dev' and 'test' shard folders are read below.
inner_path = os.path.join(path, 'random_split', 'random_split')


In [None]:
def read_data_from_sharded_files(subdir_name, base_path):
    """Read every ``data-*`` shard in a subdirectory into one DataFrame.

    Each shard is first parsed as Parquet; if that fails for any reason it is
    parsed as CSV instead. A shard that cannot be read either way is reported
    on stdout and skipped.

    Args:
        subdir_name: Name of the directory below ``base_path`` holding the shards.
        base_path: Directory that contains ``subdir_name``.

    Returns:
        The readable shards concatenated in sorted file-name order with a
        fresh 0..n-1 index, or ``None`` when no shard could be read.
    """
    dir_path = os.path.join(base_path, subdir_name)

    # Sorted so the row order of the result does not depend on listdir order.
    shard_names = sorted(f for f in os.listdir(dir_path) if f.startswith('data-'))

    data_frames = []
    for shard_name in shard_names:
        file_path = os.path.join(dir_path, shard_name)
        try:
            # Try reading as parquet first.
            df = pd.read_parquet(file_path)
        except Exception:
            # `Exception` instead of a bare `except` so that KeyboardInterrupt
            # and SystemExit still abort a long-running load.
            try:
                df = pd.read_csv(file_path)
            except Exception as e:
                print(f"Could not read {shard_name}: {e}")
                continue
        data_frames.append(df)

    if not data_frames:
        print(f"No readable files found in {dir_path}")
        return None
    return pd.concat(data_frames, ignore_index=True)

# Use the new function to load your data
# Load the three splits from their shard directories.
train = read_data_from_sharded_files('train', inner_path)
dev = read_data_from_sharded_files('dev', inner_path)
test = read_data_from_sharded_files('test', inner_path)

# Report each split's shape, or a failure marker when the loader returned None.
for split_label, split_frame in (('Train', train), ('Dev', dev), ('Test', test)):
    shape_text = split_frame.shape if split_frame is not None else 'Failed to load'
    print(f"{split_label} shape: {shape_text}")

In [None]:
# Preview the first rows of the training split.
train.head()


In [None]:
# (rows, columns) of the training split.
train.shape

In [None]:
# (rows, columns) of the dev split.
dev.shape

In [None]:
# (rows, columns) of the test split.
test.shape

Looking at families in the training data

In [None]:
# Map each split name to its DataFrame so later cells can loop over all three.
partitions = {'train': train, 'dev': dev, 'test': test}


In [None]:
def get_information(partitions):
    """Summarise sample and family counts for each partition.

    Args:
        partitions: Mapping of partition name to a DataFrame that has a
            ``family_accession`` column.

    Returns:
        A DataFrame with one row per partition (in mapping order) and the
        columns ``partition``, ``nb_samples``, ``nb_families``,
        ``min_samples_per_fam``, ``max_samples_per_fam`` and
        ``mean_samples_per_fam``. An empty mapping yields an empty frame
        with those columns.
    """
    columns = ['partition', 'nb_samples', 'nb_families', 'min_samples_per_fam', 'max_samples_per_fam', 'mean_samples_per_fam']
    rows = []
    for name, df in partitions.items():
        # Group once and reuse the per-family sizes for all three statistics.
        family_sizes = df.groupby('family_accession').size()
        rows.append({
            'partition': name,
            'nb_samples': len(df),
            'nb_families': df['family_accession'].unique().size,
            'max_samples_per_fam': family_sizes.max(),
            'min_samples_per_fam': family_sizes.min(),
            'mean_samples_per_fam': family_sizes.mean(),
        })
    # Build the frame in one step: growing it with pd.concat from an empty
    # DataFrame copies all earlier rows each time and relies on concat
    # behaviour with empty frames that recent pandas versions deprecate.
    return pd.DataFrame(rows, columns=columns)

# Per-partition summary table for the three splits (the cell's output value).
get_information(partitions)

In [None]:
# Distinct family accessions present in each split.
train_families = set(train['family_accession'].unique())
dev_families = set(dev['family_accession'].unique())
test_families = set(test['family_accession'].unique())
same_dev_test = dev_families == test_families
print('Are the families of the dev set and the test set the same ?', same_dev_test)

# Families that occur in all three splits.
common_families = train_families.intersection(dev_families, test_families)
print('Number of common families in all sets : ', len(common_families))

Excluding the families that are only in train but not in dev and test


In [None]:
# Keep only the training rows whose family is in `common_families`, and make
# the partitions mapping point at the filtered frame.
in_common = train['family_accession'].isin(common_families)
train = train[in_common]
partitions['train'] = train

print('Updated info on the datasets')
get_information(partitions)

In [None]:
# One histogram of family sizes per partition, drawn side by side.
plt.figure(figsize = (30, 10))
plt.suptitle('Distribution of family sizes', fontsize=18, y=0.95)
colors = ['tab:blue', 'tab:orange', 'tab:green']

for n, (name, df) in enumerate(partitions.items()):
    # Create the subplot
    ax = plt.subplot(1, 3, n + 1)
    ax.set_title(name)
    ax.set_xlabel("Family size")
    ax.set_ylabel("Number of families")

    # Plot data. Families are grouped by 'family_accession', the same key the
    # rest of the notebook uses to count and filter families (this cell
    # previously grouped by 'family_id').
    df.groupby('family_accession').size().hist(bins=100, ax=ax, color=colors[n])


# Fine-tuning Environment Setup

In [None]:
!pip install transformers[torch] datasets evaluate scikit-learn


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset
import matplotlib.pyplot as plt

In [None]:
import torch

# Report whether CUDA is usable and, if so, the name of device 0.
cuda_ready = torch.cuda.is_available()
print(f"GPU available: {cuda_ready}")
if cuda_ready:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU name: {gpu_name}")

In [None]:
# You already have this data loaded, so let's just verify it
# The splits were loaded in earlier cells; print their shapes as a sanity check.
print("Data shapes:")
for split_label, split_frame in (("Train", train), ("Dev", dev), ("Test", test)):
    print(f"{split_label}: {split_frame.shape}")

# Check the column names
train_columns = train.columns.tolist()
print(f"\nTrain columns: {train_columns}")

In [None]:
# Combine all datasets for sampling strategy
# Pool the three splits into one frame; the sampling cells below draw from it.
all_data = pd.concat([train, dev, test], ignore_index=True)
print(f"Total dataset size: {all_data.shape}")

# Number of rows per family across the pooled data.
family_counts = all_data['family_accession'].value_counts()
n_families = len(family_counts)
print(f"Number of unique families: {n_families}")
print("Most common families:")
print(family_counts.head(10))

To speed things up, I selected a smaller, representative sample. A good starting point was to take the 1,000 most frequent families and then up to 100 examples from each of them. This gives a balanced and manageable dataset of around 100,000 sequences.

In [None]:
# Get top 1000 most frequent families
# The first 1000 entries of `family_counts` are the selected families.
top_1000_families = family_counts.index[:1000].tolist()
print(f"Selected top {len(top_1000_families)} families")

# Keep only the rows that belong to one of the selected families.
keep_mask = all_data['family_accession'].isin(top_1000_families)
filtered_data = all_data[keep_mask]
print(f"Filtered dataset size: {filtered_data.shape}")

In [None]:
# Sample up to 100 sequences per family for balanced training
# Sample up to 100 sequences per family for balanced training.
# The frame is grouped once; get_group() then hands back one family's rows
# without re-scanning every row of `filtered_data` for each selected family.
rows_by_family = filtered_data.groupby('family_accession', sort=False)

sampled_data = []
for family in top_1000_families:
    family_data = rows_by_family.get_group(family)
    # Sample up to 100, or all if less than 100
    sample_size = min(100, len(family_data))
    # Fixed seed so the same rows are drawn on every run.
    sampled_family = family_data.sample(n=sample_size, random_state=42)
    sampled_data.append(sampled_family)

# Combine all sampled data, in the order of `top_1000_families`.
balanced_dataset = pd.concat(sampled_data, ignore_index=True)
print(f"Balanced dataset size: {balanced_dataset.shape}")
print(f"Average samples per family: {len(balanced_dataset) / len(top_1000_families):.1f}")

In [None]:
# Extract sequences and labels
# Pull the model inputs (sequences) and targets (family accessions) out of the
# balanced frame as plain Python lists.
sequences = balanced_dataset['sequence'].tolist()
family_labels = balanced_dataset['family_accession'].tolist()

print(f"Number of sequences: {len(sequences)}")
print(f"Number of labels: {len(family_labels)}")

# Show the first sequence as an example (only its first 50 characters).
first_sequence = sequences[0]
print(f"Example sequence length: {len(first_sequence)}")
print(f"Example sequence: {first_sequence[:50]}...")

In [None]:
# Convert family accession strings to numbers
# Convert family accession strings to integer class ids.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(family_labels)

print("Label encoding complete!")
print(f"Number of unique labels: {len(label_encoder.classes_)}")
print("Example mappings:")
# Slicing caps the preview at five pairs and, unlike indexing 0..4 directly,
# cannot raise IndexError when fewer than five labels exist.
for raw_label, label_id in zip(family_labels[:5], encoded_labels[:5]):
    print(f"  {raw_label} -> {label_id}")

In [None]:
# Load the ESM-2 tokenizer
# Checkpoint id shared by the tokenizer here and the model loaded further down.
model_checkpoint = "facebook/esm2_t12_35M_UR50D"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

print(f"Tokenizer loaded: {model_checkpoint}")
print(f"Vocabulary size: {tokenizer.vocab_size}")

In [None]:
# Tokenize all sequences (this might take a few minutes)
# Tokenize every sampled sequence in one call, truncating to 512 tokens and
# padding to a common length so the result can be returned as PyTorch tensors.
# NOTE(review): no later cell in the visible notebook reads
# `tokenized_sequences`; the split data is tokenized again below. Confirm
# whether this full pass is still needed beyond the shape printout.
print("Tokenizing sequences...")
tokenizer_options = dict(
    truncation=True,
    padding=True,
    max_length=512,
    return_tensors="pt",
)
tokenized_sequences = tokenizer(sequences, **tokenizer_options)

print("Tokenization complete!")
input_shape = tokenized_sequences['input_ids'].shape
print(f"Input shape: {input_shape}")

In [None]:
# Split into train and test sets (80/20 split)
# 80/20 split of sequences and their encoded labels. Stratifying on the labels
# keeps the per-family proportions the same in both parts; the fixed seed makes
# the split reproducible.
split_parts = train_test_split(
    sequences,
    encoded_labels,
    test_size=0.2,
    random_state=42,
    stratify=encoded_labels,
)
train_sequences, test_sequences, train_labels, test_labels = split_parts

print(f"Training set size: {len(train_sequences)}")
print(f"Test set size: {len(test_sequences)}")

In [None]:
# Tokenize the split data
# Tokenize the split data, truncating to 512 tokens.
# No padding is applied here on purpose: the Trainer below is given the
# tokenizer, so its default collator (DataCollatorWithPadding) pads each batch
# to that batch's longest sequence. Padding up front would store and process
# every example at the length of the longest sequence in the whole split.
train_tokenized = tokenizer(
    train_sequences,
    truncation=True,
    max_length=512
)

test_tokenized = tokenizer(
    test_sequences,
    truncation=True,
    max_length=512
)

print("Split data tokenized!")

In [None]:
# Create Hugging Face Dataset objects
# Wrap the tokenizer output in Hugging Face Dataset objects and attach the
# integer class ids as a "labels" column (converted from arrays to lists).
train_dataset = Dataset.from_dict(train_tokenized).add_column(
    "labels", train_labels.tolist()
)
test_dataset = Dataset.from_dict(test_tokenized).add_column(
    "labels", test_labels.tolist()
)

print("Final datasets created!")
print(f"Train dataset: {train_dataset}")
print(f"Test dataset: {test_dataset}")
n_classes = len(label_encoder.classes_)
print(f"Number of labels: {n_classes}")

In [None]:
# Load ESM-2 model for sequence classification
# Load ESM-2 model for sequence classification.
# The size of the classification head comes from the fitted label encoder
# rather than a hard-coded 1000, so it always matches the encoded labels.
class_names = [str(name) for name in label_encoder.classes_]
num_labels = len(class_names)  # Your number of protein families
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    # Store the class-id <-> family-accession mapping in the model config so
    # predictions can be turned back into accessions from the saved model.
    id2label=dict(enumerate(class_names)),
    label2id={name: idx for idx, name in enumerate(class_names)},
)

print(f"Model loaded with {num_labels} output classes")
print(f"Model size: {sum(p.numel() for p in model.parameters())/1e6:.1f}M parameters")

In [None]:
# Login to Hugging Face to enable automatic upload
# (the TrainingArguments further down set push_to_hub=True).
from huggingface_hub import notebook_login

notebook_login()

In [None]:
from transformers import TrainingArguments

# Create a descriptive model name
model_name = model_checkpoint.split("/")[-1]
output_dir = f"{model_name}-finetuned-pfam-1k"
# Hub repository that receives the checkpoints.
# Replace "Tarive" with your HF username.
hub_model_id = f"Tarive/{output_dir}"

args = TrainingArguments(
    output_dir=output_dir,
    # Evaluate and checkpoint once per epoch; load_best_model_at_end needs the
    # save strategy to match the eval strategy.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,  # Adjust if you get memory errors
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=True,  # This will auto-upload to HF!
    hub_model_id=hub_model_id,
    hub_strategy="every_save",
    logging_steps=100,
    # eval_steps / save_steps are intentionally not set: they only take effect
    # with the "steps" strategies and were ignored under "epoch".
)

print(f"Training will save to: {output_dir}")
print(f"Model will be uploaded to: {hub_model_id}")

In [None]:
from evaluate import load
import numpy as np

# Accuracy metric object used by compute_metrics below.
metric = load("accuracy")

def compute_metrics(eval_pred):
    """Score arg-max class predictions with the accuracy metric.

    ``eval_pred`` is unpacked into ``(predictions, labels)``. The predicted
    class of each row is the index of its largest value along axis 1, and the
    result of ``metric.compute`` for those predictions is returned.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

print("Evaluation metrics defined!")

In [None]:
from transformers import Trainer

# Assemble the Trainer from the model, arguments, datasets and metric function.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `processing_class` replaces the deprecated `tokenizer` argument
    # (transformers >= 4.46). Passing the tokenizer here is also what makes
    # the Trainer pad batches with it and save it alongside the model.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

print("Trainer created! Ready to start training...")

In [None]:
# This is the big moment - start training!
# Announce the run; the duration printed below is the author's estimate.
print("🚀 Starting training...")
print("This will take approximately 15-30 minutes on T4 GPU")
print("You'll see progress bars and accuracy metrics")

# Run fine-tuning with the Trainer configured in the previous cells.
trainer.train()

print("✅ Training complete!")

In [None]:
import pickle

# Persist the fitted label encoder to 'label_encoder.pkl' in the working
# directory so it can be loaded again after this session.
# NOTE: pickle files should only be loaded from trusted sources.
with open('label_encoder.pkl', 'wb') as handle:
    pickle.dump(label_encoder, handle)

print("✅ LabelEncoder saved to label_encoder.pkl")