## IMPORTS & DATA

In [None]:
import random
import warnings
from pathlib import Path
from typing import List, Tuple, Dict, Union

import pandas as pd
import torch
warnings.filterwarnings('ignore')

import sys, pathlib, os
project_root = pathlib.Path.home() / "projets" / "protein-generation"
sys.path.append(str(project_root))

from scripts.evaluation.evaluate import *

# Select the compute device: the first CUDA GPU when torch can see one, else the CPU.
_device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print(f"Using device: {device}")

# Model identifiers handed to the project loaders below.
PPL_MODEL_NAME = "facebook/esm2_t6_8M_UR50D"  # used for perplexity calculation
FOLD_MODEL_NAME = "facebook/esmfold_v1"  # used for structure prediction

# Module-level handles shared by the later cells; cleared before (re)loading.
ppl_model = ppl_tokenizer = None
fold_model = fold_tokenizer = None

# Each loader returns a (model, tokenizer) pair placed on `device`.
ppl_model, ppl_tokenizer = load_perplexity_model(
    ppl_model_name=PPL_MODEL_NAME,
    device=device,
)
fold_model, fold_tokenizer = load_folding_model(
    fold_model_name=FOLD_MODEL_NAME,
    device=device,
)

print("Models loaded successfully!")


In [None]:
# Directory that the evaluation cells use as the prefix of their `output_file`
# arguments. It is derived from `project_root` (the same root that was added to
# sys.path) rather than from a hard-coded home directory, and kept as a plain
# string because later cells interpolate it into f-strings.
out_dir = str(project_root / "experiments" / "metrics" / "test2_circular")

# Create the directory up front so a fresh checkout does not depend on the
# evaluation helpers creating it (NOTE(review): they may already do so).
os.makedirs(out_dir, exist_ok=True)
print(f"Output directory: {out_dir}")

In [None]:
# Each sequence is truncated to SEQUENCE_LENGTH characters and only the first
# N_SAMPLES rows of each CSV are used.
SEQUENCE_LENGTH = 100
N_SAMPLES = 1000

# Input locations are resolved against `project_root` (the root already used
# for sys.path) instead of repeating a hard-coded home directory.
_data_dir = project_root / "data"
_run_dir = (
    project_root
    / "experiments"
    / "models"
    / "exp_masking_forward_masking_denoise_20250715_112806"
)

df = pd.read_csv(_data_dir / "seq_clean_L100.csv")
df_gen = pd.read_csv(_run_dir / "generated_sequences.csv")
df_rand = pd.read_csv(_data_dir / "seq_random_L100.csv")


def _first_sequences(frame):
    """Return the first N_SAMPLES values of frame["sequence"], each cut to SEQUENCE_LENGTH characters."""
    # Slice rows first so the string truncation only touches the rows we keep.
    return frame["sequence"].iloc[:N_SAMPLES].str[:SEQUENCE_LENGTH].tolist()


training_sequences = _first_sequences(df)
generated_sequences = _first_sequences(df_gen)
random_sequences = _first_sequences(df_rand)


In [None]:
def circular_cut(sequence, min_bound=10, max_bound=90):
    """Rotate ``sequence`` around a randomly chosen cut position.

    A cut position is drawn uniformly from ``[min_bound, max_bound]``
    (both inclusive) with the module-level ``random`` generator, and the
    part before the cut is moved to the end:
    ``sequence[cut:] + sequence[:cut]``.

    The default bounds suit sequences of length 100. For shorter input a
    drawn position at or beyond ``len(sequence)`` would return the input
    unchanged, so the bounds are clamped to the last valid index whenever
    ``max_bound`` exceeds it. Sequences with fewer than two elements have
    no non-trivial rotation and are returned as is.

    Args:
        sequence: Sliceable sequence (e.g. ``str``) to rotate.
        min_bound: Smallest allowed cut position.
        max_bound: Largest allowed cut position.

    Returns:
        The rotated sequence, same type and length as ``sequence``.

    Raises:
        ValueError: If ``min_bound > max_bound`` and no clamping applies
            (raised by ``random.randint``).
    """
    last_index = len(sequence) - 1
    if last_index < 1:
        return sequence

    if max_bound > last_index:
        # Keep the draw inside the sequence so the result is a real rotation.
        max_bound = last_index
        min_bound = min(min_bound, last_index)

    cut_position = random.randint(min_bound, max_bound)
    return sequence[cut_position:] + sequence[:cut_position]

def repeated_circular_cut(sequence, k, min_bound=10, max_bound=90):
    """Apply ``circular_cut`` to ``sequence`` ``k`` times in a row.

    Every round feeds the previous round's output back into
    ``circular_cut``, forwarding ``min_bound`` and ``max_bound`` unchanged.
    When ``k`` is zero or negative no round runs and ``sequence`` is
    returned untouched.
    """
    rotated = sequence
    for _ in range(k):
        rotated = circular_cut(rotated, min_bound, max_bound)
    return rotated

def _cut_all_generated(n_cuts):
    """Return a copy of generated_sequences with n_cuts circular cuts applied to each entry."""
    return [repeated_circular_cut(seq, n_cuts) for seq in generated_sequences]


# One perturbed copy of the generated set per number of successive cuts,
# built in increasing order of cut count.
generated_circular_5 = _cut_all_generated(5)
generated_circular_10 = _cut_all_generated(10)
generated_circular_20 = _cut_all_generated(20)
generated_circular_50 = _cut_all_generated(50)
generated_circular_100 = _cut_all_generated(100)

In [None]:

# Datasets to evaluate, keyed by the label used in log lines and output file
# names. Keys must be unique: in a dict literal a repeated key silently keeps
# only the last value, which previously replaced the model's generated
# sequences with the random ones under 'generated'.
generated_datasets = {
    'training': training_sequences,  # List of training sequences
    'random': random_sequences,  # List of random sequences for comparison
    'generated': generated_sequences,  # List of generated sequences from the model
    'circular_5': generated_circular_5,  # List of sequences with 5 circular cuts
    'circular_10': generated_circular_10,  # List of sequences with 10 circular cuts
    'circular_20': generated_circular_20,  # List of sequences with 20 circular cuts
    'circular_50': generated_circular_50,  # List of sequences with 50 circular cuts
    'circular_100': generated_circular_100,  # List of sequences with 100 circular cuts
}


print(f"Training sequences: {len(training_sequences)}")
for name, seqs in generated_datasets.items():
    print(f"Generated dataset '{name}': {len(seqs)} sequences")

## QUALITY

In [None]:
# Arguments that are identical for every evaluate_quality call in this cell.
quality_kwargs = dict(
    ppl_model=ppl_model,
    ppl_tokenizer=ppl_tokenizer,
    fold_model=fold_model,
    fold_tokenizer=fold_tokenizer,
    device=device,
    batch_size=256,
)

# Calculate quality metrics for training sequences
training_output_file = f"{out_dir}/training_sequences"
print("Evaluating training sequences...")
training_quality_results = evaluate_quality(
    sequences=training_sequences,
    output_file=training_output_file,
    **quality_kwargs,
)


# Calculate quality metrics for all generated datasets
generated_quality_results = {}
for dataset_name, sequences in generated_datasets.items():
    output_file = f"{out_dir}/{dataset_name}_sequences"

    # The training list is also an entry of generated_datasets. Evaluating it
    # again would repeat the same call with the same output path, so reuse the
    # result computed above instead of running the models a second time.
    if sequences is training_sequences and output_file == training_output_file:
        generated_quality_results[dataset_name] = training_quality_results
        continue

    print(f"Evaluating generated dataset '{dataset_name}'...")
    quality_results = evaluate_quality(
        sequences=sequences,
        output_file=output_file,
        **quality_kwargs,
    )
    generated_quality_results[dataset_name] = quality_results

## DISTRIBUTION

In [None]:
# Settings passed unchanged to every evaluate_distributions call below.
distribution_settings = {
    "k_neighbors": 5,
    "soft_align_k": 5,
    "n_projections": 100,
    "kde_bandwidth": 0.1,
    "kde_n_samples": 1000,
    "toppr_alpha": 0.1,
    "random_seed": 42,
}

# Compare each dataset against the training sequences and collect the
# results under the dataset's label.
distribution_results = {}
for dataset_name, sequences in generated_datasets.items():
    print(f"Calculating distribution metrics for '{dataset_name}' vs training...")

    dist_results = evaluate_distributions(
        generated_sequences=sequences,
        training_sequences=training_sequences,
        ppl_model=ppl_model,
        ppl_tokenizer=ppl_tokenizer,
        output_file=f"{out_dir}/{dataset_name}_vs_training",
        device=device,
        **distribution_settings,
    )

    distribution_results[dataset_name] = dist_results
