# NetMHCII Prediction Results Analysis

This notebook provides a comprehensive analysis of NetMHCII prediction results, focusing on binding distributions and peptide characteristics.

## Setup and Imports

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

from src.analysis import BindingDistributionPlotter, PlotConfig
from src.predictor.utils import FileManager

## Load Results


In [None]:
# Initialize FileManager
# "/path/to/results" is a placeholder; point it at the real results directory.
# NOTE(review): `file_manager` is not referenced again in any visible cell —
# the CSVs are read directly from disk further down; confirm it is still needed.
file_manager = FileManager("/path/to/results")

# Load all prediction results
def load_all_results(results_dir: str) -> dict:
    """Read every ``*_predictions.csv`` file in a directory into DataFrames.

    Args:
        results_dir: Directory that directly contains the prediction CSVs
            (sub-directories are not searched).

    Returns:
        Mapping of sample ID to the DataFrame parsed from that sample's file.
        The sample ID is the file stem without its trailing ``_predictions``.
        Entries are inserted in sorted path order so that anything iterating
        the mapping (plots, "first N samples" previews) is reproducible.
        The mapping is empty when no file matches.
    """
    suffix = "_predictions"
    results = {}
    # glob() yields paths in arbitrary, filesystem-dependent order; sort them
    # so the resulting dict order is deterministic.
    for file in sorted(Path(results_dir).glob(f"*{suffix}.csv")):
        # Strip only the trailing marker. str.replace() would also delete the
        # marker from the middle of a sample name and corrupt the ID.
        sample_id = file.stem[: -len(suffix)]
        results[sample_id] = pd.read_csv(file)
    return results

# Read every sample's prediction table (keyed by sample ID) and report how many
# samples were found. "/path/to/results" is a placeholder path.
results_dict = load_all_results("/path/to/results")
print(f"Loaded results for {len(results_dict)} samples")

## No-transformation Analysis

In [None]:
# Filter for raw peptides without transformations
def get_raw_predictions(df: pd.DataFrame) -> pd.DataFrame:
    """Select the rows that carry no sequence transformation.

    A row is kept when its ``inverted_manual`` value equals 0 and its
    ``flipped`` value equals ``'raw'``.

    Args:
        df: Prediction table with ``inverted_manual`` and ``flipped`` columns.

    Returns:
        The matching rows of ``df``, keeping their original index labels.
    """
    not_inverted = df['inverted_manual'] == 0
    not_flipped = df['flipped'] == 'raw'
    return df[not_inverted & not_flipped]

# Untransformed subset of every sample, keyed by sample ID
raw_results = {
    sample_id: get_raw_predictions(df) for sample_id, df in results_dict.items()
}

# Initialize plotter
plot_config = PlotConfig(figsize=(12, 6))
plotter = BindingDistributionPlotter(plot_config)

# Plot overall distribution: pool the untransformed rows of all samples first
pooled_raw = pd.concat(raw_results.values())
fig = plotter.plot_rank_distribution(
    pooled_raw,
    title="Binding Rank Distribution (No Transformations)",
)

## Transformation Analysis

In [None]:
# Analyze different transformation combinations
def _inverted_is(df, flag):
    """Boolean mask: rows whose ``inverted_manual`` equals ``flag``."""
    return df['inverted_manual'] == flag


def _is_raw(df):
    """Boolean mask: rows whose ``flipped`` value is ``'raw'``."""
    return df['flipped'] == 'raw'


def _is_flipped(df):
    """Boolean mask: rows whose ``flipped`` value is anything but ``'raw'``."""
    return df['flipped'] != 'raw'


# Each entry maps a display label to a row selector.
# NOTE(review): 'All Transform' combines its masks with OR, so it selects rows
# with *either* transformation and overlaps the 'Inverted' and 'Flipped'
# groups — confirm a union (rather than rows with both) is intended.
transformations = {
    'No Transform': lambda df: df[_inverted_is(df, 0) & _is_raw(df)],
    'Inverted': lambda df: df[_inverted_is(df, 1) & _is_raw(df)],
    'Flipped': lambda df: df[_inverted_is(df, 0) & _is_flipped(df)],
    'All Transform': lambda df: df[_inverted_is(df, 1) | _is_flipped(df)],
}


def _pool_across_samples(transform):
    """Apply ``transform`` to every sample and stack the selected rows."""
    return pd.concat([transform(df) for df in results_dict.values()])


# Create comparison plot
transformed_dfs = {
    name: _pool_across_samples(transform)
    for name, transform in transformations.items()
}

fig = plotter.plot_comparison(
    transformed_dfs,
    title="Effect of Sequence Transformations on Binding",
)

## Length Analysis

In [None]:
# Analyze peptide length distribution
def analyze_length_distribution(results_dict: dict) -> pd.DataFrame:
    """Stack all samples into one table annotated with length and sample ID.

    Args:
        results_dict: Mapping of sample ID to a prediction table that has a
            ``Peptide`` column of strings.

    Returns:
        The row-wise concatenation of copies of every table, each extended
        with a ``length`` column (character count of ``Peptide``) and a
        ``sample_id`` column. The input tables are not modified, and each
        table's own index labels are kept.
    """
    # assign() works on a copy, so the caller's frames stay untouched.
    annotated = [
        table.assign(length=table['Peptide'].str.len(), sample_id=sample_id)
        for sample_id, table in results_dict.items()
    ]
    return pd.concat(annotated)

# Combined table of all samples with per-row `length` and `sample_id` columns
length_data = analyze_length_distribution(results_dict)

# Plot length vs binding
# NOTE(review): `plt` and `sns` are not imported in any visible cell —
# presumably `matplotlib.pyplot` and `seaborn`; add those imports to the setup
# cell, otherwise this cell raises NameError.
plt.figure(figsize=(12, 6))
# One box per distinct `length` value, showing the spread of `%Rank_EL`.
sns.boxplot(
    data=length_data,
    x='length',
    y='%Rank_EL',
    showfliers=False
)
plt.yscale('log')
plt.title('Peptide Length vs Binding Rank')

## Sample Comparison

In [None]:
# Compare binding distributions across samples
fig = plotter.plot_comparison(
    results_dict,
    title="Sample-wise Binding Distribution",
)


def _summarize_binding(df):
    """Count peptides per binding class (by ``%Rank_EL``) and take the median rank."""
    ranks = df['%Rank_EL']
    return {
        'Total Peptides': len(df),
        'Strong Binders (<2%)': (ranks < 2).sum(),
        'Weak Binders (2-10%)': ((ranks >= 2) & (ranks < 10)).sum(),
        'Non-binders': (ranks >= 10).sum(),
        'Median Rank': ranks.median(),
    }


# Calculate and display statistics
per_sample_summary = {
    sample_id: _summarize_binding(df) for sample_id, df in results_dict.items()
}
# Built with samples as columns, then transposed to give one row per sample.
stats = pd.DataFrame(per_sample_summary).T

display(stats)

## Binding Motif Analysis

In [None]:
# Analyze core sequences of strong binders
def analyze_binding_motifs(df: pd.DataFrame, rank_threshold: float = 2.0) -> pd.DataFrame:
    """Extract core, score and rank for peptides at or below a rank cutoff.

    Args:
        df: Prediction table with ``Core``, ``Score_EL`` and ``%Rank_EL``
            columns.
        rank_threshold: Inclusive upper bound on ``%Rank_EL``.

    Returns:
        A table with columns ``Core``, ``Score_EL`` and ``Rank_EL`` (the
        ``%Rank_EL`` values under a new name), restricted to the rows whose
        ``%Rank_EL`` is ``<= rank_threshold``.

    NOTE(review): the cutoff here is inclusive (``<=``), while the summary
    statistics and save cells of this notebook select strong binders with a
    strict ``< 2`` — confirm which boundary is intended.
    """
    within_cutoff = df['%Rank_EL'] <= rank_threshold
    selected = df.loc[within_cutoff, ['Core', 'Score_EL', '%Rank_EL']]
    return selected.rename(columns={'%Rank_EL': 'Rank_EL'})

# Get motifs for each sample
motifs = {
    sample_id: analyze_binding_motifs(df) for sample_id, df in results_dict.items()
}

# Plot motif characteristics
# Only the first three samples (in dict order) are previewed.
preview = list(motifs.items())[:3]
for sample_id, motif_df in preview:
    print(f"\nSample {sample_id} motif characteristics:")
    # value_counts() ranks cores by frequency; head() keeps the top five.
    core_counts = motif_df['Core'].value_counts()
    print(core_counts.head())

## Save Analysis Results

In [None]:
# Save summary statistics
# Written to the current working directory, with the sample IDs as the index column.
stats.to_csv("binding_statistics.csv")

# Save strong binders for each sample
# One CSV per sample, holding the rows with %Rank_EL strictly below 2.
for sample_id, df in results_dict.items():
    is_strong = df['%Rank_EL'] < 2
    strong_binders = df[is_strong]
    strong_binders.to_csv(f"{sample_id}_strong_binders.csv", index=False)

print("Analysis results saved.")