# UniProt Mapping and Gene Expression Analysis

This notebook analyzes the biological context of predicted peptides by mapping their source protein accessions to UniProt gene names and integrating gene expression data.

## Setup and Imports

In [None]:
import xml.etree.ElementTree as ET
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns

from src.analysis import TargetAnalysisPlotter, PlotConfig
from src.predictor.utils import FileManager

# Configure visualization
# Build a PlotConfig with figsize=(12, 8) and pass it to the project's
# TargetAnalysisPlotter.
# NOTE(review): target_plotter is not referenced by any later cell visible in
# this notebook -- confirm whether it is still needed.
plot_config = PlotConfig(figsize=(12, 8))
target_plotter = TargetAnalysisPlotter(plot_config)

## Load Prediction Results

In [None]:
# Load prediction results
def load_prediction_results(results_dir: str) -> pd.DataFrame:
    """Load every ``*_predictions.csv`` file in a directory into one table.

    Each file is read with ``pd.read_csv`` and tagged with a ``sample_id``
    column holding the file name without its ``_predictions.csv`` ending.

    Args:
        results_dir: Directory searched (non-recursively) for prediction files.

    Returns:
        All files concatenated row-wise, in file-name order, with a fresh
        0..n-1 index.

    Raises:
        ValueError: If ``results_dir`` contains no ``*_predictions.csv`` file.
    """
    suffix = "_predictions"
    # Sort so the row order does not depend on filesystem listing order.
    prediction_files = sorted(Path(results_dir).glob(f"*{suffix}.csv"))
    if not prediction_files:
        # pd.concat([]) would raise a ValueError that does not say what is
        # missing; keep the exception type but make the message actionable.
        raise ValueError(f"No '*{suffix}.csv' files found in {results_dir!r}")

    frames = []
    for path in prediction_files:
        frame = pd.read_csv(path)
        # Strip only the trailing marker; str.replace would also remove a
        # "_predictions" that happens to occur inside the sample name.
        frame['sample_id'] = path.stem[:-len(suffix)]
        frames.append(frame)

    # ignore_index gives unique row labels instead of repeating 0..k per file.
    return pd.concat(frames, ignore_index=True)

# NOTE(review): "/path/to/results" looks like a placeholder -- point it at the
# directory that holds the *_predictions.csv files before running.
predictions = load_prediction_results("/path/to/results")

## UniProt Mapping

In [None]:
def get_uniprot_info(accession: str, timeout: float = 30.0) -> "dict | None":
    """Fetch protein information from UniProt.

    Requests ``<accession>.xml`` from the UniProtKB REST endpoint and extracts
    the first primary gene name found in the returned XML.

    Args:
        accession: UniProt accession to look up.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        ``{'accession': ..., 'gene_name': ...}`` where ``gene_name`` is ``None``
        if the record has no primary gene name, or ``None`` when the record
        could not be retrieved (network error, non-200 status, or a response
        that is not well-formed XML).
    """
    import warnings  # local import: the notebook's import cell does not provide it

    base_url = "https://rest.uniprot.org/uniprotkb/"
    try:
        # Without a timeout a stalled connection would block the whole
        # mapping loop indefinitely.
        response = requests.get(f"{base_url}{accession}.xml", timeout=timeout)
    except requests.RequestException as exc:
        # Treat a failed request like a failed lookup so one bad accession
        # does not abort the batch, but do not hide it.
        warnings.warn(f"UniProt request failed for {accession}: {exc}")
        return None
    if response.status_code != 200:
        return None

    try:
        root = ET.fromstring(response.content)
    except ET.ParseError as exc:
        warnings.warn(f"UniProt returned unparsable XML for {accession}: {exc}")
        return None

    # Extract information.  The "{*}" namespace wildcard (ElementTree,
    # Python 3.8+) keeps the lookup independent of the exact namespace URI
    # the service puts on the document.
    gene_element = root.find(".//{*}gene/{*}name[@type='primary']")
    gene_name = gene_element.text if gene_element is not None else None

    return {
        'accession': accession,
        'gene_name': gene_name
    }

# Distinct accessions, so each protein is looked up only once
unique_proteins = predictions['Protein.Accession'].unique()

# Query UniProt and keep only the accessions for which a record came back
lookups = ((acc, get_uniprot_info(acc)) for acc in unique_proteins)
protein_info = {acc: record for acc, record in lookups if record}

# Annotate every prediction row with its gene name (None when unmapped)
predictions['gene_name'] = predictions['Protein.Accession'].map(
    lambda acc: (protein_info.get(acc) or {}).get('gene_name')
)

## Load Gene Expression Data

In [None]:
# Expression table (TPM values)
expression_data = pd.read_csv("/path/to/expression_data.csv")

# Per-sample annotation table
metadata = pd.read_csv("/path/to/metadata.csv")

# Attach the sample annotations to every expression row; merge defaults to
# an inner join, so only sample_ids present in both tables survive
expression_with_meta = expression_data.merge(metadata, on='sample_id')

## Tissue-specific Analysis

In [None]:
# Group expression rows by tissue type
tissue_groups = expression_with_meta.groupby('tissue_type')

# Analyze expression patterns
tissue_expression = {}
for tissue, group in tissue_groups:
    expressed_rows = group['TPM'] > 1
    tissue_expression[tissue] = {
        # Median TPM of each gene across the rows of this tissue
        'median_expression': group.groupby('gene_name')['TPM'].median(),
        # Count distinct sample IDs: len(group) counts expression rows, which
        # exceeds the number of samples whenever a sample has several gene rows.
        'sample_count': group['sample_id'].nunique(),
        # Count distinct genes with TPM > 1 in at least one row; summing the
        # boolean mask would count rows, i.e. the same gene once per sample.
        'expressed_genes': group.loc[expressed_rows, 'gene_name'].nunique()
    }

# Plot tissue-specific expression: one box per tissue, built from the
# per-gene median TPM values (genes missing in a tissue become NaN)
plt.figure(figsize=(12, 6))
tissue_medians = pd.DataFrame({
    tissue: data['median_expression']
    for tissue, data in tissue_expression.items()
})
sns.boxplot(data=tissue_medians)
plt.xticks(rotation=45)
plt.title('Gene Expression Distribution by Tissue')

## Strong Binder Analysis

In [None]:
# Keep the predictions whose eluted-ligand percentile rank is below 2
is_strong = predictions['%Rank_EL'] < 2
strong_binders = predictions[is_strong]

# Per-gene summary: number of binder rows, number of distinct peptides and
# the median rank, ordered from the most to the fewest binder rows
by_gene = strong_binders.groupby('gene_name')
binder_stats = pd.DataFrame({
    'total_binders': by_gene['Peptide'].count(),
    'unique_peptides': by_gene['Peptide'].nunique(),
    'median_rank': by_gene['%Rank_EL'].median()
})
binder_stats = binder_stats.sort_values('total_binders', ascending=False)

display(binder_stats.head(20))

## Cancer/Normal Comparison

In [None]:
# Sample IDs of each group, selected by the metadata 'sample_type' label
cancer_samples = metadata.loc[metadata['sample_type'].eq('Tumor'), 'sample_id']
normal_samples = metadata.loc[metadata['sample_type'].eq('Normal'), 'sample_id']

# Prediction rows that belong to the samples of each group
cancer_predictions = predictions.loc[predictions['sample_id'].isin(cancer_samples)]
normal_predictions = predictions.loc[predictions['sample_id'].isin(normal_samples)]

# Analyze differential presentation
def analyze_differential_presentation(cancer_df, normal_df):
    """Compare per-gene unique peptide counts between two prediction tables.

    Both inputs need ``gene_name`` and ``Peptide`` columns.  The result is
    indexed by gene and holds ``cancer_peptides`` and ``normal_peptides``
    (distinct peptides per gene, 0 when the gene is absent from a table),
    ``fold_change`` (log2 of the ratio after adding 1 to both counts) and
    ``total_peptides`` (sum of both counts), sorted by ``fold_change`` in
    descending order.
    """
    sources = (('cancer_peptides', cancer_df), ('normal_peptides', normal_df))
    per_gene = {
        label: frame.groupby('gene_name')['Peptide'].nunique()
        for label, frame in sources
    }
    # Genes present in only one table come out as NaN; treat them as 0.
    table = pd.DataFrame(per_gene).fillna(0)

    tumor = table['cancer_peptides']
    healthy = table['normal_peptides']
    # The +1 on both sides keeps the ratio finite when a count is 0.
    table['fold_change'] = np.log2((tumor + 1) / (healthy + 1))
    table['total_peptides'] = tumor + healthy

    return table.sort_values(by='fold_change', ascending=False)

differential_presentation = analyze_differential_presentation(
    cancer_predictions,
    normal_predictions
)

# Plot differential presentation
# Genes found in only one group have a count of 0, and a log axis cannot
# place 0.  Shift both axes by a pseudocount of 1 (the same offset the
# fold_change column uses) so those genes stay visible.
pseudocount = 1
plt.figure(figsize=(12, 8))
plt.scatter(
    differential_presentation['normal_peptides'] + pseudocount,
    differential_presentation['cancer_peptides'] + pseudocount,
    alpha=0.5
)
plt.yscale('log'); plt.xscale('log')
# Dashed diagonal marks equal counts in both groups
plt.plot([1, 1000], [1, 1000], 'r--', alpha=0.5)
plt.xlabel('Normal Peptides + 1')
plt.ylabel('Cancer Peptides + 1')
plt.title('Cancer vs Normal Peptide Presentation')

## Gene Set Analysis

In [None]:
# Gene sets of interest (cancer testis antigens, tumor suppressors, oncogenes).
# Entries are matched against the `gene_name` column, which is filled from
# UniProt primary gene names, so they must be gene symbols.  NY-ESO-1 is a
# protein alias; its genes are CTAG1A / CTAG1B.
gene_sets = {
    'CT_antigens': ['MAGEA1', 'MAGEA3', 'CTAG1A', 'CTAG1B', 'PRAME'],
    'Tumor_suppressors': ['TP53', 'PTEN', 'RB1', 'BRCA1'],
    'Oncogenes': ['MYC', 'KRAS', 'BRAF', 'PIK3CA']
}

# Analyze representation in predictions
def analyze_gene_sets(predictions_df, gene_sets):
    """Summarize how each gene set is represented among the predictions.

    ``gene_sets`` maps a set name to a list of gene names.  For every set the
    rows of ``predictions_df`` whose ``gene_name`` is in the list are
    summarized.  The returned frame has one row per set and the columns
    ``total_peptides``, ``unique_peptides``, ``genes_found``, ``median_rank``
    and ``strong_binders`` (rows with ``%Rank_EL`` below 2).
    """
    def _summarize(members):
        subset = predictions_df.loc[predictions_df['gene_name'].isin(members)]
        ranks = subset['%Rank_EL']
        return {
            'total_peptides': len(subset),
            'unique_peptides': subset['Peptide'].nunique(),
            'genes_found': subset['gene_name'].nunique(),
            'median_rank': ranks.median(),
            'strong_binders': (ranks < 2).sum()
        }

    summary = {label: _summarize(members) for label, members in gene_sets.items()}
    # Sets become columns in the constructor; transpose to one row per set.
    return pd.DataFrame(summary).T

# Summarize the gene sets over all predictions and show the resulting table.
# NOTE(review): display is not imported in this file; presumably the
# IPython/Jupyter built-in -- confirm if this is ever run as a plain script.
gene_set_analysis = analyze_gene_sets(predictions, gene_sets)
display(gene_set_analysis)

## Expression Correlation Analysis

In [None]:
# Correlate peptide presentation with expression
def analyze_expression_correlation(predictions_df, expression_df):
    """Pair each gene's number of prediction rows with its median expression.

    Args:
        predictions_df: Prediction table with a ``gene_name`` column.
        expression_df: Expression table with ``gene_name`` and ``TPM`` columns.

    Returns:
        DataFrame with one row per gene named in ``predictions_df`` (in order
        of first appearance) and the columns ``gene``, ``peptide_count``
        (number of prediction rows for the gene) and ``expression`` (median
        TPM over all expression rows of the gene, NaN when the gene has no
        expression rows).  Rows without a gene name are not counted.
    """
    # One grouped pass per table instead of rescanning both frames per gene.
    # groupby drops missing gene names, which removes the meaningless
    # "gene = None, peptide_count = 0" row the per-gene scan used to emit.
    counts = predictions_df.groupby('gene_name', sort=False).size()
    median_tpm = expression_df.groupby('gene_name')['TPM'].median()

    return pd.DataFrame({
        'gene': counts.index.to_numpy(),
        'peptide_count': counts.to_numpy(),
        # reindex aligns expression to the predicted genes; absent genes -> NaN
        'expression': median_tpm.reindex(counts.index).to_numpy(),
    })

correlation_data = analyze_expression_correlation(
    predictions,
    expression_with_meta
)

# Genes without any expression rows carry NaN, and a single NaN makes
# np.corrcoef return NaN.  Work on the complete pairs only.
paired = correlation_data.dropna(subset=['expression', 'peptide_count'])
log_expression = np.log2(paired['expression'] + 1)
log_peptide_count = np.log2(paired['peptide_count'] + 1)

# Plot correlation
plt.figure(figsize=(10, 6))
plt.scatter(
    log_expression,
    log_peptide_count,
    alpha=0.5
)
plt.xlabel('log2(TPM + 1)')
plt.ylabel('log2(Peptide Count + 1)')
plt.title('Expression vs Peptide Presentation')

# Calculate correlation coefficient (Pearson, on the log-transformed values).
# It is undefined for fewer than two points, so report NaN in that case.
if len(paired) > 1:
    corr = np.corrcoef(log_expression, log_peptide_count)[0, 1]
else:
    corr = float('nan')
plt.text(0.05, 0.95, f'Correlation: {corr:.2f}',
         transform=plt.gca().transAxes)

## Save Analysis Results

In [None]:
# Output folder for the exported tables; reused if it already exists
results_dir = Path("analysis_results")
results_dir.mkdir(exist_ok=True)

# Write each result table to its own CSV file inside the output folder
for filename, table in (
    ("differential_presentation.csv", differential_presentation),
    ("gene_set_analysis.csv", gene_set_analysis),
    ("expression_correlation.csv", correlation_data),
):
    table.to_csv(results_dir / filename)

print("Analysis results saved to:", results_dir)