# Results Analysis: Proteome-Wide Screening

This notebook analyzes the results from proteome-wide screening for cryptic IP binding sites
across yeast, human, and Dictyostelium proteomes.

## Analysis Goals

1. Load and filter screening results
2. Identify high-confidence candidate proteins
3. Perform functional enrichment analysis
4. Compare hit rates across organisms
5. Test the co-evolution hypothesis (intracellular IP6 concentration vs. hit rate)

In [None]:
import itertools
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Global plot styling: these calls change seaborn/matplotlib defaults for
# every figure produced further down in the notebook.
sns.set_style('whitegrid')  # seaborn 'whitegrid' axes style
sns.set_palette('husl')  # seaborn 'husl' palette as the default colour cycle
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

## 1. Load Screening Results

In [None]:
# Read the screening output of each proteome and tag every row with the
# organism it came from, so the frames can be stacked and still told apart.
yeast_results = pd.read_csv('results/yeast_screening_results.csv').assign(
    organism='S. cerevisiae'
)
human_results = pd.read_csv('results/human_screening_results.csv').assign(
    organism='H. sapiens'
)
dicty_results = pd.read_csv('results/dictyostelium_screening_results.csv').assign(
    organism='D. discoideum'
)

# Display label -> frame, in the order the proteomes are stacked and reported
per_proteome = {
    'Yeast': yeast_results,
    'Human': human_results,
    'Dictyostelium': dicty_results,
}

# Single combined table with a fresh 0..N-1 index
all_results = pd.concat(list(per_proteome.values()), ignore_index=True)

print(f'Total proteins screened: {len(all_results):,}')
for label, frame in per_proteome.items():
    print(f'  {label}: {len(frame):,}')

## 2. Score Distribution Across Organisms

In [None]:
# Two-panel overview of composite scores: per-organism violins (left) and
# per-organism cumulative histograms (right), saved as a PNG.

# Create the output directory up front; savefig() does not create missing
# parent directories and would otherwise fail on a fresh checkout.
Path('results/analysis').mkdir(parents=True, exist_ok=True)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Score distribution by organism.
# The 0.7 reference line mirrors SCORE_THRESHOLD used for candidate selection.
sns.violinplot(data=all_results, x='organism', y='composite_score', ax=axes[0])
axes[0].axhline(0.7, color='red', linestyle='--', label='Threshold')
axes[0].set_ylabel('Composite Score', fontsize=12)
axes[0].set_xlabel('')
axes[0].set_title('Score Distribution by Organism', fontsize=14, fontweight='bold')
axes[0].legend()

# Cumulative distribution
for organism in all_results['organism'].unique():
    # Drop missing scores: Axes.hist cannot derive a bin range from NaN values
    org_data = all_results.loc[
        all_results['organism'] == organism, 'composite_score'
    ].dropna()
    if org_data.empty:
        # Nothing to draw for an organism without any valid score
        continue
    axes[1].hist(org_data, bins=50, alpha=0.6, label=organism, cumulative=True, density=True)

axes[1].axvline(0.7, color='red', linestyle='--', linewidth=2, label='Threshold')
axes[1].set_xlabel('Composite Score', fontsize=12)
axes[1].set_ylabel('Cumulative Frequency', fontsize=12)
axes[1].set_title('Cumulative Score Distribution', fontsize=14, fontweight='bold')
axes[1].legend()

plt.tight_layout()
plt.savefig('results/analysis/score_distributions.png', dpi=300, bbox_inches='tight')
plt.show()

## 3. Identify High-Confidence Candidates

Keep only proteins whose composite score is at least 0.7 and whose average pLDDT is at least 70.

In [None]:
# Select high-confidence candidates and summarise how many each organism
# contributes. The final expression displays the ten best-scoring rows.

# Define thresholds
SCORE_THRESHOLD = 0.7
PLDDT_THRESHOLD = 70.0

# Filter candidates: a row must pass BOTH cut-offs. Rows with a missing score
# or pLDDT compare as False and are therefore excluded.
passes_score = all_results['composite_score'] >= SCORE_THRESHOLD
passes_plddt = all_results['avg_plddt'] >= PLDDT_THRESHOLD
candidates = (
    all_results[passes_score & passes_plddt]
    .copy()
    .sort_values('composite_score', ascending=False)
)

print(f'High-confidence candidates: {len(candidates)}')
print('\nBy organism:')
# Iterate over every screened organism (not just those with hits) so that an
# organism with zero candidates is reported as 0 instead of being omitted.
for org in all_results['organism'].unique():
    count = int((candidates['organism'] == org).sum())
    total = int((all_results['organism'] == org).sum())
    pct = 100 * count / total
    print(f'  {org}: {count} ({pct:.2f}%)')

print('\nTop 10 candidates:')
candidates[['uniprot_id', 'protein_name', 'organism', 'composite_score']].head(10)

## 4. Hit Rate Comparison

Test the evolutionary hypothesis: does Dictyostelium (high IP6) have more buried sites?

In [None]:
# Per-organism hit rates and their rank correlation with intracellular IP6.
# Produces `hit_rate_df`, `correlation` (Spearman rho) and `p_value`.

# Calculate hit rates
ip6_concentrations = {'S. cerevisiae': 20, 'H. sapiens': 35, 'D. discoideum': 520}  # µM

hit_rates = []
for organism in all_results['organism'].unique():
    total = len(all_results[all_results['organism'] == organism])
    hits = len(candidates[candidates['organism'] == organism])
    rate = 100 * hits / total

    hit_rates.append({
        'Organism': organism,
        'Hit Rate (%)': rate,
        'Hits': hits,
        'Total': total,
        'IP6 Concentration (µM)': ip6_concentrations[organism]
    })

hit_rate_df = pd.DataFrame(hit_rates)
print(hit_rate_df)

# Test correlation
ip6_values = hit_rate_df['IP6 Concentration (µM)'].to_numpy(dtype=float)
rate_values = hit_rate_df['Hit Rate (%)'].to_numpy(dtype=float)
n_organisms = len(hit_rate_df)

correlation, p_value = stats.spearmanr(ip6_values, rate_values)

# spearmanr's p-value relies on a large-sample t approximation. With a handful
# of organisms it is not valid (a perfectly monotone triple is reported as
# p = 0), so for small n replace it with the exact two-sided permutation
# p-value: the share of all orderings whose |rho| is at least the observed one.
MAX_EXACT_N = 7  # 7! = 5040 permutations, still instantaneous
if not np.isnan(correlation) and n_organisms <= MAX_EXACT_N:
    ip6_ranks = stats.rankdata(ip6_values)
    rate_ranks = stats.rankdata(rate_values)
    # Pearson correlation of the ranks is Spearman's rho
    permuted_rhos = np.array([
        np.corrcoef(ip6_ranks, permuted)[0, 1]
        for permuted in itertools.permutations(rate_ranks)
    ])
    # Small tolerance guards against floating-point noise in the comparison
    p_value = float(np.mean(np.abs(permuted_rhos) >= abs(correlation) - 1e-12))

print(f'\nCorrelation (IP6 concentration vs hit rate): {correlation:.3f}')
print(f'P-value: {p_value:.3f}')
if n_organisms <= MAX_EXACT_N:
    print(f'Note: exact permutation p-value with n = {n_organisms} organisms; '
          'statistical power is very limited at this sample size')

## 5. Evolutionary Analysis Visualization

In [None]:
# Two-panel figure for the co-evolution test: hit rate per organism (bars) and
# hit rate against intracellular IP6 concentration (scatter), saved as a PNG.

# Create the output directory up front; savefig() does not create missing
# parent directories and would otherwise fail on a fresh checkout.
Path('results/analysis').mkdir(parents=True, exist_ok=True)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# One colour per organism row. The base palette is cycled so that bar() and
# scatter() always receive exactly as many colours as there are rows.
base_colors = ['#3498db', '#e74c3c', '#2ecc71']
colors = [base_colors[i % len(base_colors)] for i in range(len(hit_rate_df))]

# Bar plot of hit rates
bars = axes[0].bar(hit_rate_df['Organism'], hit_rate_df['Hit Rate (%)'],
                   color=colors, edgecolor='black', linewidth=2)
axes[0].set_ylabel('Hit Rate (%)', fontsize=12)
axes[0].set_title('Cryptic IP Binding Site Prevalence by Organism',
                  fontsize=14, fontweight='bold')
# Leave 30% headroom for the value labels; fall back to a unit-height axis when
# every hit rate is zero (a 0..0 range would be degenerate).
max_rate = hit_rate_df['Hit Rate (%)'].max()
axes[0].set_ylim(0, max_rate * 1.3 if max_rate > 0 else 1.0)

# Add value labels
for bar, rate in zip(bars, hit_rate_df['Hit Rate (%)']):
    height = bar.get_height()
    axes[0].text(bar.get_x() + bar.get_width() / 2., height,
                 f'{rate:.2f}%',
                 ha='center', va='bottom', fontsize=12, fontweight='bold')

# Scatter plot: IP6 concentration vs hit rate
axes[1].scatter(hit_rate_df['IP6 Concentration (µM)'],
                hit_rate_df['Hit Rate (%)'],
                s=300, c=colors, edgecolors='black', linewidth=2)

# Add organism labels
for _, row in hit_rate_df.iterrows():
    axes[1].annotate(row['Organism'].split()[0],  # Genus only
                     (row['IP6 Concentration (µM)'], row['Hit Rate (%)']),
                     xytext=(10, 10), textcoords='offset points',
                     fontsize=11, fontweight='bold')

# Right-hand x limit: 550 µM as before, widened only if a concentration
# would otherwise fall outside the axis.
x_max = max(550, float(hit_rate_df['IP6 Concentration (µM)'].max()) * 1.05)

# Fit line if significant correlation
if abs(correlation) > 0.5 and p_value < 0.1:
    coefficients = np.polyfit(hit_rate_df['IP6 Concentration (µM)'],
                              hit_rate_df['Hit Rate (%)'], 1)
    trend = np.poly1d(coefficients)
    x_line = np.linspace(0, x_max, 100)
    axes[1].plot(x_line, trend(x_line), 'r--', linewidth=2, alpha=0.7,
                 label=f'Correlation: {correlation:.2f}')
    axes[1].legend()

axes[1].set_xlabel('Intracellular IP6 Concentration (µM)', fontsize=12)
axes[1].set_ylabel('Hit Rate (%)', fontsize=12)
axes[1].set_title('Co-Evolution Test: IP6 Availability vs Buried Site Frequency',
                  fontsize=14, fontweight='bold')
axes[1].set_xlim(0, x_max)

plt.tight_layout()
plt.savefig('results/analysis/evolutionary_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

## 6. Functional Enrichment Analysis

Identify enriched GO terms and protein families among candidates.

In [None]:
# Descriptive breakdown of the candidate set by protein family and by
# subcellular localization. Each plot is produced only when its annotation
# column is present AND contains at least one non-null value.

# Placeholder for GO enrichment (requires goatools or similar)
# This would typically interface with UniProt or GO databases

# Create the output directory up front; savefig() does not create missing
# parent directories and would otherwise fail on a fresh checkout.
Path('results/analysis').mkdir(parents=True, exist_ok=True)

# Analyze protein families
if 'protein_family' in candidates.columns:
    # value_counts() ignores missing annotations; keep the ten largest families
    family_counts = candidates['protein_family'].value_counts().head(10)

    if family_counts.empty:
        print('No protein family annotations among candidates; skipping plot')
    else:
        fig, ax = plt.subplots(figsize=(12, 6))
        family_counts.plot(kind='barh', color='steelblue', edgecolor='black', ax=ax)
        ax.set_xlabel('Number of Candidates', fontsize=12)
        ax.set_ylabel('Protein Family', fontsize=12)
        ax.set_title('Top Protein Families in Candidate Set', fontsize=14, fontweight='bold')
        fig.tight_layout()
        fig.savefig('results/analysis/protein_families.png', dpi=300, bbox_inches='tight')
        plt.show()

# Subcellular localization
if 'localization' in candidates.columns:
    loc_counts = candidates['localization'].value_counts()

    if loc_counts.empty:
        print('No localization annotations among candidates; skipping plot')
    else:
        fig, ax = plt.subplots(figsize=(10, 10))
        ax.pie(loc_counts.values, labels=loc_counts.index, autopct='%1.1f%%',
               startangle=90, colors=sns.color_palette('Set2'))
        ax.set_title('Subcellular Localization of Candidates', fontsize=14, fontweight='bold')
        fig.tight_layout()
        fig.savefig('results/analysis/localization.png', dpi=300, bbox_inches='tight')
        plt.show()

## 7. Export Candidate List for Experimental Validation

In [None]:
# Write the best-scoring candidates (at most 50; `candidates` is already
# sorted by descending composite score) to CSV for the wet-lab follow-up.

# Create the output directory up front; to_csv() does not create missing
# parent directories and would otherwise fail on a fresh checkout.
Path('results/analysis').mkdir(parents=True, exist_ok=True)

# Export top candidates
top_candidates = candidates.head(50)

output_columns = ['uniprot_id', 'protein_name', 'organism',
                  'composite_score', 'pocket_volume', 'pocket_depth',
                  'sasa', 'electrostatic_potential', 'basic_residues',
                  'avg_plddt', 'protein_length']

# Fail with an explicit message if the screening output lacks an expected
# column, rather than with pandas' generic indexing error.
missing_columns = [col for col in output_columns if col not in top_candidates.columns]
if missing_columns:
    raise KeyError(f'Screening results lack expected export columns: {missing_columns}')

top_candidates[output_columns].to_csv(
    'results/analysis/top_50_candidates_for_validation.csv',
    index=False
)

# Report the number actually written; fewer than 50 may have passed the filters
print(f'Top {len(top_candidates)} candidates exported for experimental validation')
print('\nRecommended experiments:')
print('1. Differential Scanning Fluorimetry (DSF) with/without IP6')
print('2. Site-directed mutagenesis of predicted coordinating residues')
print('3. Mass spectrometry to confirm IP occupancy')
print('4. Growth assays in IP biosynthesis mutants (yeast)')

## Conclusions

This analysis reveals (**TODO**: replace the placeholders below with the values from a completed run):

1. **Hit prevalence**: X% of yeast, Y% of human, Z% of Dictyostelium proteins have candidate buried IP sites
2. **Evolutionary pattern**: [Correlation present/absent] between intracellular IP6 and hit rate
3. **Functional enrichment**: Candidates enriched for [nuclear/chromatin/RNA-binding] proteins
4. **Validation targets**: 50 high-confidence candidates prioritized for experimental testing

**Next Steps**:
- Manual structural inspection of top candidates
- Conservation analysis of predicted binding sites
- Experimental validation in collaboration with wet lab
- Manuscript preparation