# Quick Start: Cryptic IP Binding Site Detection

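This notebook walks through the first steps of the workflow using real data: it downloads the AlphaFold model of ADAR2 (a PDB-format file), inspects it with BioPython, and compares it against the IP6-coordinating residues reported for the PDB crystal structure 1ZY7.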

## Setup

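The cell below installs the required packages and clones the repository when running in Google Colab. Local and Binder sessions are assumed to have the dependencies installed already, so there it only adds the repository root to `sys.path`.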

In [None]:
import sys
import os
from pathlib import Path

# Check if running in Colab
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    print('Running in Google Colab - installing dependencies...')
    !pip install -q biopython requests pandas matplotlib seaborn
    # Clone repository
    if not Path('cryptic-ip-binding-sites').exists():
        !git clone https://github.com/Tommaso-R-Marena/cryptic-ip-binding-sites.git
        os.chdir('cryptic-ip-binding-sites')
    sys.path.insert(0, str(Path.cwd()))
else:
    # Local or Binder
    sys.path.insert(0, str(Path.cwd().parent))

print('Setup complete!')


## 1. Download ADAR2 Structure from AlphaFold

Let's start by downloading a real protein structure.

In [None]:
import requests
from pathlib import Path

# Create the local cache directory for downloaded structures
data_dir = Path('notebook_data')
data_dir.mkdir(exist_ok=True)

# Download ADAR2 from AlphaFold
uniprot_id = 'P78563'  # ADAR2
alphafold_url = f'https://alphafold.ebi.ac.uk/files/AF-{uniprot_id}-F1-model_v4.pdb'

adar2_file = data_dir / f'AF-{uniprot_id}-F1-model_v4.pdb'

# An empty file counts as "not cached" so a previously failed download is retried
if not adar2_file.exists() or adar2_file.stat().st_size == 0:
    print('Downloading ADAR2 structure from AlphaFold...')
    # Without a timeout, requests waits indefinitely on an unresponsive server
    response = requests.get(alphafold_url, timeout=30)
    response.raise_for_status()
    # Write to a temporary sibling and rename it into place, so an interrupted
    # write never leaves a truncated file that later runs would treat as cached
    partial_file = adar2_file.with_name(adar2_file.name + '.part')
    partial_file.write_bytes(response.content)
    partial_file.replace(adar2_file)
    print(f'✓ Downloaded: {adar2_file}')
else:
    print(f'✓ Using cached: {adar2_file}')

print(f'\nFile size: {adar2_file.stat().st_size / 1024:.1f} KB')


## 2. Load and Inspect the Structure

Use BioPython to examine the protein structure.

In [None]:
from Bio import PDB

# Read the downloaded PDB file and keep its first model
parser = PDB.PDBParser(QUIET=True)
structure = parser.get_structure('ADAR2', str(adar2_file))
model = structure[0]

# Amino-acid residues only; atoms are taken from every residue in the model
residues = list(filter(PDB.is_aa, model.get_residues()))
atoms = list(model.get_atoms())

# AlphaFold stores pLDDT in the B-factor column. The mean below is taken over
# atoms, not residues.
plddt_scores = [atom.bfactor for atom in atoms]
avg_plddt = sum(plddt_scores) / len(plddt_scores)

summary_lines = [
    'ADAR2 Structure Summary:',
    f'  Residues: {len(residues)}',
    f'  Atoms: {len(atoms)}',
    f'  Average pLDDT: {avg_plddt:.1f}',
    '\nConfidence interpretation:',
    '  pLDDT > 90: Very high confidence',
    '  pLDDT > 70: Confident',
    '  pLDDT > 50: Low confidence',
    '  pLDDT < 50: Very low confidence',
]
print('\n'.join(summary_lines))


## 3. Visualize Confidence Scores

Plot the pLDDT scores along the sequence.

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def _confidence_color(score):
    """Return the bar color for a pLDDT score, using the cutoffs 90 / 70 / 50."""
    for cutoff, hex_code in ((90, '#0053D6'), (70, '#65CBF3'), (50, '#FFDB13')):
        if score > cutoff:
            return hex_code
    return '#FF7D45'  # Orange: 50 or below


# Per-residue pLDDT is read from the CA atom's B-factor; residues without a CA
# atom are left out
ca_records = [(res.id[1], res['CA'].bfactor) for res in residues if 'CA' in res]
residue_numbers = [number for number, _ in ca_records]
residue_plddt = [plddt for _, plddt in ca_records]

# One color per bar, by confidence band
colors = [_confidence_color(plddt) for plddt in residue_plddt]

fig, ax = plt.subplots(figsize=(12, 4))
ax.bar(residue_numbers, residue_plddt, color=colors, width=1.0)

# Dashed guide lines at the band boundaries
for level, line_color, label in (
    (90, 'blue', 'Very high'),
    (70, 'cyan', 'Confident'),
    (50, 'yellow', 'Low'),
):
    ax.axhline(y=level, color=line_color, linestyle='--', alpha=0.3, label=label)

ax.set(
    xlabel='Residue Number',
    ylabel='pLDDT Confidence',
    title='ADAR2 AlphaFold Confidence Scores',
    ylim=(0, 100),
)
ax.legend()
ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

n_confident = sum(1 for plddt in residue_plddt if plddt > 70)
print(f'High confidence residues (>70): {n_confident} / {len(residue_plddt)}')


## 4. Find Basic Residues

Locate arginine, lysine, and histidine residues that could coordinate phosphates.

In [None]:
from collections import Counter

# Residue names treated as basic (candidate phosphate-coordinating side chains)
BASIC_RESNAMES = ('ARG', 'LYS', 'HIS')

# One record per basic residue: type, residue number and parent chain id
basic_residues = [
    {
        'type': residue.resname,
        'number': residue.id[1],
        'chain': residue.parent.id
    }
    for residue in residues
    if residue.resname in BASIC_RESNAMES
]

# Count each type in a single pass
type_counts = Counter(entry['type'] for entry in basic_residues)

print(f'Found {len(basic_residues)} basic residues:')
print(f'  Arginine (ARG): {type_counts["ARG"]}')
print(f'  Lysine (LYS): {type_counts["LYS"]}')
print(f'  Histidine (HIS): {type_counts["HIS"]}')

# Known IP6-binding residues in ADAR2 (from Macbeth et al. 2005)
known_ip6_residues = [376, 519, 522, 651, 672]

# Lookups by residue number; setdefault keeps the first occurrence if a number
# appears more than once (e.g. in several chains)
basic_by_number = {}
for entry in basic_residues:
    basic_by_number.setdefault(entry['number'], entry)

resname_by_number = {}
for residue in residues:
    resname_by_number.setdefault(residue.id[1], residue.resname)

print(f'\nKnown IP6-coordinating residues from crystal structure (1ZY7):')
for res_num in known_ip6_residues:
    if res_num in basic_by_number:
        print(f'  Residue {res_num}: {basic_by_number[res_num]["type"]}')
    elif res_num in resname_by_number:
        # Report non-basic residues instead of silently dropping them
        print(f'  Residue {res_num}: {resname_by_number[res_num]} (not ARG/LYS/HIS)')
    else:
        print(f'  Residue {res_num}: not present in this model')


## 5. Summary

This notebook demonstrated:

1. ✓ Downloading real structural data from AlphaFold
2. ✓ Loading and parsing PDB files with BioPython
3. ✓ Extracting and visualizing confidence scores
4. ✓ Identifying basic residues for IP coordination

**Next Steps:**
- See `02_ADAR2_Analysis.ipynb` for complete pocket detection
- See `04_Validation_Analysis.ipynb` for positive/negative controls
- See `03_Proteome_Screening.ipynb` for large-scale screening