# Protein Structure Hunter — MCP-powered Agent Demo

This Kaggle-style notebook provides a runnable demo that:

- Writes a combined MCP FastAPI server (`mcp_server.py`) exposing UniProt, PDBe, and AlphaFold endpoints.
- Writes MCP descriptor files under `tools/` for UniProt, PDB, and AlphaFold.
- Defines MCP client wrappers (`mcp_*`) that call the local server, plus direct REST helpers (`direct_*`) that serve as fallbacks when the MCP server is not running.
- Starts the MCP server in the background (recommended: use `nohup` on Kaggle).
- Runs an async orchestrator that: UniProt → (PDB + AlphaFold in parallel) → scoring → selection.

**How to use:**
1. Run the cells in order. If running on Kaggle, use the `nohup` start method in the server-start cell to keep the server alive.
2. If the environment blocks background servers, the client wrappers gracefully fall back to direct REST calls.

**Note:** This notebook calls public APIs (UniProt, PDBe, AlphaFold). Ensure your environment allows outbound internet.


## 1) Install dependencies

If you need to install dependencies in your environment, uncomment and run the pip command in the next cell.

In [None]:
# Install dependencies if necessary (uncomment to run)
# !pip install fastapi uvicorn requests biopython
print('If you plan to run the MCP server locally, ensure fastapi, uvicorn, and requests are installed.')

In [None]:
# Source code of the combined FastAPI server, kept as a raw string so the
# backslash line continuations inside it are written to disk verbatim.
# The server exposes /uniprot/search, /uniprot/entry, /pdb/search_by_uniprot,
# /pdb/summary and /af/prediction, and binds 127.0.0.1:9001 when run as a script.
# NOTE(review): pdb_search_by_uniprot reads a "mappings" key from the PDBe
# response — confirm against the PDBe API docs that this endpoint returns it.
mcp_code = r'''# mcp_server.py
# Combined MCP server: UniProt, PDBe mapping/metadata, AlphaFold prediction endpoint wrappers.
import uvicorn
from fastapi import FastAPI, HTTPException
import requests

app = FastAPI(title="Bio MCP Tools (UniProt, PDBe, AlphaFold)")

# --- UniProt endpoints ---
UNIPROT_SEARCH = "https://rest.uniprot.org/uniprotkb/search"
UNIPROT_ENTRY = "https://rest.uniprot.org/uniprotkb/{}.json"

@app.get("/uniprot/search")
def search_uniprot(query: str, limit: int = 5):
    params = {"query": query, "format": "json", "size": limit}
    r = requests.get(UNIPROT_SEARCH, params=params, timeout=30)
    r.raise_for_status()
    data = r.json()
    return {"results": [item.get("primaryAccession") for item in data.get("results", [])]}

@app.get("/uniprot/entry")
def get_uniprot_entry(accession: str):
    r = requests.get(UNIPROT_ENTRY.format(accession), timeout=30)
    r.raise_for_status()
    raw = r.json()
    go_terms = []
    for xref in raw.get("uniProtKBCrossReferences", []):
        if xref.get("database") == "GO":
            for prop in xref.get("properties", []):
                val = prop.get("value")
                if val:
                    go_terms.append(val)
    seq = raw.get("sequence", {}).get("value")
    name = raw.get("proteinDescription", {})\
        .get("recommendedName", {})\
        .get("fullName", {})\
        .get("value")
    return {"accession": accession, "name": name, "sequence": seq, "length": len(seq) if seq else None, "go_terms": sorted(list(set(go_terms)))}

# --- PDBe mapping and summary endpoints ---
PDBe_MAP = "https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{}"
PDBe_SUMMARY = "https://www.ebi.ac.uk/pdbe/api/pdb/entry/summary/{}"

@app.get("/pdb/search_by_uniprot")
def pdb_search_by_uniprot(accession: str):
    r = requests.get(PDBe_MAP.format(accession), timeout=30)
    if r.status_code == 404:
        return {"mappings": []}
    r.raise_for_status()
    data = r.json().get(accession, {}).get("mappings", [])
    return {"mappings": data}

@app.get("/pdb/summary")
def pdb_summary(pdb_id: str):
    r = requests.get(PDBe_SUMMARY.format(pdb_id), timeout=30)
    if r.status_code == 404:
        raise HTTPException(status_code=404, detail="PDB ID not found")
    r.raise_for_status()
    items = r.json().get(pdb_id, [])
    if not items:
        raise HTTPException(status_code=404, detail="No summary for pdb_id")
    return items[0]

# --- AlphaFold API wrapper (EMBL-EBI Alphafold) ---
ALPHAFOLD_API = "https://alphafold.ebi.ac.uk/api/prediction/{}"

@app.get("/af/prediction")
def af_prediction(accession: str):
    r = requests.get(ALPHAFOLD_API.format(accession), timeout=30)
    if r.status_code == 404:
        return {"present": False, "data": None}
    r.raise_for_status()
    data = r.json()
    return {"present": True, "data": data}

if __name__ == "__main__":
    uvicorn.run(app, host="127.0.0.1", port=9001)
'''
# Persist the server source next to the notebook so a later cell can launch it
# with `python mcp_server.py`.
with open('mcp_server.py','w') as f:
    f.write(mcp_code)
print('Wrote mcp_server.py')

In [None]:
import os, json
# One directory per tool group; each receives its own mcp.json descriptor.
for tool_dir in ('tools/uniprot', 'tools/pdb', 'tools/af'):
    os.makedirs(tool_dir, exist_ok=True)

# Descriptor for the two UniProt endpoints served by the local MCP server.
uniprot_mcp = {
    'name': 'uniprot_tool',
    'description': 'Search UniProt and fetch UniProtKB entries',
    'tools': {
        'search': {
            'endpoint': 'http://127.0.0.1:9001/uniprot/search',
            'method': 'GET',
            'params_schema': {'query': 'string', 'limit': 'number'},
        },
        'entry': {
            'endpoint': 'http://127.0.0.1:9001/uniprot/entry',
            'method': 'GET',
            'params_schema': {'accession': 'string'},
        },
    },
}

# Descriptor for the PDBe mapping and summary endpoints.
pdb_mcp = {
    'name': 'pdb_tool',
    'description': 'Search PDB (PDBe) by UniProt accession and fetch summary metadata',
    'tools': {
        'search_by_uniprot': {
            'endpoint': 'http://127.0.0.1:9001/pdb/search_by_uniprot',
            'method': 'GET',
            'params_schema': {'accession': 'string'},
        },
        'summary': {
            'endpoint': 'http://127.0.0.1:9001/pdb/summary',
            'method': 'GET',
            'params_schema': {'pdb_id': 'string'},
        },
    },
}

# Descriptor for the single AlphaFold prediction endpoint.
af_mcp = {
    'name': 'af_tool',
    'description': 'Query AlphaFold DB predictions by UniProt accession',
    'tools': {
        'prediction': {
            'endpoint': 'http://127.0.0.1:9001/af/prediction',
            'method': 'GET',
            'params_schema': {'accession': 'string'},
        },
    },
}

# Serialise each descriptor as indented JSON into its tool directory.
for descriptor_path, descriptor in (
    ('tools/uniprot/mcp.json', uniprot_mcp),
    ('tools/pdb/mcp.json', pdb_mcp),
    ('tools/af/mcp.json', af_mcp),
):
    with open(descriptor_path, 'w') as f:
        json.dump(descriptor, f, indent=2)

print('Wrote tools/uniprot/mcp.json, tools/pdb/mcp.json, tools/af/mcp.json')

In [None]:
import os
import time
import requests
# Start the MCP server in background using nohup (recommended for Kaggle)
print('Starting mcp_server.py with nohup...')
# Remove the previous log so the tail printed on failure belongs to this run.
try:
    os.remove('mcp_server.out')
except FileNotFoundError:
    pass  # first run: nothing to clean up
except OSError as e:
    # Best-effort cleanup only: the shell redirect below truncates the file anyway.
    print('Could not remove old mcp_server.out:', e)
os.system('nohup python mcp_server.py > mcp_server.out 2>&1 &')
# wait a bit for server to start
print('Waiting 2.5 seconds for server to initialize...')
time.sleep(2.5)
# Quick health check. A refused connection usually just means the server is
# still importing its dependencies, so retry a few times before giving up.
health_error = None
for _attempt in range(5):
    try:
        r = requests.get('http://127.0.0.1:9001/uniprot/search', params={'query':'TP53','limit':1}, timeout=5)
    except requests.exceptions.ConnectionError as e:
        health_error = e
        time.sleep(1.0)
    except Exception as e:
        # Any other failure (e.g. a read timeout) will not be fixed by waiting.
        health_error = e
        break
    else:
        health_error = None
        print('Server responded: status', r.status_code)
        break
if health_error is not None:
    print('Server health-check failed:', health_error)
    print('Inspect mcp_server.out for logs. If missing packages, install them via pip.')
    if os.path.exists('mcp_server.out'):
        print('\n--- mcp_server.out (last 50 lines) ---')
        # errors='replace' keeps the log tail printable even if it holds odd bytes.
        with open('mcp_server.out', 'r', errors='replace') as f:
            for line in f.readlines()[-50:]:
                print(line.rstrip())


In [None]:
import requests
# Base URL of the local MCP server; matches the host and port that
# mcp_server.py binds when run as a script (127.0.0.1:9001).
MCP_BASE = 'http://127.0.0.1:9001'

# UniProt
def mcp_uniprot_search(query, limit=5):
    """Search UniProt through the local MCP server.

    Sends ``query`` and ``limit`` as query parameters to ``/uniprot/search``
    under ``MCP_BASE`` (20 s timeout) and returns the decoded JSON body.
    An HTTP error status is raised by ``raise_for_status``.
    """
    response = requests.get(
        MCP_BASE + "/uniprot/search",
        params=dict(query=query, limit=limit),
        timeout=20,
    )
    response.raise_for_status()
    return response.json()

def mcp_uniprot_entry(accession):
    """Fetch one UniProt entry through the local MCP server.

    Calls ``/uniprot/entry`` under ``MCP_BASE`` with ``accession`` as the
    query parameter (20 s timeout) and returns the decoded JSON body.
    An HTTP error status is raised by ``raise_for_status``.
    """
    response = requests.get(
        MCP_BASE + "/uniprot/entry",
        params=dict(accession=accession),
        timeout=20,
    )
    response.raise_for_status()
    return response.json()

# PDB
def mcp_pdb_search_by_uniprot(accession):
    """Ask the local MCP server for PDB mappings of a UniProt accession.

    Calls ``/pdb/search_by_uniprot`` under ``MCP_BASE`` (20 s timeout) and
    returns the decoded JSON body. An HTTP error status is raised by
    ``raise_for_status``.
    """
    response = requests.get(
        MCP_BASE + "/pdb/search_by_uniprot",
        params=dict(accession=accession),
        timeout=20,
    )
    response.raise_for_status()
    return response.json()

def mcp_pdb_summary(pdb_id):
    """Fetch the summary record of a PDB entry through the local MCP server.

    Calls ``/pdb/summary`` under ``MCP_BASE`` with ``pdb_id`` as the query
    parameter (20 s timeout) and returns the decoded JSON body. An HTTP error
    status is raised by ``raise_for_status``.
    """
    response = requests.get(
        MCP_BASE + "/pdb/summary",
        params=dict(pdb_id=pdb_id),
        timeout=20,
    )
    response.raise_for_status()
    return response.json()

# AlphaFold
def mcp_af_prediction(accession):
    """Query the AlphaFold prediction endpoint of the local MCP server.

    Calls ``/af/prediction`` under ``MCP_BASE`` with ``accession`` as the
    query parameter (20 s timeout) and returns the decoded JSON body. An HTTP
    error status is raised by ``raise_for_status``.
    """
    response = requests.get(
        MCP_BASE + "/af/prediction",
        params=dict(accession=accession),
        timeout=20,
    )
    response.raise_for_status()
    return response.json()

# Fallback direct REST
# URL templates for calling UniProt directly, bypassing the MCP server.
# UNIPROT_ENTRY takes the accession as its single positional field.
# UNIPROT_SEARCH_SIMPLE takes a `query` field and always requests 5 results,
# because `size=5` is fixed inside the template.
UNIPROT_ENTRY = 'https://rest.uniprot.org/uniprotkb/{}.json'
UNIPROT_SEARCH_SIMPLE = 'https://rest.uniprot.org/uniprotkb/search?query={query}&format=json&size=5'

def direct_uniprot_entry(accession):
    """Fetch a UniProtKB entry straight from the UniProt REST API.

    Returns a dict with ``accession``, the recommended full ``name``, the
    ``sequence`` string, its ``length`` (``None`` when no sequence is present)
    and ``go_terms``: the sorted, de-duplicated non-empty property values of
    every cross-reference whose database is ``GO``. An HTTP error status is
    raised by ``raise_for_status``.
    """
    response = requests.get(UNIPROT_ENTRY.format(accession), timeout=30)
    response.raise_for_status()
    entry = response.json()

    # Collect into a set directly; duplicates collapse as they are added.
    go_values = set()
    for ref in entry.get('uniProtKBCrossReferences', []):
        if ref.get('database') != 'GO':
            continue
        go_values.update(
            p.get('value') for p in ref.get('properties', []) if p.get('value')
        )

    sequence = entry.get('sequence', {}).get('value')
    description = entry.get('proteinDescription', {})
    full_name = description.get('recommendedName', {}).get('fullName', {})
    return {
        'accession': accession,
        'name': full_name.get('value'),
        'sequence': sequence,
        'length': len(sequence) if sequence else None,
        'go_terms': sorted(go_values),
    }

def direct_uniprot_search(query, limit=5):
    """Search UniProtKB directly via the UniProt REST API.

    Args:
        query: UniProt search expression (free text or field query).
        limit: Maximum number of hits to request. It is sent as the ``size``
            parameter, so the caller's value is honoured rather than a fixed 5.

    Returns:
        ``{'results': [...]}`` holding the ``primaryAccession`` of each hit,
        in the order UniProt returned them.

    Raises:
        requests.HTTPError: if UniProt answers with an error status.
    """
    # Let requests encode the parameters, as the MCP server's own search
    # endpoint does, instead of formatting them into a URL by hand.
    r = requests.get(
        'https://rest.uniprot.org/uniprotkb/search',
        params={'query': query, 'format': 'json', 'size': limit},
        timeout=30,
    )
    r.raise_for_status()
    data = r.json()
    hits = [item.get('primaryAccession') for item in data.get('results', [])]
    return {'results': hits}

def direct_pdb_search_by_uniprot(accession):
    """Look up PDB mappings for a UniProt accession directly at PDBe.

    A 404 response yields ``{'mappings': []}``; any other error status is
    raised by ``raise_for_status``. Otherwise the ``mappings`` list found under
    the accession key of the response is returned (empty when absent).
    """
    response = requests.get(
        'https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{}'.format(accession),
        timeout=20,
    )
    if response.status_code != 404:
        response.raise_for_status()
        per_accession = response.json().get(accession, {})
        return {'mappings': per_accession.get('mappings', [])}
    return {'mappings': []}

def direct_pdb_summary(pdb_id):
    """Fetch the first summary record of a PDB entry directly from PDBe.

    Args:
        pdb_id: PDB identifier, in any letter case.

    Returns:
        The first summary dict listed for the entry.

    Raises:
        requests.HTTPError: if PDBe answers with an error status.
        IndexError: if the response holds no summary for ``pdb_id``.
    """
    url = f'https://www.ebi.ac.uk/pdbe/api/pdb/entry/summary/{pdb_id}'
    r = requests.get(url, timeout=20)
    r.raise_for_status()
    payload = r.json()
    # NOTE(review): PDBe appears to key its responses by lower-case id — confirm.
    # The second lookup is harmless if not, and keeps upper-case ids working.
    entries = payload.get(pdb_id) or payload.get(str(pdb_id).lower()) or []
    if not entries:
        # Still an IndexError (as the bare `[0]` used to raise), but readable.
        raise IndexError(f'No PDBe summary returned for {pdb_id!r}')
    return entries[0]

def direct_af_prediction(accession):
    """Query the AlphaFold DB prediction API directly for an accession.

    Returns ``{'present': False, 'data': None}`` on a 404 response and
    ``{'present': True, 'data': <decoded JSON>}`` otherwise. Any other error
    status is raised by ``raise_for_status``.
    """
    response = requests.get(
        'https://alphafold.ebi.ac.uk/api/prediction/{}'.format(accession),
        timeout=20,
    )
    if response.status_code != 404:
        response.raise_for_status()
        return {'present': True, 'data': response.json()}
    return {'present': False, 'data': None}

print('MCP client wrappers loaded')

In [None]:
# Orchestrator and scoring helpers
import asyncio
from functools import partial

def normalize_value(v, low, high, invert=False):
    """Map ``v`` linearly from ``[low, high]`` onto ``[0.0, 1.0]``.

    Args:
        v: Value to normalise; anything ``float()`` accepts, or ``None``.
        low: Lower bound of the input range (maps to 0.0, or 1.0 if inverted).
        high: Upper bound of the input range (maps to 1.0, or 0.0 if inverted).
        invert: When true, return ``1 - fraction`` so smaller inputs score higher.

    Returns:
        A float in ``[0.0, 1.0]``. Values outside the range are clamped.
        ``0.0`` is returned when ``v`` is ``None``, not convertible to float,
        NaN, or when the range is empty (``high == low``).
    """
    if v is None:
        return 0.0
    try:
        vf = float(v)
    except (TypeError, ValueError, OverflowError):
        return 0.0
    # `vf != vf` is true only for NaN, which would otherwise slip through the
    # clamp and poison every score computed from it. An empty range would
    # divide by zero below.
    if vf != vf or high == low:
        return 0.0
    vf = max(min(vf, high), low)
    fraction = (vf - low) / (high - low)
    return 1.0 - fraction if invert else fraction

def score_structure(pdb_meta, mapping, af_info=None):
    """Score one structure candidate and report the components used.

    Args:
        pdb_meta: Summary dict of a PDB entry, or ``None`` for a predicted
            model. The keys read are ``resolution``, ``experimental_method``
            and ``ligands``.
        mapping: Dict with ``length_aligned`` and ``unp_length`` used to
            compute sequence coverage; may be ``None`` or empty.
        af_info: Optional dict with ``plddt_avg`` for AlphaFold models.

    Returns:
        ``(score, components)`` where ``components`` is a dict of the
        individual sub-scores that entered ``score``.
    """
    coverage = 0.0
    if mapping:
        aligned = mapping.get('length_aligned') or 0
        unp_len = mapping.get('unp_length') or 1  # `or 1` avoids dividing by zero
        coverage = aligned / unp_len
    cov_score = max(0.0, min(1.0, coverage))

    meta = pdb_meta or {}
    res_score = normalize_value(meta.get('resolution'), 0.8, 20.0, invert=True)

    # NOTE(review): the PDBe summary appears to report experimental_method as
    # a list of strings — confirm. Calling .lower() on a list would raise, so
    # collections are flattened to one string before matching.
    raw_method = meta.get('experimental_method')
    if isinstance(raw_method, (list, tuple, set)):
        method = ' '.join(str(part) for part in raw_method)
    else:
        method = str(raw_method or '')
    method = method.lower()

    method_score = 0.5
    if 'x-ray' in method:
        method_score = 1.0
    elif 'electron' in method or 'cryo' in method:
        method_score = 0.8
    elif 'nmr' in method:
        method_score = 0.6

    ligand_score = 1.0 if meta.get('ligands') else 0.5

    af_conf = 0.0
    if af_info and af_info.get('plddt_avg'):
        af_conf = normalize_value(af_info['plddt_avg'], 0, 100)

    if pdb_meta is None and af_info:
        # Predicted model: coverage and pLDDT confidence plus a fixed 0.2 * 0.5 term.
        score = 0.4 * cov_score + 0.4 * af_conf + 0.2 * 0.5
        components = {'cov': cov_score, 'af_conf': af_conf}
    else:
        # Experimental structure. Coverage enters twice (0.30 and 0.15) and the
        # last two terms are constants — presumably placeholders; TODO confirm.
        score = (0.30 * cov_score) + (0.25 * ((res_score + method_score)/2.0)) + (0.15 * cov_score) + (0.10 * ligand_score) + (0.10 * 0.8) + (0.10 * 0.1)
        components = {'cov': cov_score, 'res_score': res_score, 'method_score': method_score, 'ligand_score': ligand_score}
    return score, components

def _call_with_fallback(primary, fallback, failure_message, *args, **kwargs):
    """Return ``primary(*args, **kwargs)``; if it raises, report and use ``fallback``."""
    try:
        return primary(*args, **kwargs)
    except Exception as e:
        print(failure_message, e)
        return fallback(*args, **kwargs)


async def process_protein_async(protein_query: str):
    """Resolve a protein query to ranked structure candidates.

    Pipeline: UniProt search -> UniProt entry -> PDB mappings and AlphaFold
    prediction (fetched in parallel worker threads) -> scoring -> ranking.
    Every MCP call falls back to the matching ``direct_*`` REST helper when
    the MCP server is unreachable.

    Args:
        protein_query: UniProt search expression, e.g. ``'TP53 human'``.

    Returns:
        ``None`` when UniProt returns no hits; otherwise a dict with
        ``accession``, ``uniprot`` (the entry) and ``candidates`` sorted by
        descending score.
    """
    hits = _call_with_fallback(
        mcp_uniprot_search, direct_uniprot_search,
        'MCP UniProt search failed, using direct REST fallback:',
        protein_query, limit=3,
    )
    accessions = hits.get('results') if hits else None
    if not accessions:
        print('No UniProt hits found for query:', protein_query)
        return None
    accession = accessions[0]
    uniprot_entry = _call_with_fallback(
        mcp_uniprot_entry, direct_uniprot_entry,
        'MCP UniProt entry failed, fallback direct:',
        accession,
    )

    # get_running_loop() is the supported way to reach the loop from inside a
    # coroutine. The two lookups are independent, so run them concurrently.
    loop = asyncio.get_running_loop()
    pdb_result, af_result = await asyncio.gather(
        loop.run_in_executor(None, partial(
            _call_with_fallback, mcp_pdb_search_by_uniprot, direct_pdb_search_by_uniprot,
            'MCP PDB search failed, using direct REST fallback:', accession)),
        loop.run_in_executor(None, partial(
            _call_with_fallback, mcp_af_prediction, direct_af_prediction,
            'MCP AlphaFold lookup failed, using direct REST fallback:', accession)),
        return_exceptions=True,
    )
    # If MCP and direct REST both fail for one source, keep the other source's
    # candidates instead of aborting the whole run.
    if isinstance(pdb_result, Exception):
        print('PDB mapping lookup failed; continuing without PDB candidates:', pdb_result)
        pdb_result = None
    if isinstance(af_result, Exception):
        print('AlphaFold lookup failed; continuing without an AlphaFold candidate:', af_result)
        af_result = None

    mappings = pdb_result.get('mappings', []) if pdb_result else []
    candidates = []
    max_pdbs = 10  # cap the number of per-entry summary requests
    for m in mappings[:max_pdbs]:
        pdb_id = m.get('pdb_id')
        if not pdb_id:
            # Nothing to look up or report without an identifier.
            continue
        try:
            pdb_meta = mcp_pdb_summary(pdb_id)
        except Exception:
            try:
                pdb_meta = direct_pdb_summary(pdb_id)
            except Exception:
                # Best effort: score the mapping without metadata.
                pdb_meta = None
        score, comps = score_structure(pdb_meta, m, af_info=None)
        candidates.append({'source':'PDB','id':pdb_id,'chain':m.get('chain_id'),'mapping':m,'meta':pdb_meta,'score':score,'components':comps})

    if af_result and af_result.get('present'):
        af_models = af_result.get('data') or []
        # Pool every per-residue pLDDT list found on the returned models.
        plddt_vals = []
        for model in af_models:
            if isinstance(model, dict) and isinstance(model.get('plddt'), list):
                plddt_vals.extend(model['plddt'])
        plddt_avg = sum(plddt_vals) / len(plddt_vals) if plddt_vals else None
        af_info = {'plddt_avg': plddt_avg, 'source_id': f'AF-{accession}'}
        # The model is scored as covering the full UniProt sequence length.
        score, comps = score_structure(None, {'length_aligned': uniprot_entry.get('length',0), 'unp_length': uniprot_entry.get('length',1)}, af_info=af_info)
        candidates.append({'source':'AF','id':f'AF-{accession}','chain':None,'mapping':None,'meta':af_info,'score':score,'components':comps})

    ranked = sorted(candidates, key=lambda x: x.get('score',0), reverse=True)
    return {'accession': accession, 'uniprot': uniprot_entry, 'candidates': ranked}

def process_protein(protein_query):
    """Run ``process_protein_async`` to completion from synchronous code.

    Works both in a plain script and inside a notebook kernel. When an event
    loop is already running in the calling thread, ``run_until_complete``
    would raise "This event loop is already running", so the coroutine is
    executed on a fresh loop in a helper thread instead.

    Args:
        protein_query: UniProt search expression passed to the orchestrator.

    Returns:
        Whatever ``process_protein_async`` returns for ``protein_query``.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop running in this thread: drive the coroutine directly.
        return asyncio.run(process_protein_async(protein_query))

    from concurrent.futures import ThreadPoolExecutor

    # The coroutine is created inside the worker so it is bound to that loop.
    with ThreadPoolExecutor(max_workers=1) as pool:
        future = pool.submit(lambda: asyncio.run(process_protein_async(protein_query)))
        return future.result()


In [None]:
# Demo driver: resolve one query end to end and list the five best candidates.
# Edit the query string below to try another protein.
print('Running demo for TP53 human — this calls remote services...')
res = None
try:
    res = process_protein('TP53 human')
    if not res:
        print('No results returned')
    else:
        print('Accession:', res['accession'])
        print('Top candidates (id - score):')
        top_five = res['candidates'][:5]
        for c in top_five:
            print(f"  {c['source']} {c['id']}  score={c['score']:.3f}  components={c['components']}")
except Exception as e:
    # Any failure (network, server down, unexpected payload) is reported, not raised.
    print('Error running orchestrator:', e)
    print('If MCP server not reachable, ensure mcp_server.py is running (see earlier cells) and outbound internet is allowed.')

In [None]:
def docking_prep_for_candidate(candidate):
    """Build a plain-text docking-preparation note for one candidate.

    The first line names the candidate and its score to three decimals. A
    ``PDB`` candidate then gets its chain, the resolution and method (when a
    resolution is present), its ligand ids (when any) and a four-item
    checklist. An ``AF`` candidate gets its average pLDDT and a three-item
    checklist. Any other source yields only the first line.

    Returns:
        The note as a single newline-joined string.
    """
    source = candidate.get('source')
    report = [
        f"Selected: {source} {candidate.get('id')} (score={candidate.get('score'):.3f})"
    ]
    details = candidate.get('meta') or {}

    if source == 'PDB':
        report.append(f"Chain: {candidate.get('chain')}")
        if details.get('resolution'):
            report.append(f"Resolution: {details.get('resolution')} Å (method: {details.get('experimental_method')})")
        ligand_ids = [lig.get('chem_comp_id','?') for lig in (details.get('ligands') or [])]
        if ligand_ids:
            report.append(f"Found ligands: {', '.join(ligand_ids)}")
        report += [
            'Checklist:',
            ' - Download biological assembly if relevant (preferred) vs asymmetric unit',
            ' - Remove crystallographic waters unless required',
            ' - Keep cofactors if biologically relevant',
            ' - Protonate using pdb2pqr or reduce at pH 7.4',
        ]
    elif source == 'AF':
        report.append(f"AlphaFold model average pLDDT: {details.get('plddt_avg')}")
        report += [
            'Checklist:',
            ' - Inspect pLDDT at docking site; model loops may be low-confidence',
            ' - Consider local refinement or short MD minimization before docking',
            ' - Protonate and energy-minimize using pdbfixer or OpenMM',
        ]
    return '\n'.join(report)

# Print the prep note for the best-ranked candidate, provided the demo cell
# ran and produced at least one candidate.
if 'res' not in globals() or not res or not res.get('candidates'):
    print('No candidate available for docking prep.')
else:
    print('\n--- Docking prep for top candidate ---')
    print(docking_prep_for_candidate(res['candidates'][0]))

## Final notes and next steps

- If the MCP server failed to start, inspect `mcp_server.out` for errors and ensure dependencies (`fastapi`, `uvicorn`, `requests`) are installed.
- On Kaggle, use the `nohup` method (used above) to keep the server running between cells.
- To extend this notebook: add sequence identity calculations (Biopython pairwise), site-aware pLDDT scoring, and a binding-site detector.
- For submission: include README.md with instructions, a short demo video, and the notebook as the runnable demonstration of the agent.

To share this demo, package the notebook together with `mcp_server.py` and the `tools/` descriptors (for example as a ZIP archive) for upload to GitHub or Kaggle.