# In [3]:
"""
Task 4: ChEMBL Molecule Search (EGFR)

Objective:
Query ChEMBL API for molecules targeting EGFR
and extract activity values + molecule metadata.
"""

import requests
import json

# Endpoint that fetch_egfr_activities() sends its GET request to.
BASE_URL = "https://www.ebi.ac.uk/chembl/api/data/activity"


def fetch_egfr_activities(limit=50, timeout=30):
    """
    Fetch activity data for EGFR (CHEMBL203).

    Sends a single GET request to ``BASE_URL`` filtered on
    ``target_chembl_id=CHEMBL203`` and returns the decoded JSON body.
    Only one page is requested; no pagination is followed.

    Args:
        limit: Value sent as the ``limit`` query parameter (number of
            records requested in this one call).
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it a stalled connection
            would block the script indefinitely.

    Returns:
        The parsed JSON response (whatever ``response.json()`` yields).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    params = {
        "target_chembl_id": "CHEMBL203",
        "limit": limit,
        "format": "json",
    }

    response = requests.get(BASE_URL, params=params, timeout=timeout)
    # Fail loudly on HTTP errors rather than trying to parse an error page.
    response.raise_for_status()

    return response.json()


def extract_molecule_data(data, max_molecules=20):
    """
    Extract required molecule + activity fields.

    Walks ``data["activities"]`` in order and keeps every record that has
    both a ``standard_value`` and a ``standard_type``. Records are not
    de-duplicated, so the same ``molecule_chembl_id`` may appear more than
    once in the result.

    Args:
        data: Mapping holding an ``"activities"`` list of activity records.
            A missing key or an explicit ``None`` value is treated as an
            empty list.
        max_molecules: Maximum number of records to return (default 20).
            Pass ``None`` to return every valid record.

    Returns:
        A list of dicts with the keys ``molecule_chembl_id``,
        ``activity_type``, ``activity_value``, ``activity_units`` and
        ``assay_description``. Fields absent from a record are ``None``.
    """
    # ``or []`` also covers a payload where "activities" is present but null.
    activities = data.get("activities") or []

    molecules = []
    for activity in activities:
        if max_molecules is not None and len(molecules) >= max_molecules:
            break

        # Skip malformed rows explicitly instead of hiding every possible
        # error behind a blanket ``except Exception``.
        if not isinstance(activity, dict):
            continue

        value = activity.get("standard_value")
        activity_type = activity.get("standard_type")

        # Compare against None / "" rather than relying on truthiness, so a
        # legitimate numeric value of 0 is not discarded.
        if value is None or value == "" or not activity_type:
            continue

        molecules.append(
            {
                "molecule_chembl_id": activity.get("molecule_chembl_id"),
                "activity_type": activity_type,
                "activity_value": value,
                "activity_units": activity.get("standard_units"),
                "assay_description": activity.get("assay_description"),
            }
        )

    return molecules


def save_to_json(data, filename="egfr_molecules.json"):
    """
    Write ``data`` to ``filename`` as indented JSON and report the count.

    The file is opened in write mode with UTF-8 encoding, so any existing
    content is replaced. After writing, a one-line summary containing
    ``len(data)`` and the file name is printed.
    """
    with open(filename, mode="w", encoding="utf-8") as out_file:
        json.dump(data, out_file, indent=2)

    record_count = len(data)
    print(f"Saved {record_count} molecules to {filename}")


def main():
    """
    Run the whole pipeline: download, extract, save.

    Requests 100 activity records, passes the response through
    extract_molecule_data(), and writes the result with save_to_json()
    using its default file name. Progress is printed at each step.
    """
    # Step 1: download the raw activity payload.
    print("Fetching EGFR activity data from ChEMBL...")
    api_payload = fetch_egfr_activities(limit=100)

    # Step 2: reduce the payload to the fields of interest.
    print("Extracting molecule information...")
    extracted = extract_molecule_data(api_payload)
    print(f"Total molecules extracted: {len(extracted)}")

    # Step 3: persist the extracted records.
    save_to_json(extracted)


if __name__ == "__main__":
    main()

# Output:
# Fetching EGFR activity data from ChEMBL...
# Extracting molecule information...
# Total molecules extracted: 20
# Saved 20 molecules to egfr_molecules.json
