In [23]:
import requests  
import pandas as pd  

# Step 1: Define a list of drug names we are interested in
drug_names = ['paracetamol', 'ibuprofen', 'celecoxib']

# Step 2: Create an empty list to collect drug information
drug_data = []

# Step 3: Loop through each drug to query ChEMBL and retrieve SMILES
for drug in drug_names:
    # Use ChEMBL API to search for the drug and get its ChEMBL ID
    url = f"https://www.ebi.ac.uk/chembl/api/data/molecule/search?q={drug}"
    response = requests.get(url, headers={"Accept": "application/json"})

    # If the search is successful (status code 200)
    if response.status_code == 200:
        results = response.json()
        # Get the first hit from the search result
        if results['molecules']:
            chembl_id = results['molecules'][0]['molecule_chembl_id']

            # Use the ChEMBL ID to get molecule details (like SMILES)
            mol_url = f"https://www.ebi.ac.uk/chembl/api/data/molecule/{chembl_id}.json"
            mol_response = requests.get(mol_url)

            if mol_response.status_code == 200:
                mol_data = mol_response.json()

                # Extract canonical SMILES string
                smiles = mol_data.get('molecule_structures', {}).get('canonical_smiles', 'NA')

                # Append results to the list
                drug_data.append({'drug': drug, 'chembl_id': chembl_id, 'smiles': smiles})
    else:
        print(f"Failed to retrieve data for {drug}")

# Step 4: Convert collected data into a pandas DataFrame
df_smiles = pd.DataFrame(drug_data)

# Step 5: Display the results
print("SMILES DataFrame:")
print(df_smiles)

# Optional: Save to CSV
df_smiles.to_csv("drug_smiles.csv", index=False)

SMILES DataFrame:
          drug  chembl_id                                             smiles
0  paracetamol  CHEMBL112                                 CC(=O)Nc1ccc(O)cc1
1    ibuprofen  CHEMBL521                         CC(C)Cc1ccc(C(C)C(=O)O)cc1
2    celecoxib  CHEMBL118  Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2...


In [24]:
# Step 1: Define UniProt IDs for COX-1 and COX-2
protein_ids = {
    "COX1": "P23219",  # PTGS1 - Cyclooxygenase-1
    "COX2": "P35354"   # PTGS2 - Cyclooxygenase-2
}

# Step 2: Create a dictionary to store protein sequences
protein_seqs = {}

# Step 3: Fetch sequences using UniProt REST API
for name, uniprot_id in protein_ids.items():
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
    response = requests.get(url)

    if response.status_code == 200:
        fasta_data = response.text

        # Remove the FASTA header (starting with '>') and join the sequence lines
        sequence = ''.join(fasta_data.split('\n')[1:])
        protein_seqs[name] = sequence
    else:
        print(f"Error fetching {name} sequence.")

# Step 4: Display a preview of the sequences
for name, seq in protein_seqs.items():
    print(f"\n{name} Sequence (first 100 amino acids):\n{seq[:100]}...\n")

# Optional: Save to FASTA files
with open("COX1.fasta", "w") as f:
    f.write(f">COX1|P23219\n{protein_seqs['COX1']}")

with open("COX2.fasta", "w") as f:
    f.write(f">COX2|P35354\n{protein_seqs['COX2']}")




COX1 Sequence (first 100 amino acids):
MSRSLLLWFLLFLLLLPPLPVLLADPGAPTPVNPCCYYPCQHQGICVRFGLDRYQCDCTRTGYSGPNCTIPGLWTWLRNSLRPSPSFTHFLLTHGRWFWE...


COX2 Sequence (first 100 amino acids):
MLARALLLCAVLALSHTANPCCSHPCQNRGVCMSVGFDQYKCDCTRTGFYGENCSTPEFLTRIKLFLKPTPNTVHYILTHFKGFWNVVNNIPFLRNAIMS...



In [25]:
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd

# Example: Load SMILES data from the previous step
df_smiles = pd.read_csv("drug_smiles.csv")

# Function to generate Morgan fingerprint
def generate_morgan_fp(smiles, radius=2, nBits=1024):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits)
        return list(fp)
    else:
        return [0] * nBits  # Return zero vector if SMILES is invalid

# Apply the function to all SMILES
df_smiles["fingerprint"] = df_smiles["smiles"].apply(generate_morgan_fp)

# Expand fingerprint list into separate columns
fingerprint_df = pd.DataFrame(df_smiles["fingerprint"].tolist(), index=df_smiles.index)
fingerprint_df.columns = [f"FP_{i}" for i in range(fingerprint_df.shape[1])]

# Combine with drug name
ligand_features = pd.concat([df_smiles[["drug", "chembl_id"]], fingerprint_df], axis=1)

# Preview
print(ligand_features.head())

# Save for later steps
ligand_features.to_csv("ligand_features.csv", index=False)


          drug  chembl_id  FP_0  FP_1  FP_2  FP_3  FP_4  FP_5  FP_6  FP_7  \
0  paracetamol  CHEMBL112     0     0     0     0     0     0     0     0   
1    ibuprofen  CHEMBL521     0     1     0     0     0     0     0     0   
2    celecoxib  CHEMBL118     0     0     0     0     0     0     0     0   

   ...  FP_1014  FP_1015  FP_1016  FP_1017  FP_1018  FP_1019  FP_1020  \
0  ...        0        0        0        1        0        0        0   
1  ...        0        0        0        0        0        0        0   
2  ...        0        0        0        0        0        0        0   

   FP_1021  FP_1022  FP_1023  
0        0        0        0  
1        0        0        0  
2        0        0        0  

[3 rows x 1026 columns]




In [26]:
# Define the function (taught in class)
def aa_composition(seq):
    freq = {}
    for aa in 'ACDEFGHIKLMNPQRSTVWY':
        freq[aa] = seq.count(aa) / len(seq)
    return pd.Series(freq)

# Example input (from Step 1B)
protein_seqs = {
    "COX1": open("COX1.fasta").read().split("\n", 1)[1].replace("\n", ""),
    "COX2": open("COX2.fasta").read().split("\n", 1)[1].replace("\n", "")
}

# Apply the function
protein_features = pd.DataFrame({
    name: aa_composition(seq) for name, seq in protein_seqs.items()
}).T.reset_index().rename(columns={"index": "protein"})

# Preview
print(protein_features)

# Save for next stage
protein_features.to_csv("protein_features.csv", index=False)



  protein         A         C         D         E         F         G  \
0    COX1  0.041736  0.021703  0.040067  0.060100  0.065109  0.075125   
1    COX2  0.051325  0.021523  0.043046  0.059603  0.062914  0.061258   

          H         I         K  ...         M         N         P         Q  \
0  0.030050  0.041736  0.041736  ...  0.030050  0.031720  0.076795  0.045075   
1  0.031457  0.056291  0.056291  ...  0.024834  0.048013  0.066225  0.051325   

          R         S         T         V         W         Y  
0  0.055092  0.055092  0.050083  0.053422  0.016694  0.045075  
1  0.044702  0.057947  0.056291  0.057947  0.009934  0.044702  

[2 rows x 21 columns]


In [27]:
import pandas as pd
import itertools

# Step 1: Load ligand and protein features
ligands = pd.read_csv("ligand_features.csv")
proteins = pd.read_csv("protein_features.csv")

# Step 2: Create all combinations of ligand-protein pairs
pairs = list(itertools.product(ligands.index, proteins.index))

# Step 3: Initialize an empty list to hold merged feature vectors
merged_data = []

for ligand_idx, protein_idx in pairs:
    ligand_row = ligands.iloc[ligand_idx]
    protein_row = proteins.iloc[protein_idx]

    # Combine info into one row
    combined_row = {
        'drug': ligand_row['drug'],
        'chembl_id': ligand_row['chembl_id'],
        'protein': protein_row['protein']
    }

    # Add all fingerprint columns
    for col in ligand_row.index:
        if col.startswith("FP_"):
            combined_row[f"lig_{col}"] = ligand_row[col]

    # Add all protein feature columns
    for aa in 'ACDEFGHIKLMNPQRSTVWY':
        combined_row[f"prot_{aa}"] = protein_row[aa]

    merged_data.append(combined_row)

# Step 4: Convert to DataFrame
final_df = pd.DataFrame(merged_data)

# Step 5: Preview and Save
print(final_df.head())
final_df.to_csv("drug_protein_features.csv", index=False)


          drug  chembl_id protein  lig_FP_0  lig_FP_1  lig_FP_2  lig_FP_3  \
0  paracetamol  CHEMBL112    COX1         0         0         0         0   
1  paracetamol  CHEMBL112    COX2         0         0         0         0   
2    ibuprofen  CHEMBL521    COX1         0         1         0         0   
3    ibuprofen  CHEMBL521    COX2         0         1         0         0   
4    celecoxib  CHEMBL118    COX1         0         0         0         0   

   lig_FP_4  lig_FP_5  lig_FP_6  ...    prot_M    prot_N    prot_P    prot_Q  \
0         0         0         0  ...  0.030050  0.031720  0.076795  0.045075   
1         0         0         0  ...  0.024834  0.048013  0.066225  0.051325   
2         0         0         0  ...  0.030050  0.031720  0.076795  0.045075   
3         0         0         0  ...  0.024834  0.048013  0.066225  0.051325   
4         0         0         0  ...  0.030050  0.031720  0.076795  0.045075   

     prot_R    prot_S    prot_T    prot_V    prot_W    p

In [28]:
#6
import pandas as pd

# Step 1: Load your feature dataset
df = pd.read_csv("drug_protein_features.csv")

# Step 2: Define known positive interactions (from literature)
positive_pairs = [
    ("paracetamol", "COX1"),
    ("ibuprofen", "COX1"),
    ("ibuprofen", "COX2"),
    ("celecoxib", "COX2")
]

# Step 3: Assign labels
df["label"] = df.apply(lambda row: 1 if (row["drug"], row["protein"]) in positive_pairs else 0, axis=1)

# Step 4: Preview and save
print(df[["drug", "protein", "label"]])
df.to_csv("final_dataset_labeled.csv", index=False)


          drug protein  label
0  paracetamol    COX1      1
1  paracetamol    COX2      0
2    ibuprofen    COX1      1
3    ibuprofen    COX2      1
4    celecoxib    COX1      0
5    celecoxib    COX2      1


In [29]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Step 1: Load the labeled feature dataset
df = pd.read_csv("final_dataset_labeled.csv")

# Step 2: Prepare feature matrix (X) and label vector (y)
# Drop non-numeric columns ('drug', 'protein', 'chembl_id', and 'label') for model input
X = df.drop(columns=["drug", "protein", "chembl_id", "label"])
y = df["label"]

# Step 3: Split the dataset into train and test sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 4: Initialize and train a Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Step 5: Make predictions
y_pred = rf_model.predict(X_test)
y_prob = rf_model.predict_proba(X_test)[:, 1]  # for ROC-AUC

# Step 6: Evaluate model performance
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print(f"ROC-AUC Score: {roc_auc_score(y_test, y_prob):.3f}")




Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.50      1.00      0.67         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2

Confusion Matrix:
[[0 1]
 [0 1]]
ROC-AUC Score: 0.000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [30]:
#8
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Step 1: Load the labeled feature dataset
df = pd.read_csv("final_dataset_labeled.csv")

# Step 2: Prepare feature matrix (X) and label vector (y)
X = df.drop(columns=["drug", "protein", "chembl_id", "label"])
y = df["label"]

# Step 3: Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 4: Initialize and train the model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Step 5: Make predictions
y_pred = rf_model.predict(X_test)
y_prob = rf_model.predict_proba(X_test)[:, 1]

# Step 6: Evaluate model
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print(f"ROC-AUC Score: {roc_auc_score(y_test, y_prob):.3f}")











Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.50      1.00      0.67         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2

Confusion Matrix:
[[0 1]
 [0 1]]
ROC-AUC Score: 0.000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [31]:
# AUGUST 8TH 2 CODE

In [None]:
#GOOD CODE
import requests
import pandas as pd

# Define ChEMBL Target IDs for COX1 and COX2
targets = {
    "COX1": "CHEMBL2095188",
    "COX2": "CHEMBL2094253"
}

# Parameters to fetch only IC50 values
base_params = {
    "standard_type": "IC50",
    "limit": 1000
}

# Function to fetch activity data from ChEMBL
def fetch_activity_data(target_name, chembl_id):
    all_data = []
    offset = 0

    while True:
        params = base_params.copy()
        params.update({
            "target_chembl_id": chembl_id,
            "offset": offset
        })

        url = "https://www.ebi.ac.uk/chembl/api/data/activity.json"
        response = requests.get(url, params=params)

        if response.status_code != 200:
            print(f"Error retrieving {target_name} data.")
            break

        data = response.json()
        activities = data.get("activities", [])
        if not activities:
            break

        all_data.extend(activities)
        offset += 1000

    print(f"{target_name}: Retrieved {len(all_data)} raw activities.")
    return pd.DataFrame(all_data)

# Step 1: Download activity data for COX1 and COX2
df_cox1 = fetch_activity_data("COX1", targets["COX1"])
df_cox2 = fetch_activity_data("COX2", targets["COX2"])

# Step 2: Clean/filter IC50 values in nanomolar (nM)
def clean_and_filter(df):
    df = df[df["standard_value"].notna()]               # Remove missing IC50s
    df = df[df["standard_units"] == "nM"]                # Only nanomolar units
    df = df[df["standard_type"] == "IC50"]               # Only IC50 values
    df = df[df["canonical_smiles"].notna()]              # Make sure SMILES exist
    return df[["canonical_smiles", "standard_value"]]    # Keep only needed columns

df_cox1_clean = clean_and_filter(df_cox1)
df_cox2_clean = clean_and_filter(df_cox2)

# Step 3: Balance the two datasets (same number of ligands)
min_len = min(len(df_cox1_clean), len(df_cox2_clean))
df_cox1_bal = df_cox1_clean.sample(n=min_len, random_state=42)
df_cox2_bal = df_cox2_clean.sample(n=min_len, random_state=42)

# Step 4: Add binary labels (0 = COX1, 1 = COX2)
df_cox1_bal["label"] = 0
df_cox2_bal["label"] = 1

# Step 5: Merge and shuffle the dataset
df_combined = pd.concat([df_cox1_bal, df_cox2_bal], ignore_index=True)
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)

# Step 6: Save to CSV
df_combined.to_csv("balanced_cox1_2_ic50.csv", index=False)
print(f"Final balanced dataset: {len(df_combined)} rows ({min_len} per class)")


COX1: Retrieved 653 raw activities.
COX2: Retrieved 192 raw activities.
Final balanced dataset: 374 rows (187 per class)


In [2]:
import requests
import pandas as pd

# Target ChEMBL IDs
targets = {
    "COX1": "CHEMBL2095188",
    "COX2": "CHEMBL2094253"
}

# Base API parameters
base_params = {
    "standard_type": "IC50",
    "limit": 1000  # fetch max per call
}

# Function to download activities
def fetch_activity_data(target_name, chembl_id):
    all_data = []
    offset = 0

    while True:
        params = base_params.copy()
        params.update({
            "target_chembl_id": chembl_id,
            "offset": offset
        })

        url = "https://www.ebi.ac.uk/chembl/api/data/activity.json"
        response = requests.get(url, params=params)

        if response.status_code != 200:
            print(f"Error retrieving {target_name} data.")
            break

        data = response.json()
        activities = data.get("activities", [])
        if not activities:
            break

        all_data.extend(activities)
        offset += 1000

    print(f"{target_name}: Retrieved {len(all_data)} raw activities.")
    return pd.DataFrame(all_data)

# Step 1: Fetch activities
df_cox1 = fetch_activity_data("COX1", targets["COX1"])
df_cox2 = fetch_activity_data("COX2", targets["COX2"])

# Step 2: Filter the data
def clean_and_filter(df):
    df = df[df["standard_value"].notna()]
    df = df[df["standard_units"] == "nM"]
    df = df[df["standard_type"] == "IC50"]
    df = df[df["canonical_smiles"].notna()]  # Keep only rows with SMILES
    return df

df_cox1_clean = clean_and_filter(df_cox1)
df_cox2_clean = clean_and_filter(df_cox2)

# Step 3: Save cleaned data separately
df_cox1_clean.to_csv("cox1_clean_ic50.csv", index=False)
df_cox2_clean.to_csv("cox2_clean_ic50.csv", index=False)
print(f"Saved 'cox1_clean_ic50.csv' and 'cox2_clean_ic50.csv'")

# Step 4: Balance datasets by sampling
min_len = min(len(df_cox1_clean), len(df_cox2_clean))
df_cox1_bal = df_cox1_clean.sample(n=min_len, random_state=42).copy()
df_cox2_bal = df_cox2_clean.sample(n=min_len, random_state=42).copy()

# Step 5: Add binary labels
df_cox1_bal["label"] = 0
df_cox2_bal["label"] = 1

# Step 6: Merge into one labeled dataset
df_combined = pd.concat([df_cox1_bal, df_cox2_bal], ignore_index=True)
df_combined.to_csv("merged_balanced_cox1_2_ic50.csv", index=False)
print(f"Final dataset saved as 'merged_balanced_cox1_2_ic50.csv'")
print(f"Dataset contains {len(df_combined)} rows → {min_len} for each class.")


COX1: Retrieved 653 raw activities.
COX2: Retrieved 192 raw activities.
Saved 'cox1_clean_ic50.csv' and 'cox2_clean_ic50.csv'
Final dataset saved as 'merged_balanced_cox1_2_ic50.csv'
Dataset contains 374 rows → 187 for each class.


In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem

# Load merged dataset
df = pd.read_csv("merged_balanced_cox1_2_ic50.csv")

# Function to generate Morgan fingerprints
def smiles_to_morgan(smiles, radius=2, nBits=1024):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return [0] * nBits
    return list(AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits))

# Apply function to all SMILES
fingerprints = df["canonical_smiles"].apply(smiles_to_morgan)
fp_df = pd.DataFrame(fingerprints.tolist())

# Add labels and original SMILES for reference
fp_df["label"] = df["label"]
fp_df["canonical_smiles"] = df["canonical_smiles"]

# Save to CSV
fp_df.to_csv("ligand_features_morgan.csv", index=False)
print("Morgan fingerprints saved to 'ligand_features_morgan.csv'")

import pandas as pd

# Load FASTA sequences (manually pasted or read from file)
protein_seqs = {
    "COX1": "MSRSLLLWFLLFLLLLPPLPVLLADPGAPTPVNPCCYYPCQHQGICVRFGLDRYQCDCTRTGYSGPNCTIPGLWTWLRNSLRPSPSFTHFLLTHGRWFWEFVNATFGLVPGQETLQHTSYNFTNGLYHFKGVDAQEQLSRVLAIVHPGAYEIATTHRLLREHVVRSDELLKPAVQ..."[:300],
    "COX2": "MLARALLLCAVLALSHTANPCCSHPCQNRGVCMSVGFDQYKCDCTRTGFYRPNCTIPELYHYWPQKRQFQISAKVGDVIPVYEMELVPLAENRQEAMEKICLNPATVETTTKTVETTVEDTEETTSTVHFKNKTVVPTVPIAVQDTPEL..."[:300]
}

# Function to compute amino acid composition
def aa_composition(seq):
    aa_list = 'ACDEFGHIKLMNPQRSTVWY'
    freq = {aa: seq.count(aa)/len(seq) for aa in aa_list}
    return pd.Series(freq)

# Apply function
df_protein_features = pd.DataFrame({name: aa_composition(seq) for name, seq in protein_seqs.items()}).T
df_protein_features.to_csv("protein_features_aac.csv")
print("Amino acid composition saved to 'protein_features_aac.csv'") 




Morgan fingerprints saved to 'ligand_features_morgan.csv'
Amino acid composition saved to 'protein_features_aac.csv'


In [1]:
import pandas as pd

# Step 1: Load ligand and protein feature CSVs
ligand_df = pd.read_csv("ligand_features_morgan.csv")
protein_df = pd.read_csv("protein_features_aac.csv")

# Step 2: Map labels to protein names (0 → COX1, 1 → COX2)
label_to_protein = {0: "COX1", 1: "COX2"}
ligand_df["protein"] = ligand_df["label"].map(label_to_protein)

# Step 3: Merge ligand data with the matching protein features
final_df = ligand_df.merge(protein_df, on="protein", how="left")

# Step 4: Save final machine learning–ready dataset
final_df.to_csv("final_dataset_labeled.csv", index=False)

# Step 5: Print summary
print("Final dataset saved as 'final_dataset_labeled.csv'")
print(f"Rows: {len(final_df)} — Columns: {final_df.shape[1]}")


KeyError: 'protein'