In [13]:
import pandas as pd
import numpy as np
import os

# Set working directory.
# The folder defaults to the original project location but can be overridden
# with the COX_DATA_DIR environment variable, so the notebook can run on
# another machine without editing this cell.
DATA_DIR = os.environ.get(
    "COX_DATA_DIR",
    "/Users/sandragarcia/Documents/HUMBER/Clinical Bioinformatics/2nd Semester/Capstone Project/",
)
os.chdir(DATA_DIR)

# Load the CSVs using automatic separator detection.
# sep=None asks the python engine to sniff the delimiter (e.g. comma or tab).
df1 = pd.read_csv("cox1_data.csv", sep=None, engine="python")
df2 = pd.read_csv("cox2_data.csv", sep=None, engine="python")

# Add target labels.
# The new "target" column records whether each row came from the COX1 or the
# COX2 file, so the origin is still known after the two frames are merged.
df1["target"] = "COX1"
df2["target"] = "COX2"

# Combine datasets; ignore_index=True restarts the row numbering from 0.
df = pd.concat([df1, df2], ignore_index=True)

# Print the column names of the combined dataframe (before renaming) so we can
# see what they are actually called in the source files.
print("Columns in merged dataframe:", df.columns.tolist())

# Rename the columns used downstream to short snake_case names.
# Reassigning (instead of inplace=True) keeps the step explicit and chainable.
df = df.rename(columns={
    "Smiles": "smiles",
    "Standard Type": "standard_type",
    "Standard Relation": "standard_relation",
    "Standard Value": "standard_value",
    "Assay Type": "assay_type",
})

# Check how many rows match each filtering condition.
# Looking at each condition separately shows which filter eliminates the most rows.
print("Total rows before filtering:", len(df))
print("Rows with standard_type as IC50 or Ki:", len(df[df["standard_type"].isin(["IC50", "Ki"])]))
print("Rows with assay_type as 'B':", len(df[df["assay_type"] == "B"]))
print("Rows with non-null standard_value:", len(df[df["standard_value"].notnull()]))
print("Rows with non-null smiles:", len(df[df["smiles"].notnull()]))
print("Unique values in standard_relation:", df["standard_relation"].unique())
print("Rows with Standard Units as 'nM':", len(df[df["Standard Units"] == "nM"]))

# Coerce the measurements to numbers so the `> 0` comparison below cannot fail
# on stray text; anything unparseable becomes NaN and is dropped by the filter.
df = df.assign(standard_value=pd.to_numeric(df["standard_value"], errors="coerce"))

# Apply filters without a relation condition (censored values such as '>' or
# '<' are deliberately kept).
keep_rows = (
    df["standard_type"].isin(["IC50", "Ki"])
    & (df["assay_type"] == "B")
    & df["standard_value"].notnull()
    & df["smiles"].notnull()
    # The later pChEMBL conversion multiplies standard_value by 1e-9, which is
    # only correct for nanomolar values, so rows reported in other units
    # (or with no unit) must not reach it.
    & (df["Standard Units"] == "nM")
    # Remove invalid values before applying log10 (log of 0 or a negative
    # number is undefined).
    & (df["standard_value"] > 0)
)
df = df[keep_rows]

print("Rows remaining after all filters:", len(df))

# Convert to pChEMBL: the negative log10 of the activity expressed in molar.
# This puts compounds on a comparable scale and gives a target variable (y)
# suitable for a regression model.
# NOTE: the 1e-9 factor treats standard_value as nanomolar; values recorded in
# any other unit would need a different factor.
# .assign builds a new frame rather than writing a column into a filtered
# slice, which avoids pandas' SettingWithCopyWarning.
df = df.assign(pChEMBL=-np.log10(df["standard_value"] * 1e-9))

# Keep only the relevant columns and drop rows that are exact duplicates
# across all of them.
df_clean = df[["smiles", "target", "standard_type", "standard_value", "pChEMBL"]].drop_duplicates()

# Save cleaned data (written relative to the current working directory).
df_clean.to_csv("cox_cleaned_bioactivity.csv", index=False)

print("Clean dataset saved as 'cox_cleaned_bioactivity.csv'")


Columns in merged dataframe: ['Molecule ChEMBL ID', 'Molecule Name', 'Molecule Max Phase', 'Molecular Weight', '#RO5 Violations', 'AlogP', 'Compound Key', 'Smiles', 'Standard Type', 'Standard Relation', 'Standard Value', 'Standard Units', 'pChEMBL Value', 'Data Validity Comment', 'Comment', 'Uo Units', 'Ligand Efficiency BEI', 'Ligand Efficiency LE', 'Ligand Efficiency LLE', 'Ligand Efficiency SEI', 'Potential Duplicate', 'Assay ChEMBL ID', 'Assay Description', 'Assay Type', 'BAO Format ID', 'BAO Label', 'Assay Organism', 'Assay Tissue ChEMBL ID', 'Assay Tissue Name', 'Assay Cell Type', 'Assay Subcellular Fraction', 'Assay Parameters', 'Assay Variant Accession', 'Assay Variant Mutation', 'Target ChEMBL ID', 'Target Name', 'Target Organism', 'Target Type', 'Document ChEMBL ID', 'Source ID', 'Source Description', 'Document Journal', 'Document Year', 'Cell ChEMBL ID', 'Properties', 'Action Type', 'Standard Text Value', 'Value', 'target']
Total rows before filtering: 23449
Rows with standa

In [15]:
import requests

def get_chembl_id(query_name, resource_type, timeout=30):
    """Return the ChEMBL ID of the first search hit for ``query_name``.

    Sends a free-text search to the ChEMBL web service endpoint for
    ``resource_type`` and returns the ID of the first result.

    Parameters
    ----------
    query_name : str
        Free-text search term, e.g. ``"paracetamol"``.
    resource_type : str
        ChEMBL resource to search. Only ``"molecule"`` is resolved to an ID;
        any other value returns ``None`` without contacting the server.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    str or None
        The ``molecule_chembl_id`` of the first hit, or ``None`` when the
        resource type is unsupported or the search returned no hits.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    # Nothing but molecules can be resolved here, so skip the network round
    # trip whose response would be discarded anyway.
    if resource_type != "molecule":
        return None

    base_url = "https://www.ebi.ac.uk/chembl/api/data/" + resource_type + "/search"
    params = {'q': query_name, 'limit': 1,
              'format': 'json', 'organism': 'Homo sapiens'}
    response = requests.get(base_url, params=params, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    data = response.json()

    # An unknown name yields an empty hit list; report that as "not found"
    # rather than raising IndexError.
    hits = data.get("molecules") or []
    if not hits:
        return None
    return hits[0]["molecule_chembl_id"]

def get_smiles(chembl_id, timeout=30):
    """Fetch the canonical SMILES string of a ChEMBL molecule.

    Parameters
    ----------
    chembl_id : str
        Molecule ChEMBL ID, e.g. ``"CHEMBL112"``.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    str or None
        The ``canonical_smiles`` entry of the molecule record, or ``None``
        when no ID was given or the record carries no structure information.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code (e.g. unknown ID).
    """
    # A missing ID (e.g. a failed lookup upstream) cannot be resolved.
    if not chembl_id:
        return None

    url = f"https://www.ebi.ac.uk/chembl/api/data/molecule/{chembl_id}.json"
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    data = response.json()

    # "molecule_structures" may be absent or null in the record; treat that as
    # "no SMILES" rather than raising TypeError/KeyError.
    structures = data.get("molecule_structures") or {}
    return structures.get("canonical_smiles")

# Example: resolve a drug name to its canonical SMILES in two steps.
# Step 1: name -> ChEMBL ID (expected to be CHEMBL112 for paracetamol).
chembl_id = get_chembl_id(query_name="paracetamol", resource_type="molecule")
# Step 2: ChEMBL ID -> canonical SMILES string.
paracetamol_smiles = get_smiles(chembl_id=chembl_id)
print("Paracetamol SMILES:", paracetamol_smiles)


Paracetamol SMILES: CC(=O)Nc1ccc(O)cc1
