In [1]:
import pandas as pd
import numpy as np
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

In [2]:
df = pd.read_csv('pdb_ligand_chembl_ec.csv')

In [4]:
len(df)

72081

In [3]:
df.isna().sum()

pdb                      0
ligand                   0
protein_chain           75
resolution             491
R-factore             3052
release_date             0
Uniprot Accession      762
EC                   21942
ligand_chain             0
target_chembl_id     37483
ligand_chembl_id     25694
Ki                   65163
Kd                   64438
IC50                 61386
EC50                 70736
dtype: int64

In [41]:
df.head()

Unnamed: 0,pdb,ligand,protein_chain,resolution,R-factore,release_date,Uniprot Accession,EC,ligand_chain,target_chembl_id,ligand_chembl_id,Ki,Kd,IC50,EC50
0,3ddw,55,A,1.9,0.15436,2009/1/27,P06737,2.4.1.1,A,CHEMBL2568,,,,1070.0,
1,4wlk,3QL,A,2.03,0.1733,2015/7/8,O32108,,A,,,,,,
2,3cp8,FAD,A,3.2,0.25568,2008/6/24,Q8KA85,1.-.-.-,A,,CHEMBL1232653,,,,
3,6bmc,PEP,A,2.7,0.24961,2018/10/3,G3XCJ9,2.5.1.54,A,,CHEMBL1235228,,,,
4,4mm1,1GP,A,2.8,0.204,2014/6/25,O26652,2.5.1.41,A,,,,,,


In [42]:
df.keys()

Index(['pdb', 'ligand', 'protein_chain', 'resolution', 'R-factore',
       'release_date', 'Uniprot Accession', 'EC', 'ligand_chain',
       'target_chembl_id', 'ligand_chembl_id', 'Ki', 'Kd', 'IC50', 'EC50'],
      dtype='object')

In [43]:
df.drop(columns=['pdb', 'ligand', 'protein_chain', 'resolution', 'R-factore','release_date','EC', 'ligand_chain','target_chembl_id'], inplace=True)

In [44]:
df.head()

Unnamed: 0,Uniprot Accession,ligand_chembl_id,Ki,Kd,IC50,EC50
0,P06737,,,,1070.0,
1,O32108,,,,,
2,Q8KA85,CHEMBL1232653,,,,
3,G3XCJ9,CHEMBL1235228,,,,
4,O26652,,,,,


In [45]:
df = df.dropna(subset=['ligand_chembl_id','Uniprot Accession']).reset_index(drop=True)

In [46]:
df.head()

Unnamed: 0,Uniprot Accession,ligand_chembl_id,Ki,Kd,IC50,EC50
0,Q8KA85,CHEMBL1232653,,,,
1,G3XCJ9,CHEMBL1235228,,,,
2,P04392,CHEMBL418052,,,,
3,P35968,CHEMBL153843,,,48.4,
4,P05230,CHEMBL1233511,,,,


In [47]:
# Define the function to get SMILES from ChEMBL ID
def get_smile_from_chembl(chembl_id):
    if pd.isna(chembl_id):
        return None
    if not str(chembl_id).startswith('CHEMBL'):
        return None
    url = f"https://www.ebi.ac.uk/chembl/api/data/molecule/{chembl_id}.json"
    try:
        response = requests.get(url, timeout=5)
        if response.status_code == 200:
            data = response.json()
            if data.get('molecule_structures'):
                return data['molecule_structures'].get('canonical_smiles')
    except:
        return None
    return None

# Function to fetch SMILES in parallel
def fetch_smiles_parallel(df, max_workers=10):
    df['SMILES'] = None  # Create empty column

    # Set up ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Create a list of future requests (tasks)
        futures = {
            executor.submit(get_smile_from_chembl, row['ligand_chembl_id']): idx
            for idx, row in df.iterrows()
        }

        # Initialize tqdm progress bar
        for future in tqdm(as_completed(futures), total=len(futures), desc="Fetching SMILES"):
            idx = futures[future]
            try:
                df.at[idx, 'SMILES'] = future.result()  # Get the result (SMILES)
            except Exception as e:
                df.at[idx, 'SMILES'] = None  # In case of error, leave as None

    return df

# Assuming you already have a dataframe 'df'
# Fetch the SMILES using parallel processing
df = fetch_smiles_parallel(df, max_workers=20)

# Show the result
print(df[['ligand_chembl_id', 'SMILES']].head(10))


Fetching SMILES: 100%|██████████| 45920/45920 [2:05:44<00:00,  6.09it/s]  

  ligand_chembl_id                                             SMILES
0    CHEMBL1232653  Cc1cc2nc3c(=O)[nH]c(=O)nc-3n(C[C@H](O)[C@H](O)...
1    CHEMBL1235228                              C=C(OP(=O)(O)O)C(=O)O
2     CHEMBL418052  Nc1ncnc2c1ncn2[C@@H]1O[C@H](CSCC[C@H](N)C(=O)O...
3     CHEMBL153843         O=C(Nc1cccc(C(F)(F)F)c1)c1ccccc1NCc1ccncc1
4    CHEMBL1233511  O=P(O)(O)O[C@H]1[C@H](OP(=O)(O)O)[C@@H](OP(=O)...
5    CHEMBL3309678  CC(=O)N(CCCC[C@H](NC(=O)N[C@@H](CCC(=O)O)C(=O)...
6    CHEMBL1230989  Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...
7    CHEMBL4533529  NC[C@H](NC(=O)c1ccc(-c2ccc(C(F)(F)F)c(F)c2)[nH...
8     CHEMBL284104                             O=C(O)c1cccc(C(=O)O)n1
9    CHEMBL1201384  Nc1ccn([C@H]2CC[C@@H](COP(=O)(O)OP(=O)(O)OP(=O...





In [48]:
df = df.dropna(subset=['SMILES']).reset_index(drop=True)

In [49]:
nan_count = df['SMILES'].isna().sum()
print(nan_count)


0


In [50]:
# Function to fetch FASTA from UniProt ID
def fetch_fasta(uniprot_id):
    if pd.isna(uniprot_id):
        return None
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
    try:
        response = requests.get(url, timeout=5)
        if response.status_code == 200:
            fasta = response.text
            # Remove header (first line) and join sequence lines
            sequence = ''.join(fasta.split('\n')[1:]).strip()
            return sequence
    except:
        return None
    return None

# Function to fetch all FASTA sequences in parallel
def fetch_fasta_parallel(df, id_column='Uniprot Accession', output_column='target_sequence', max_workers=20):
    df[output_column] = None  # Create empty column

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(fetch_fasta, row[id_column]): idx
            for idx, row in df.iterrows()
        }

        for future in tqdm(as_completed(futures), total=len(futures), desc="Fetching FASTA sequences"):
            idx = futures[future]
            try:
                df.at[idx, output_column] = future.result()
            except Exception as e:
                df.at[idx, output_column] = None

    return df

# Assuming you already have a dataframe 'df'
# Run parallel FASTA fetching
df = fetch_fasta_parallel(df, id_column='Uniprot Accession', output_column='target_sequence', max_workers=20)

# Show rows 0–50 (or first 5 rows if you prefer)
print(df[['Uniprot Accession', 'target_sequence']].head(5))


Fetching FASTA sequences: 100%|██████████| 45639/45639 [1:23:15<00:00,  9.14it/s]


  Uniprot Accession                                    target_sequence
0            Q8KA85  MYDVIVVGAGHAGCEAALAVARGGLHCLLITSDLSAVARMSCNPAI...
1            G3XCJ9  MDDLLQRVRRCEALQQPEWGDPSRLRDVQAYLRGSPALIRAGDILA...
2            P04392  MLGAIAYTGNKQSLLPELKSHFPKYNRFVDLFCGGLSVSLNVNGPV...
3            P35968  MQSKVLLAVALWLCVETRAASVGLPSVSLDLPRLSIQKDILTIKAN...
4            P05230  MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTV...


In [51]:
df = df.drop(columns=['EC50', 'Ki', 'IC50']).reset_index(drop=True)


In [52]:
df.head()

Unnamed: 0,Uniprot Accession,ligand_chembl_id,Kd,SMILES,target_sequence
0,Q8KA85,CHEMBL1232653,,Cc1cc2nc3c(=O)[nH]c(=O)nc-3n(C[C@H](O)[C@H](O)...,MYDVIVVGAGHAGCEAALAVARGGLHCLLITSDLSAVARMSCNPAI...
1,G3XCJ9,CHEMBL1235228,,C=C(OP(=O)(O)O)C(=O)O,MDDLLQRVRRCEALQQPEWGDPSRLRDVQAYLRGSPALIRAGDILA...
2,P04392,CHEMBL418052,,Nc1ncnc2c1ncn2[C@@H]1O[C@H](CSCC[C@H](N)C(=O)O...,MLGAIAYTGNKQSLLPELKSHFPKYNRFVDLFCGGLSVSLNVNGPV...
3,P35968,CHEMBL153843,,O=C(Nc1cccc(C(F)(F)F)c1)c1ccccc1NCc1ccncc1,MQSKVLLAVALWLCVETRAASVGLPSVSLDLPRLSIQKDILTIKAN...
4,P05230,CHEMBL1233511,,O=P(O)(O)O[C@H]1[C@H](OP(=O)(O)O)[C@@H](OP(=O)...,MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTV...


In [53]:
df = df.dropna(subset=['Kd']).reset_index(drop=True)


In [54]:
print(len(df))


5784


In [55]:
df.head()

Unnamed: 0,Uniprot Accession,ligand_chembl_id,Kd,SMILES,target_sequence
0,Q01782,CHEMBL34259,39.0,CN(Cc1cnc2nc(N)nc(N)c2n1)c1ccc(C(=O)N[C@@H](CC...,MTAPTVPVALVTGAAKRLGRSIAEGLHAEGYAVCLHYHRSAAEANA...
1,P08191,CHEMBL365590,20.0,OC[C@H]1O[C@H](O)[C@@H](O)[C@@H](O)[C@@H]1O,MKRVITLFAVLLMGWSVNAWSFACKTANGTAIPIGGGSANVYVNLA...
2,Q9Y468,CHEMBL2426364,9400.0,O=C(c1ccc(C(=O)N2CCC(N3CCCC3)CC2)c(Nc2ccccc2)c...,MHLVAGDSPGSGPHLPATAFIIPASSATLGLPSSALDVSCFPREPI...
3,P03366,CHEMBL116,0.49,CC(C)CN(C[C@@H](O)[C@H](Cc1ccccc1)NC(=O)O[C@H]...,MGARASVLSGGELDRWEKIRLRPGGKKKYKLKHIVWASRELERFAV...
4,P06493,CHEMBL1709089,1600.0,CN1CC[C@H](c2c(O)cc(O)c3c(=O)cc(-c4ccccc4Cl)oc...,MEDYTKIEKIGEGTYGVVYKGRHKTTGQVVAMKKIRLESEEEGVPS...


In [56]:
# Check for any NaN in each row
nan_rows = df[df.isnull().any(axis=1)]

# If you just want to know how many rows have NaN:
print(f"Number of rows with NaN: {len(nan_rows)}")


Number of rows with NaN: 1


In [57]:
# Drop rows with any NaN value
df = df.dropna()

# Reset the index after dropping
df = df.reset_index(drop=True)

# Show how many rows are left
print(f"Number of rows after dropping NaN: {len(df)}")


Number of rows after dropping NaN: 5783


In [58]:
# Find rows where Kd is exactly 0
kd_zero_rows = df[df['Kd'] == 0]

# How many rows have Kd = 0
print(f"Number of rows with Kd = 0: {len(kd_zero_rows)}")


Number of rows with Kd = 0: 8


In [59]:
# Drop rows where Kd is 0
df = df[df['Kd'] != 0].reset_index(drop=True)

print(f"Number of rows after dropping Kd = 0: {len(df)}")


Number of rows after dropping Kd = 0: 5775


In [60]:
# Group by SMILES and target_sequence, then count occurrences
duplicates_count = df.groupby(['SMILES', 'target_sequence']).size().reset_index(name='count')

# Filter to show only duplicates (count > 1)
duplicates = duplicates_count[duplicates_count['count'] > 1]

# Show the count of duplicates
print(f"Total number of duplicate SMILES and target_sequence pairs: {len(duplicates)}")


Total number of duplicate SMILES and target_sequence pairs: 615


In [61]:
# sort so that lowest Kd comes first in each group
df = df.sort_values(['SMILES', 'target_sequence', 'Kd'], ascending=[True, True, True])

# drop later duplicates, keeping the first (which has the lowest Kd)
df = df.drop_duplicates(subset=['SMILES', 'target_sequence'], keep='first').reset_index(drop=True)

df.head(50)

Unnamed: 0,Uniprot Accession,ligand_chembl_id,Kd,SMILES,target_sequence
0,P16083,CHEMBL373937,18.1,Brc1c(Br)c(Br)c2[nH]cnc2c1Br,MAGKKVLIVYAHQEPKSFNGSLKNVAVDELSRQGCTVTVSDLYAMN...
1,Q07889,CHEMBL3414691,1300000.0,Brc1ccc(CN2CCCC2)cc1,MQAQQLPYEFFSEENAPKWRGLLVPALKKVQGQVHPTLESNDDALQ...
2,Q99814,CHEMBL3597698,65.3,Brc1cccc([C@@H]2C[C@H](c3cccc(Br)c3)n3nnnc3N2)c1,MTADKEKKRSSSERRKEKSRDAARCRRSKETEVFYELAHELPLPHS...
3,Q05397,CHEMBL2425111,95000.0,Brc1cncc(-c2nnn[nH]2)c1,MAAAYLDPNLNHTPNSSTKTHLGTGMERSPGAMERVLKVFHYFESN...
4,Q13126,CHEMBL4465346,0.94,C#CCCCSC[C@H]1CN(Cc2c[nH]c3c(N)ncnc23)C[C@@H]1O,MASGTTTTAVKIGIIGGTGLDDPEILEGRTEKYVDTPFGKPSDALI...
5,Q9ZMY2,CHEMBL4465346,0.03,C#CCCCSC[C@H]1CN(Cc2c[nH]c3c(N)ncnc23)C[C@@H]1O,MQKIGILGAMREEITPILELFGVDFEEIPLGGNVFHKGVYHNKEII...
6,P07900,CHEMBL112953,64.0,C#CCCCn1c(Cc2cc(OC)c(OC)c(OC)c2Cl)nc2c(N)nc(F)...,MPEETQTQDQPMEEEEVETFAFQAEIAQLMSLIINTFYSNKEIFLR...
7,P27338,CHEMBL436947,600.0,C#CCN(C)[C@@H]1CCc2ccccc21,MSNKCDVVVVGGGISGMAAAKLLHDSGLNVVVLEARDRVGGRTYTL...
8,P27338,CHEMBL371428,17000.0,C#CCN[C@@H]1CCc2ccc(O)cc21,MSNKCDVVVVGGGISGMAAAKLLHDSGLNVVVLEARDRVGGRTYTL...
9,P27338,CHEMBL1235738,127000.0,C#CCN[C@H]1CCc2ccccc21,MSNKCDVVVVGGGISGMAAAKLLHDSGLNVVVLEARDRVGGRTYTL...


In [62]:
len(df)

4601

In [63]:
df.head()

Unnamed: 0,Uniprot Accession,ligand_chembl_id,Kd,SMILES,target_sequence
0,P16083,CHEMBL373937,18.1,Brc1c(Br)c(Br)c2[nH]cnc2c1Br,MAGKKVLIVYAHQEPKSFNGSLKNVAVDELSRQGCTVTVSDLYAMN...
1,Q07889,CHEMBL3414691,1300000.0,Brc1ccc(CN2CCCC2)cc1,MQAQQLPYEFFSEENAPKWRGLLVPALKKVQGQVHPTLESNDDALQ...
2,Q99814,CHEMBL3597698,65.3,Brc1cccc([C@@H]2C[C@H](c3cccc(Br)c3)n3nnnc3N2)c1,MTADKEKKRSSSERRKEKSRDAARCRRSKETEVFYELAHELPLPHS...
3,Q05397,CHEMBL2425111,95000.0,Brc1cncc(-c2nnn[nH]2)c1,MAAAYLDPNLNHTPNSSTKTHLGTGMERSPGAMERVLKVFHYFESN...
4,Q13126,CHEMBL4465346,0.94,C#CCCCSC[C@H]1CN(Cc2c[nH]c3c(N)ncnc23)C[C@@H]1O,MASGTTTTAVKIGIIGGTGLDDPEILEGRTEKYVDTPFGKPSDALI...


In [64]:
df.isna().sum()

Uniprot Accession    0
ligand_chembl_id     0
Kd                   0
SMILES               0
target_sequence      0
dtype: int64

In [65]:
df.head()

Unnamed: 0,Uniprot Accession,ligand_chembl_id,Kd,SMILES,target_sequence
0,P16083,CHEMBL373937,18.1,Brc1c(Br)c(Br)c2[nH]cnc2c1Br,MAGKKVLIVYAHQEPKSFNGSLKNVAVDELSRQGCTVTVSDLYAMN...
1,Q07889,CHEMBL3414691,1300000.0,Brc1ccc(CN2CCCC2)cc1,MQAQQLPYEFFSEENAPKWRGLLVPALKKVQGQVHPTLESNDDALQ...
2,Q99814,CHEMBL3597698,65.3,Brc1cccc([C@@H]2C[C@H](c3cccc(Br)c3)n3nnnc3N2)c1,MTADKEKKRSSSERRKEKSRDAARCRRSKETEVFYELAHELPLPHS...
3,Q05397,CHEMBL2425111,95000.0,Brc1cncc(-c2nnn[nH]2)c1,MAAAYLDPNLNHTPNSSTKTHLGTGMERSPGAMERVLKVFHYFESN...
4,Q13126,CHEMBL4465346,0.94,C#CCCCSC[C@H]1CN(Cc2c[nH]c3c(N)ncnc23)C[C@@H]1O,MASGTTTTAVKIGIIGGTGLDDPEILEGRTEKYVDTPFGKPSDALI...


In [66]:
df = df.drop(columns=['SMILES_encoded_padded','target_sequence_encoded_padded']).reset_index(drop=True)

KeyError: "['SMILES_encoded_padded', 'target_sequence_encoded_padded'] not found in axis"

In [68]:
df.head()

Unnamed: 0,Uniprot Accession,ligand_chembl_id,Kd,SMILES,target_sequence
0,P16083,CHEMBL373937,18.1,Brc1c(Br)c(Br)c2[nH]cnc2c1Br,MAGKKVLIVYAHQEPKSFNGSLKNVAVDELSRQGCTVTVSDLYAMN...
1,Q07889,CHEMBL3414691,1300000.0,Brc1ccc(CN2CCCC2)cc1,MQAQQLPYEFFSEENAPKWRGLLVPALKKVQGQVHPTLESNDDALQ...
2,Q99814,CHEMBL3597698,65.3,Brc1cccc([C@@H]2C[C@H](c3cccc(Br)c3)n3nnnc3N2)c1,MTADKEKKRSSSERRKEKSRDAARCRRSKETEVFYELAHELPLPHS...
3,Q05397,CHEMBL2425111,95000.0,Brc1cncc(-c2nnn[nH]2)c1,MAAAYLDPNLNHTPNSSTKTHLGTGMERSPGAMERVLKVFHYFESN...
4,Q13126,CHEMBL4465346,0.94,C#CCCCSC[C@H]1CN(Cc2c[nH]c3c(N)ncnc23)C[C@@H]1O,MASGTTTTAVKIGIIGGTGLDDPEILEGRTEKYVDTPFGKPSDALI...


In [69]:
df.to_csv('processed_data.csv', index=False)