In [1]:
import pandas as pd
import numpy as np
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

In [2]:
df = pd.read_csv('Kd_bind.tsv', sep='\t')

In [3]:
df.head()

Unnamed: 0,drug_id,target_id,smiles,target_seq,origin_affinity,affinity
0,10,Q13451,O=C1CCCCN1,MTTDEGAKNNEESPTATVAEQGEDITSKKDRGVLKIVKRVGNGEET...,2800000.0,2.552842
1,100205,P13569,COc1ccc(Cl)cc1Nc1nc(cs1)-c1sc(NC(=O)C(C)(C)C)nc1C,MQRSPLEKASVVSKLFFSWTRPILRKGYRQRLELSDIYQIPSVDSA...,700.0,6.154902
2,100279,P0A6F1,O[C@@H]1[C@@H](COP(O)(O)=O)O[C@H]([C@@H]1O)n1c...,MIKSALLVLEDGTQFHGRAIGATGSAVGEVVFNTSMTGYQEILTDP...,33000.0,4.481486
3,100283,P0A6D3,O[C@@H]1CC(=C[C@@H](OP(O)(O)=O)[C@H]1O)C(O)=O,MESLTLQPIARVDGTINLPGSKSVSNRALLLAALAHGKTVLTNLLD...,7000.0,5.154902
4,100348,O60674,COCC(=O)N1CCN(CC1)c1ccc(Nc2ncc(C(N)=O)c(NC3CC3...,MGMACLTMTEMEGTSTSSIYQNGDISGNANSMKQIDPVLQVYLYHS...,5.1,8.29243


In [4]:
df.keys()

Index(['drug_id', 'target_id', 'smiles', 'target_seq', 'origin_affinity',
       'affinity'],
      dtype='object')

In [5]:
df.drop(columns=['target_seq'], inplace=True)

In [6]:
df.head()

Unnamed: 0,drug_id,target_id,smiles,origin_affinity,affinity
0,10,Q13451,O=C1CCCCN1,2800000.0,2.552842
1,100205,P13569,COc1ccc(Cl)cc1Nc1nc(cs1)-c1sc(NC(=O)C(C)(C)C)nc1C,700.0,6.154902
2,100279,P0A6F1,O[C@@H]1[C@@H](COP(O)(O)=O)O[C@H]([C@@H]1O)n1c...,33000.0,4.481486
3,100283,P0A6D3,O[C@@H]1CC(=C[C@@H](OP(O)(O)=O)[C@H]1O)C(O)=O,7000.0,5.154902
4,100348,O60674,COCC(=O)N1CCN(CC1)c1ccc(Nc2ncc(C(N)=O)c(NC3CC3...,5.1,8.29243


In [7]:
# Function to fetch FASTA from UniProt ID
def fetch_fasta(uniprot_id):
    if pd.isna(uniprot_id):
        return None
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
    try:
        response = requests.get(url, timeout=5)
        if response.status_code == 200:
            fasta = response.text
            # Remove header (first line) and join sequence lines
            sequence = ''.join(fasta.split('\n')[1:]).strip()
            return sequence
    except:
        return None
    return None

# Function to fetch all FASTA sequences in parallel
def fetch_fasta_parallel(df, id_column='target_id', output_column='target_sequence', max_workers=20):
    df[output_column] = None  # Create empty column

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(fetch_fasta, row[id_column]): idx
            for idx, row in df.iterrows()
        }

        for future in tqdm(as_completed(futures), total=len(futures), desc="Fetching FASTA sequences"):
            idx = futures[future]
            try:
                df.at[idx, output_column] = future.result()
            except Exception as e:
                df.at[idx, output_column] = None

    return df

# Assuming you already have a dataframe 'df'
# Run parallel FASTA fetching
df = fetch_fasta_parallel(df, id_column='target_id', output_column='target_sequence', max_workers=20)

# Show rows 0–50 (or first 5 rows if you prefer)
print(df[['target_id', 'target_sequence']].head(5))


Fetching FASTA sequences: 100%|██████████| 57002/57002 [2:36:22<00:00,  6.08it/s]  


  target_id                                    target_sequence
0    Q13451  MTTDEGAKNNEESPTATVAEQGEDITSKKDRGVLKIVKRVGNGEET...
1    P13569  MQRSPLEKASVVSKLFFSWTRPILRKGYRQRLELSDIYQIPSVDSA...
2    P0A6F1  MIKSALLVLEDGTQFHGRAIGATGSAVGEVVFNTSMTGYQEILTDP...
3    P0A6D3  MESLTLQPIARVDGTINLPGSKSVSNRALLLAALAHGKTVLTNLLD...
4    O60674  MGMACLTMTEMEGTSTSSIYQNGDISGNANSMKQIDPVLQVYLYHS...


In [8]:
len(df)

57002

In [9]:
df.isna().sum()

drug_id             0
target_id           0
smiles              0
origin_affinity     0
affinity            0
target_sequence    84
dtype: int64

In [10]:
df = df.dropna()

In [11]:
print(len(df))


56918


In [12]:
# Check for any NaN in each row
nan_rows = df[df.isnull().any(axis=1)]

# If you just want to know how many rows have NaN:
print(f"Number of rows with NaN: {len(nan_rows)}")


Number of rows with NaN: 0


In [13]:
df.head()

Unnamed: 0,drug_id,target_id,smiles,origin_affinity,affinity,target_sequence
0,10,Q13451,O=C1CCCCN1,2800000.0,2.552842,MTTDEGAKNNEESPTATVAEQGEDITSKKDRGVLKIVKRVGNGEET...
1,100205,P13569,COc1ccc(Cl)cc1Nc1nc(cs1)-c1sc(NC(=O)C(C)(C)C)nc1C,700.0,6.154902,MQRSPLEKASVVSKLFFSWTRPILRKGYRQRLELSDIYQIPSVDSA...
2,100279,P0A6F1,O[C@@H]1[C@@H](COP(O)(O)=O)O[C@H]([C@@H]1O)n1c...,33000.0,4.481486,MIKSALLVLEDGTQFHGRAIGATGSAVGEVVFNTSMTGYQEILTDP...
3,100283,P0A6D3,O[C@@H]1CC(=C[C@@H](OP(O)(O)=O)[C@H]1O)C(O)=O,7000.0,5.154902,MESLTLQPIARVDGTINLPGSKSVSNRALLLAALAHGKTVLTNLLD...
4,100348,O60674,COCC(=O)N1CCN(CC1)c1ccc(Nc2ncc(C(N)=O)c(NC3CC3...,5.1,8.29243,MGMACLTMTEMEGTSTSSIYQNGDISGNANSMKQIDPVLQVYLYHS...


In [14]:
# Group by SMILES and target_sequence, then count occurrences
duplicates_count = df.groupby(['smiles', 'target_sequence']).size().reset_index(name='count')

# Filter to show only duplicates (count > 1)
duplicates = duplicates_count[duplicates_count['count'] > 1]

# Show the count of duplicates
print(f"Total number of duplicate SMILES and target_sequence pairs: {len(duplicates)}")


Total number of duplicate SMILES and target_sequence pairs: 36


In [16]:
# sort so that lowest Kd comes first in each group
df = df.sort_values(['smiles', 'target_sequence', 'origin_affinity'], ascending=[True, True, True])

# drop later duplicates, keeping the first (which has the lowest Kd)
df = df.drop_duplicates(subset=['smiles', 'target_sequence'], keep='first').reset_index(drop=True)

df.head(50)

Unnamed: 0,drug_id,target_id,smiles,origin_affinity,affinity,target_sequence
0,195595,P05067,BrCCOc1ccc2nc3ccc(=O)cc3oc2c1,927,6.03292,MLPGLALLLLAAWTARALEVPTDGNAGLLAEPQIAMFCGRLNMHMN...
1,50028971,O75936,Br[Se]c1ccccc1,11000,4.958607,MACTIQKAEALDGAHLMQILWYDEEESLYPAVWLRDNCPCSDCYLD...
2,50423469,P30680,Brc1[nH]c2cccc3[C@H]4C[C@H](CN[C@@H]4Cc1c23)C(...,7943,5.100015,MELTSEQFNGSQVWIPSPFDLNGSLGPSNGSNQTEPYYDMTSNAVL...
3,50423469,P28646,Brc1[nH]c2cccc3[C@H]4C[C@H](CN[C@@H]4Cc1c23)C(...,170,6.769551,MFPNGTAPSPTSSPSSSPGGCGEGVCSRGPGSGAADGMEEPGRNSS...
4,50156669,P16083,Brc1c(Br)c(Br)c2[nH]cnc2c1Br,18.1,7.742321,MAGKKVLIVYAHQEPKSFNGSLKNVAVDELSRQGCTVTVSDLYAMN...
5,11323,P16083,Brc1c(Br)c(Br)c2[nH]nnc2c1Br,7.11e+3,5.14813,MAGKKVLIVYAHQEPKSFNGSLKNVAVDELSRQGCTVTVSDLYAMN...
6,50514753,P14902,Brc1c[nH]c(c1)C(=O)NCc1cccc(Cn2cnc3ccccc23)c1,550,6.259637,MAHAMENSWTISKEYHIDEEVGFALPNPQENLPDFYNDWMFIAKHL...
7,50127014,P28482,Brc1c[nH]c2nccnc12,97000,4.013228,MAAAAAAGAGPEMVRGQVFDVGPRYTNLSYIGEGAYGMVCSAYDNV...
8,50490553,P00720,Brc1c[nH]cn1,317000,3.498941,MNIFEMLRIDERLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...
9,50064076,Q2NNR5,Brc1cc(C(=O)NS(=O)(=O)c2ccccc2)c2oc(cc(=O)c2c1...,382,6.417937,MDETGNPTIPPASNNTCYDSIDDFRNQVYSTLYSMISVVGFFGNGF...


In [17]:
len(df)

56880

In [18]:
df.isna().sum()

drug_id            0
target_id          0
smiles             0
origin_affinity    0
affinity           0
target_sequence    0
dtype: int64

In [19]:
df.head()

Unnamed: 0,drug_id,target_id,smiles,origin_affinity,affinity,target_sequence
0,195595,P05067,BrCCOc1ccc2nc3ccc(=O)cc3oc2c1,927.0,6.03292,MLPGLALLLLAAWTARALEVPTDGNAGLLAEPQIAMFCGRLNMHMN...
1,50028971,O75936,Br[Se]c1ccccc1,11000.0,4.958607,MACTIQKAEALDGAHLMQILWYDEEESLYPAVWLRDNCPCSDCYLD...
2,50423469,P30680,Brc1[nH]c2cccc3[C@H]4C[C@H](CN[C@@H]4Cc1c23)C(...,7943.0,5.100015,MELTSEQFNGSQVWIPSPFDLNGSLGPSNGSNQTEPYYDMTSNAVL...
3,50423469,P28646,Brc1[nH]c2cccc3[C@H]4C[C@H](CN[C@@H]4Cc1c23)C(...,170.0,6.769551,MFPNGTAPSPTSSPSSSPGGCGEGVCSRGPGSGAADGMEEPGRNSS...
4,50156669,P16083,Brc1c(Br)c(Br)c2[nH]cnc2c1Br,18.1,7.742321,MAGKKVLIVYAHQEPKSFNGSLKNVAVDELSRQGCTVTVSDLYAMN...


In [20]:
df = df.drop(columns=['target_sequence_encoded_padded','SMILES_encoded_padded']).reset_index(drop=True)

KeyError: "['target_sequence_encoded_padded', 'SMILES_encoded_padded'] not found in axis"

In [21]:
df['origin_affinity'] = df['origin_affinity'].str.replace('>', '', regex=False)
df['origin_affinity'] = df['origin_affinity'].str.replace('<', '', regex=False)
# Step 2: Make sure the 'affinity' column is float
df['origin_affinity'] = df['origin_affinity'].astype(float)

In [22]:
df = df.dropna()

In [23]:
df.isna().sum()

drug_id            0
target_id          0
smiles             0
origin_affinity    0
affinity           0
target_sequence    0
dtype: int64

In [24]:
df.sample()

Unnamed: 0,drug_id,target_id,smiles,origin_affinity,affinity,target_sequence
15467,13534,Q13163,CN1CCN(CC1)c1cc(Nc2cc(C)n[nH]2)nc(Sc2ccc(NC(=O...,260.0,6.585027,MLWLALGPFPAMENQVLVIRIKIPNSGAVDWTVHSGPQLLFRDVLD...


In [25]:
df.to_csv('processed_data.csv', index=False)