In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


In [2]:
# Read the two pre-processed affinity tables (one per source folder) into memory.
kaggle_csv, pldock_csv = 'Kaggle/processed_data.csv', 'PLDock/processed_data.csv'
df1 = pd.read_csv(kaggle_csv)
df2 = pd.read_csv(pldock_csv)

In [3]:
# Show the column labels of the first table as loaded.
df1.columns


Index(['drug_id', 'target_id', 'smiles', 'origin_affinity', 'affinity',
       'target_sequence'],
      dtype='object')

In [4]:
# Show the column labels of the second table as loaded.
df2.columns

Index(['Uniprot Accession', 'ligand_chembl_id', 'Kd', 'SMILES',
       'target_sequence'],
      dtype='object')

In [5]:
# Remove the columns of the first table that are not carried into the merged dataset.
df1_unused = ['drug_id', 'target_sequence', 'affinity']
df1 = df1.drop(df1_unused, axis=1).reset_index(drop=True)

In [6]:
# Remove the columns of the second table that are not carried into the merged dataset.
df2 = df2.drop(['target_sequence', 'ligand_chembl_id'], axis=1)
df2 = df2.reset_index(drop=True)

In [7]:
# Align the first table's column names with the second table ('Kd', 'SMILES').
# Reassign instead of mutating in place so the cell yields a fresh frame and
# no other reference to the old object is changed behind the reader's back.
df1 = df1.rename(columns={'origin_affinity': 'Kd', 'smiles': 'SMILES'})

In [8]:
# Confirm the first table's column labels after dropping and renaming.
df1.columns

Index(['target_id', 'SMILES', 'Kd'], dtype='object')

In [9]:
# Confirm the second table's column labels after dropping unused columns.
df2.columns

Index(['Uniprot Accession', 'Kd', 'SMILES'], dtype='object')

In [10]:
# Use the same target identifier column name as the first table.
# Reassign instead of mutating in place so the cell yields a fresh frame.
df2 = df2.rename(columns={'Uniprot Accession': 'target_id'})

In [11]:
# Put both tables into one shared column order so they can be stacked.
shared_columns = ['SMILES', 'target_id', 'Kd']
df1 = df1[shared_columns]
df2 = df2[shared_columns]

In [12]:
# Preview the first five rows of the first table.
df1.head(n=5)

Unnamed: 0,SMILES,target_id,Kd
0,BrCCOc1ccc2nc3ccc(=O)cc3oc2c1,P05067,927.0
1,Br[Se]c1ccccc1,O75936,11000.0
2,Brc1[nH]c2cccc3[C@H]4C[C@H](CN[C@@H]4Cc1c23)C(...,P30680,7943.0
3,Brc1[nH]c2cccc3[C@H]4C[C@H](CN[C@@H]4Cc1c23)C(...,P28646,170.0
4,Brc1c(Br)c(Br)c2[nH]cnc2c1Br,P16083,18.1


In [13]:
# Preview the first five rows of the second table.
df2.head(n=5)

Unnamed: 0,SMILES,target_id,Kd
0,Brc1c(Br)c(Br)c2[nH]cnc2c1Br,P16083,18.1
1,Brc1ccc(CN2CCCC2)cc1,Q07889,1300000.0
2,Brc1cccc([C@@H]2C[C@H](c3cccc(Br)c3)n3nnnc3N2)c1,Q99814,65.3
3,Brc1cncc(-c2nnn[nH]2)c1,Q05397,95000.0
4,C#CCCCSC[C@H]1CN(Cc2c[nH]c3c(N)ncnc23)C[C@@H]1O,Q13126,0.94


In [14]:
# Stack the two tables row-wise and renumber the index from zero.
source_frames = [df1, df2]
df = pd.concat(source_frames, axis=0, ignore_index=True)


In [15]:
# Preview the first five rows of the stacked table.
df.head(n=5)

Unnamed: 0,SMILES,target_id,Kd
0,BrCCOc1ccc2nc3ccc(=O)cc3oc2c1,P05067,927.0
1,Br[Se]c1ccccc1,O75936,11000.0
2,Brc1[nH]c2cccc3[C@H]4C[C@H](CN[C@@H]4Cc1c23)C(...,P30680,7943.0
3,Brc1[nH]c2cccc3[C@H]4C[C@H](CN[C@@H]4Cc1c23)C(...,P28646,170.0
4,Brc1c(Br)c(Br)c2[nH]cnc2c1Br,P16083,18.1


In [16]:
# Count missing values per column.
df.isnull().sum()

SMILES       0
target_id    0
Kd           0
dtype: int64

In [17]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [18]:
# Number of rows remaining after removing missing values.
len(df.index)

61481

In [19]:
# Collapse repeated (SMILES, target_id) measurements to a single row.
# Sorting every key ascending puts the smallest Kd first within each pair,
# so keeping the first occurrence retains that smallest value.
pair_key = ['SMILES', 'target_id']
df = (
    df.sort_values(pair_key + ['Kd'], ascending=True)
      .drop_duplicates(subset=pair_key, keep='first')
      .reset_index(drop=True)
)

In [20]:
# Preview the first five rows after de-duplication.
df.head(n=5)

Unnamed: 0,SMILES,target_id,Kd
0,BrCCOc1ccc2nc3ccc(=O)cc3oc2c1,P05067,927.0
1,Br[Se]c1ccccc1,O75936,11000.0
2,Brc1[nH]c2cccc3[C@H]4C[C@H](CN[C@@H]4Cc1c23)C(...,P28646,170.0
3,Brc1[nH]c2cccc3[C@H]4C[C@H](CN[C@@H]4Cc1c23)C(...,P30680,7943.0
4,Brc1c(Br)c(Br)c2[nH]cnc2c1Br,P16083,18.1


In [21]:
# Number of rows remaining after de-duplication.
len(df.index)

61264

In [22]:
import pandas as pd
import numpy as np

In [23]:
# Keep only rows whose Kd is present (not NaN) and strictly positive,
# so that taking log10 of the column is well defined.
kd_present = df['Kd'].notna()
kd_positive = df['Kd'] > 0
df = df[kd_present & kd_positive]


In [24]:
# Replace Kd with its negative-log form, 9 - log10(Kd); the column keeps the name 'Kd'.
# NOTE(review): the offset of 9 presumably assumes Kd is expressed in nM — confirm.
log10_kd = np.log10(df['Kd'])
df['Kd'] = 9 - log10_kd

In [25]:
# Post-transform filter. The 'Kd' column now holds 9 - log10(Kd), so this removes
# NaNs and every row whose transformed value is <= 0 (i.e. original Kd >= 1e9).
# NOTE(review): this mirrors the pre-transform filter; confirm that discarding
# non-positive transformed values is intended and not a copy-paste leftover.
df = df.dropna(subset=['Kd'])
df = df.loc[df['Kd'].gt(0)]


In [26]:
# Sanity report restricted to the Kd column: any NaN, infinite, or exactly-zero entries?
kd = df['Kd']
print("NaN in Kd:", kd.isna().any())
print("Inf in Kd:", np.isinf(kd).any())
print("Zero in Kd:", (kd == 0).any())


NaN in Kd: False
Inf in Kd: False
Zero in Kd: False


In [None]:
# Persist the full cleaned table before splitting. The output folder is created
# first so the export does not raise OSError when 'Target_ID' is missing.
Path('Target_ID').mkdir(parents=True, exist_ok=True)
df.to_csv('Target_ID/no_split.csv', index=False)

In [None]:
# Hold out 10% of the rows as a test set; the fixed random_state makes the split repeatable.
split_frames = train_test_split(df, test_size=0.1, random_state=42)
train_df, test_df = split_frames

In [None]:
# Row counts of the train and test partitions, in that order.
tuple(map(len, (train_df, test_df)))

(55130, 6126)

In [None]:
# Write the train and test partitions to disk. The output folder is created
# first so the export does not raise OSError when 'Target_ID' is missing.
Path('Target_ID').mkdir(parents=True, exist_ok=True)
train_df.to_csv('Target_ID/Kd.csv', index=False)
test_df.to_csv('Target_ID/test_dataset.csv', index=False)