In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; the data files read later in this
# notebook are addressed under that mount point. force_remount=True is passed
# so the mount is requested again even if one already exists (going by the
# flag's name — see the Colab docs for exact semantics).
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point, force_remount=True)

Mounted at /content/drive


<h2><b> This data preprocessing notebook creates the training and testing sets from DeepAffinity binding affinity data. </b></h2>


*   Combine multiple dataframes and drop irrelevant columns
*   Convert pKd to Kd (binding affinity measure) labels
*   Convert UniProt ID to protein name and organism (species)
*   Convert DeepAffinity Protein ID to sequence of amino acids



---

In [None]:
import pandas as pd

# Locations of the tab-separated DeepAffinity files on the mounted Drive
pair_tsv_path = '/content/drive/MyDrive/LLama2HealthCareChatBot-master/data/BindingAffinity_data/Kd_protein_compound_pair.tsv'
smiles_tsv_path = '/content/drive/MyDrive/LLama2HealthCareChatBot-master/data/BindingAffinity_data/dcid_smi.tsv'

# Protein-compound pairs together with their pKd measurements
protein_compound_pair = pd.read_csv(pair_tsv_path, sep='\t')
# Lookup table from DeepAffinity compound ID to canonical SMILES
smiles = pd.read_csv(smiles_tsv_path, sep='\t')

## Convert pKd to Kd

In [None]:
# SMILES table: keep only rows without missing values
df2 = smiles.dropna()

# Pair table: discard incomplete rows, then the unneeded CID column, and
# finally derive Kd from pKd using Kd = 10 ** (-pKd)
df1 = (
    protein_compound_pair
    .dropna()
    .drop(columns=['CID'])
    .assign(Kd=lambda frame: frame['pKd_[M]'].apply(lambda pkd: 10 ** (-pkd)))
)

# Summary statistics of the numeric columns
print(df1.describe())

# First rows of Dataframe 1
df1.head(5)

            pKd_[M]            Kd
count  17819.000000  1.781900e+04
mean       6.446997  1.090912e-04
std        1.573819  7.613733e-04
min        2.000000  1.000000e-11
25%        5.460000  3.100000e-08
50%        6.400117  3.980000e-07
75%        7.508638  3.467370e-06
max       11.000000  1.000000e-02


Unnamed: 0,DeepAffinity Protein ID,Uniprot ID,DeepAffinity Compound ID,pKd_[M],Kd
22,P6FH,P00918,s02x,9.337242,4.6e-10
23,P6FH,P00918,52gg,9.309804,4.9e-10
24,P6FH,P00918,gxar,9.080922,8.3e-10
25,P6FH,P00918,37o6,9.69897,2e-10
26,P6FH,P00918,5bx7,9.79588,1.6e-10


In [None]:
# Summarise DataFrame 2 (compound ID -> canonical SMILES), then preview its
# first five rows as the cell's rich output
df2_summary = df2.describe()
print(df2_summary)
df2.head()

       DeepAffinity Compound ID  \
count                    598206   
unique                   598206   
top                        y74g   
freq                          1   

                                          Canonical SMILE  
count                                              598206  
unique                                             570009  
top     CCC(C)C(C(=O)NC(CC(C)C)C(=O)NC(CC(=O)N)C(=O)N)...  
freq                                                   28  


Unnamed: 0,DeepAffinity Compound ID,Canonical SMILE
0,y74g,CN1CCC(CC1)COC2=C(C=C3C(=C2)N=CN=C3NC4=C(C=C(C...
1,nkqn,C1=CC=C(C=C1)CC2C(C(C(N(C(=O)N2CC=CC3=CNN=C3)C...
2,1ipu,C1CC1CN2C(C(C(C(N(C2=O)CC=CC3=CNN=C3)CC4=CC=CC...
3,dapw,C1CC1CN2C(C(C(C(N(C2=O)CCCCCCO)CC3=CC=CC=C3)O)...
4,neuu,C1CC1CN2C(C(C(C(N(C2=O)CCCCCO)CC3=CC=CC=C3)O)O...


## Build the first merged dataframe containing SMILES and protein ID

In [None]:
# Join the cleaned pair table (df1) with the cleaned SMILES table (df2) on the
# DeepAffinity compound ID. The null-filtered df2 is used instead of the raw
# `smiles` frame so that a compound with a missing SMILES string cannot pass
# through the join. merge() returns a new frame; neither input is modified.
# how='inner' keeps only the pairs whose compound ID has a SMILES entry
# (the recorded run of this notebook reported 17819 rows).
df3 = df1.merge(df2, on='DeepAffinity Compound ID', how='inner')

# The compound ID served only as the join key, so drop it afterwards
df3 = df3.drop(columns=['DeepAffinity Compound ID'])

print(df3.describe())
df3.head(5)


            pKd_[M]            Kd
count  17819.000000  1.781900e+04
mean       6.446997  1.090912e-04
std        1.573819  7.613733e-04
min        2.000000  1.000000e-11
25%        5.460000  3.100000e-08
50%        6.400117  3.980000e-07
75%        7.508638  3.467370e-06
max       11.000000  1.000000e-02


Unnamed: 0,DeepAffinity Protein ID,Uniprot ID,pKd_[M],Kd,Canonical SMILE
0,P6FH,P00918,9.337242,4.6e-10,CC1=CC=C(C=C1)CNS(=O)(=O)C2=CC=C(S2)S(=O)(=O)N
1,P6FH,P00918,9.309804,4.9e-10,COC1=CC=C(C=C1)CNS(=O)(=O)C2=CC=C(S2)S(=O)(=O)N
2,P6FH,P00918,9.080922,8.3e-10,C1=CSC(=C1)CNS(=O)(=O)C2=CC=C(S2)S(=O)(=O)N
3,P6FH,P00918,9.69897,2e-10,C1C(C2=C(SC(=C2)S(=O)(=O)N)S(=O)(=O)N1CC3=CC=C...
4,P6FH,P00918,9.79588,1.6e-10,COC1=CC=C(C=C1)N2CC(C3=C(S2(=O)=O)SC(=C3)S(=O)...


## Obtain Kd labels (low, medium, high)

In [None]:
# Map a Kd value onto a coarse affinity category
def label_kd(kd_value):
    """Return 'High', 'Medium' or 'Low' for a dissociation constant.

    A smaller Kd means tighter binding, so the categories are:
      * 'High'   when kd_value <= 1e-9
      * 'Medium' when kd_value <= 1e-6
      * 'Low'    for anything else (including values that compare false
        against both thresholds, such as NaN)
    """
    # Upper bounds are checked from tightest to loosest; first match wins
    for upper_bound, category in ((1e-9, 'High'), (1e-6, 'Medium')):
        if kd_value <= upper_bound:
            return category
    return 'Low'

# Derive the categorical 'Label' column from each row's Kd value
df3['Label'] = df3['Kd'].map(label_kd)

# Preview the first 20 labelled rows
print(df3.head(20))

# Tally how many rows fall into each category of the 'Label' column
affinity_counts = df3['Label'].value_counts()

# Show the per-category counts
print(affinity_counts)

   DeepAffinity Protein ID Uniprot ID    pKd_[M]            Kd  \
0                     P6FH     P00918   9.337242  4.600000e-10   
1                     P6FH     P00918   9.309804  4.900000e-10   
2                     P6FH     P00918   9.080922  8.300000e-10   
3                     P6FH     P00918   9.698970  2.000000e-10   
4                     P6FH     P00918   9.795880  1.600000e-10   
5                     P6FH     P00918   9.494850  3.200000e-10   
6                     P6FH     P00918  10.000000  1.000000e-10   
7                     P6FH     P00918  10.000000  1.000000e-10   
8                     P6FH     P00918   8.769551  1.700000e-09   
9                     P6FH     P00918   9.886057  1.300000e-10   
10                    BJMK     P00915   6.698970  2.000000e-07   
11                    P6FH     P00918   6.628932  2.350000e-07   
12                    BJMK     P00915   7.119186  7.600000e-08   
13                    P6FH     P00918   7.096910  8.000000e-08   
14        

## Create textual description of each row for fine-tuning

In [None]:
# Build one natural-language sentence per row from its SMILES string,
# DeepAffinity protein ID and affinity label
def _describe_binding(row):
    """Return the sentence describing a single compound-protein row."""
    return (
        f"Compound with SMILES sequence of {row['Canonical SMILE']} "
        f"binds to Protein {row['DeepAffinity Protein ID']} "
        f"with {row['Label']} binding affinity."
    )

df3['Textual Description'] = df3.apply(_describe_binding, axis=1)
df3.head(5)

Unnamed: 0,DeepAffinity Protein ID,Uniprot ID,pKd_[M],Kd,Canonical SMILE,Label,Textual Description
0,P6FH,P00918,9.337242,4.6e-10,CC1=CC=C(C=C1)CNS(=O)(=O)C2=CC=C(S2)S(=O)(=O)N,High,Compound with SMILES sequence of CC1=CC=C(C=C1...
1,P6FH,P00918,9.309804,4.9e-10,COC1=CC=C(C=C1)CNS(=O)(=O)C2=CC=C(S2)S(=O)(=O)N,High,Compound with SMILES sequence of COC1=CC=C(C=C...
2,P6FH,P00918,9.080922,8.3e-10,C1=CSC(=C1)CNS(=O)(=O)C2=CC=C(S2)S(=O)(=O)N,High,Compound with SMILES sequence of C1=CSC(=C1)CN...
3,P6FH,P00918,9.69897,2e-10,C1C(C2=C(SC(=C2)S(=O)(=O)N)S(=O)(=O)N1CC3=CC=C...,High,Compound with SMILES sequence of C1C(C2=C(SC(=...
4,P6FH,P00918,9.79588,1.6e-10,COC1=CC=C(C=C1)N2CC(C3=C(S2(=O)=O)SC(=C3)S(=O)...,High,Compound with SMILES sequence of COC1=CC=C(C=C...


## Convert UniProt ID to protein name and organism

In [None]:
# Export the distinct UniProt IDs so they can be submitted to UniProt for
# ID mapping.
# The unique IDs are kept in their own Series. Assigning the de-duplicated
# Series back to df3['Uniprot ID'] would align on the index and set every
# repeated ID to NaN, which corrupts df3 for the later merge on this column
# and fills the CSV with blank lines.
unique_uniprot_ids = df3['Uniprot ID'].drop_duplicates()
unique_uniprot_ids.to_csv('uniprot.csv', index=False, header=True)

In [None]:
# Load the ID-mapping table obtained from UniProt, rename its 'From' column to
# 'Uniprot ID' (the key name used by the other frames) and drop the 'Entry'
# column
id_map_path = '/content/drive/MyDrive/LLama2HealthCareChatBot-master/data/BindingAffinity_data/id_mapping.tsv'
id_map = (
    pd.read_csv(id_map_path, sep='\t')
    .rename(columns={'From': 'Uniprot ID'})
    .drop(columns=['Entry'])
)
id_map.head(5)

Unnamed: 0,Uniprot ID,Protein names,Organism
0,P00918,Carbonic anhydrase 2 (EC 4.2.1.1) (Carbonate d...,Homo sapiens (Human)
1,P00915,Carbonic anhydrase 1 (EC 4.2.1.1) (Carbonate d...,Homo sapiens (Human)
2,P47811,Mitogen-activated protein kinase 14 (MAP kinas...,Mus musculus (Mouse)
3,Q16539,Mitogen-activated protein kinase 14 (MAP kinas...,Homo sapiens (Human)
4,O43318,Mitogen-activated protein kinase kinase kinase...,Homo sapiens (Human)


In [None]:
# Attach protein name and organism (species) to every row via its UniProt ID.
# df_before_merge is a second name for df3 (an alias, not a copy).
df_before_merge = df3

# Keep a single mapping row per UniProt ID. With how='left', a key that occurs
# more than once in id_map would duplicate the matching df3 rows and inflate
# the dataset; keep='first' retains the first mapping listed for each ID.
id_map_unique = id_map.drop_duplicates(subset='Uniprot ID', keep='first')

merged_df = pd.merge(df3, id_map_unique, on='Uniprot ID', how='left')

# A left merge against unique keys must preserve the number of rows
assert len(merged_df) == len(df3), "left merge on 'Uniprot ID' changed the row count"
merged_df.head(5)

Unnamed: 0,DeepAffinity Protein ID,Uniprot ID,pKd_[M],Kd,Canonical SMILE,Label,Textual Description,Protein names,Organism
0,P6FH,P00918,9.337242,4.6e-10,CC1=CC=C(C=C1)CNS(=O)(=O)C2=CC=C(S2)S(=O)(=O)N,High,Compound with SMILES sequence of CC1=CC=C(C=C1...,Carbonic anhydrase 2 (EC 4.2.1.1) (Carbonate d...,Homo sapiens (Human)
1,P6FH,P00918,9.309804,4.9e-10,COC1=CC=C(C=C1)CNS(=O)(=O)C2=CC=C(S2)S(=O)(=O)N,High,Compound with SMILES sequence of COC1=CC=C(C=C...,Carbonic anhydrase 2 (EC 4.2.1.1) (Carbonate d...,Homo sapiens (Human)
2,P6FH,P00918,9.080922,8.3e-10,C1=CSC(=C1)CNS(=O)(=O)C2=CC=C(S2)S(=O)(=O)N,High,Compound with SMILES sequence of C1=CSC(=C1)CN...,Carbonic anhydrase 2 (EC 4.2.1.1) (Carbonate d...,Homo sapiens (Human)
3,P6FH,P00918,9.69897,2e-10,C1C(C2=C(SC(=C2)S(=O)(=O)N)S(=O)(=O)N1CC3=CC=C...,High,Compound with SMILES sequence of C1C(C2=C(SC(=...,Carbonic anhydrase 2 (EC 4.2.1.1) (Carbonate d...,Homo sapiens (Human)
4,P6FH,P00918,9.79588,1.6e-10,COC1=CC=C(C=C1)N2CC(C3=C(S2(=O)=O)SC(=C3)S(=O)...,High,Compound with SMILES sequence of COC1=CC=C(C=C...,Carbonic anhydrase 2 (EC 4.2.1.1) (Carbonate d...,Homo sapiens (Human)


## Map DeepAffinity Protein ID to protein sequence

In [None]:
# Load the DeepAffinity protein-ID -> sequence table, inner-join it onto
# merged_df by protein ID, and write the result to merged_sequence.csv

seq_table_path = '/content/drive/MyDrive/LLama2HealthCareChatBot-master/data/BindingAffinity_data/dpid_seq.tsv'
id_to_seq = pd.read_csv(seq_table_path, sep='\t')

# Both frames use the same column name for the join key
protein_key = 'DeepAffinity Protein ID'
merged_df_2 = merged_df.merge(id_to_seq, left_on=protein_key, right_on=protein_key, how='inner')

merged_df_2.to_csv('merged_sequence.csv', index=False)

## Perform Train Test Split (70% Train, 30% Test)

In [None]:
from sklearn.model_selection import train_test_split

# The target (y) is the affinity label; the features (X) are every column
# except the label and the two numeric affinity columns it was derived from
non_feature_columns = ['Label', 'pKd_[M]', 'Kd']
y = merged_df_2['Label']
X = merged_df_2.drop(columns=non_feature_columns)

# 70-30 train-test split; random_state is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Report the size of each resulting piece
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

# Re-attach the labels to their feature rows (aligned on the shared index)
train_data = pd.concat([X_train, y_train], axis='columns')
test_data = pd.concat([X_test, y_test], axis='columns')
print("\n")
print("X")
X.head(5)


X_train shape: (12478, 7)
y_train shape: (12478,)
X_test shape: (5348, 7)
y_test shape: (5348,)


X


Unnamed: 0,DeepAffinity Protein ID,Uniprot ID,Canonical SMILE,Textual Description,Protein names,Organism,Sequence
0,P6FH,P00918,CC1=CC=C(C=C1)CNS(=O)(=O)C2=CC=C(S2)S(=O)(=O)N,Compound with SMILES sequence of CC1=CC=C(C=C1...,Carbonic anhydrase 2 (EC 4.2.1.1) (Carbonate d...,Homo sapiens (Human),MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...
1,P6FH,P00918,COC1=CC=C(C=C1)CNS(=O)(=O)C2=CC=C(S2)S(=O)(=O)N,Compound with SMILES sequence of COC1=CC=C(C=C...,Carbonic anhydrase 2 (EC 4.2.1.1) (Carbonate d...,Homo sapiens (Human),MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...
2,P6FH,P00918,C1=CSC(=C1)CNS(=O)(=O)C2=CC=C(S2)S(=O)(=O)N,Compound with SMILES sequence of C1=CSC(=C1)CN...,Carbonic anhydrase 2 (EC 4.2.1.1) (Carbonate d...,Homo sapiens (Human),MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...
3,P6FH,P00918,C1C(C2=C(SC(=C2)S(=O)(=O)N)S(=O)(=O)N1CC3=CC=C...,Compound with SMILES sequence of C1C(C2=C(SC(=...,Carbonic anhydrase 2 (EC 4.2.1.1) (Carbonate d...,Homo sapiens (Human),MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...
4,P6FH,P00918,COC1=CC=C(C=C1)N2CC(C3=C(S2(=O)=O)SC(=C3)S(=O)...,Compound with SMILES sequence of COC1=CC=C(C=C...,Carbonic anhydrase 2 (EC 4.2.1.1) (Carbonate d...,Homo sapiens (Human),MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...


## Write to Train and Test files for other subsystems

In [None]:
# Write the train and test frames to their own CSV files (header row kept,
# index column omitted)
for split_frame, csv_name in (
    (train_data, 'binding_affinity_train.csv'),
    (test_data, 'binding_affinity_test.csv'),
):
    split_frame.to_csv(csv_name, index=False, header=True)