# Creating Feature Vectors for Drugs Target Pairs

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

from sklearn.feature_extraction.text import CountVectorizer

## Protein Descriptors

In [2]:
# reading in FASTA files containing the Amino Acid sequences
def read_fasta(file_names):
    """
    Reads a FASTA files and returns a pd DataFrame with the protein names and Amino Acid sequences
    # Params
    file_names: list of fasta file names to read
    
    # Returns
    df: the DataFrame containing the names and sequences from all inputted files
    """
    df = pd.DataFrame()
    names = []
    sequences = []
    ids = []
    db_id = []
    for file_name in file_names:
        seq = ""
        with open(file_name, "r") as f:
            for line in f:
                # if its reading a comment line extract the name of the target
                if line[0] == '>':
                    sequences.append(seq)
                    seq = ""
                    names.append(' '.join(str(v) for v in line.split(' (D')[0].split()[2:]).lower())
                    ids.append(line.split()[1])
                # if its not on a comment line extract the sequence
                else:
                    seq += line
            # adding the last sequence
            sequences.append(seq)
            # removing empty strings added to list
            sequences.remove("")
    df['Name'] = names
    df['uniprot id'] = ids
    df['sequence'] = sequences
    return df

In [3]:
# reading all domain info files
domain_df = pd.read_csv('./drug_target_data/protein_domains/drugbank_target_domains1.tsv', sep='\t')
domain_df = domain_df.append(pd.read_csv('./drug_target_data/protein_domains/drugbank_target_domains2.tsv', sep='\t'), ignore_index=True)
domain_df = domain_df.append(pd.read_csv('./drug_target_data/protein_domains/drugbank_transporter_domains.tsv', sep='\t'), ignore_index=True)
domain_df = domain_df.append(pd.read_csv('./drug_target_data/protein_domains/drugbank_enzyme_domains.tsv', sep='\t'), ignore_index=True)
domain_df = domain_df.append(pd.read_csv('./drug_target_data/protein_domains/drugbank_carrier_domains.tsv', sep='\t'), ignore_index=True)

domain_df.rename(columns={'Accession':'domains'}, inplace=True)

# removing all info except uniprot id from first column
domain_df = domain_df.rename(columns={'Query':'UniProt ID'})
domain_df['UniProt ID'] = domain_df['UniProt ID'].apply(lambda x: x.split()[3])

domain_df.head()

Unnamed: 0,UniProt ID,Hit type,PSSM-ID,From,To,E-Value,Bitscore,domains,Short name,Incomplete,Superfamily
0,P45059,non-specific,185060,25,584,0.0,688.094,PRK15105,PRK15105,-,cl33083
1,P45059,superfamily,185060,25,584,0.0,688.094,cl33083,PRK15105 superfamily,-,-
2,P45059,specific,223839,28,585,9.30512e-168,490.767,COG0768,FtsI,-,cl34037
3,P45059,superfamily,223839,28,585,9.30512e-168,490.767,cl34037,FtsI superfamily,-,-
4,P45059,non-specific,131269,44,588,4.29878e-110,343.718,TIGR02214,spoVD_pbp,-,cl31183


In [4]:
# reading the fasta files with the target amino acid sequences
seq_df = read_fasta(['./drug_target_data/target_structures/targets.fasta', './drug_target_data/target_structures/enzymes.fasta', './drug_target_data/target_structures/carriers.fasta', './drug_target_data/target_structures/transporters.fasta'])
seq_df.drop_duplicates(inplace=True)
seq_df.sequence = seq_df.sequence.apply(lambda s: s.replace('\n', ''))
seq_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5177 entries, 0 to 6139
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Name        5177 non-null   object
 1   uniprot id  5177 non-null   object
 2   sequence    5177 non-null   object
dtypes: object(3)
memory usage: 161.8+ KB


In [6]:
# removing proteins that are not in the domain dataframe; these are proteins that we don't have domain data for
seq_df = seq_df.loc[seq_df['uniprot id'].isin(domain_df['UniProt ID'])]
# removing duplicates
seq_df.drop_duplicates(inplace=True)

seq_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5153 entries, 0 to 6139
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Name        5153 non-null   object
 1   uniprot id  5153 non-null   object
 2   sequence    5153 non-null   object
dtypes: object(3)
memory usage: 161.0+ KB


In [7]:
seq_df.head()

Unnamed: 0,Name,uniprot id,sequence
0,peptidoglycan synthase ftsi,P45059,MVKFNSSRKSGKSKKTIRKLTAPETVKQNKPQKVFEKCFMRGRYML...
1,histidine decarboxylase,P19113,MMEPEEYRERGREMVDYICQYLSTVRERRVTPDVQPGYLRAQLPES...
2,"glutaminase liver isoform, mitochondrial",Q9UI32,MRSMKALQKALSRAGSHCGRGGWGHPSRSPLLGGGVRHHLSEAAAQ...
3,coagulation factor xiii a chain,P00488,MSETSRTAFGGRRAVPPNNSNAAEDDLPTVELQGVVPRGVNLQEFL...
4,"nitric oxide synthase, inducible",P35228,MACPWKFLFKTKFHQYAMNGEKDINNNVEKAPCATSSPVTQDDLQY...


In [8]:
num_proteins = seq_df.shape[0]
num_domains = len(set(domain_df['domains']))
proteins = list(seq_df['uniprot id'])

### Domain Features

In [78]:
protein_domains = dict()

# interating through data and adding all domains of a single protein to the dict
current = domain_df['UniProt ID'][0]
protein_domains[current] = []
for i, r in tqdm(domain_df.iterrows()):
    # if the current protein is not the same as the previous ones
    if current != r['UniProt ID']:
        # only keeping unique domains for each protein
        protein_domains[current] = list(set(protein_domains[current]))
        current = r['UniProt ID']
        protein_domains[current] = []
    protein_domains[current].append(r['domains'])
# keeping the unique domains for the last protein bc it isn't handled in the loop
protein_domains[current] = list(set(protein_domains[current]))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [128]:
# creating adjacency matrix for protein domains
adjacency_matrix = pd.DataFrame(np.zeros((num_proteins, num_proteins), dtype=np.int64), columns=list(set(domain_df['UniProt ID'])))
adjacency_matrix['UniProt ID'] = list(set(domain_df['UniProt ID']))
adjacency_matrix.set_index('UniProt ID', inplace=True)
adjacency_matrix.head()

Unnamed: 0_level_0,Q8II92,P08337,O70038,P05091,P07024,Q8WWQ8,P32396,Q9BX79,P31153,P16070,...,P11233,P95780,Q07817,P14090,Q9RHZ6,Q9NNW7,P0AEG4,P04070,P50914,Q9BY07
UniProt ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Q8II92,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
P08337,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
O70038,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
P05091,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
P07024,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [92]:
adjacency_matrix.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5153 entries, Q8II92 to Q9BY07
Columns: 5153 entries, Q8II92 to Q9BY07
dtypes: int64(5153)
memory usage: 202.6+ MB


In [130]:
# creating adjacency matrix
for k, v in tqdm(protein_domains.items()):
    prot = []
    for key, value in protein_domains.items():
        if len(set(v).intersection(set(value))) > 0:
            prot.append(key)
    for p in prot:
        adjacency_matrix.loc[k, p] = 1

HBox(children=(FloatProgress(value=0.0, max=5153.0), HTML(value='')))




In [143]:
adjacency_matrix

Unnamed: 0_level_0,Q8II92,P08337,O70038,P05091,P07024,Q8WWQ8,P32396,Q9BX79,P31153,P16070,...,P11233,P95780,Q07817,P14090,Q9RHZ6,Q9NNW7,P0AEG4,P04070,P50914,Q9BY07
UniProt ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Q8II92,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
P08337,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
O70038,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
P05091,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
P07024,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q9NNW7,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
P0AEG4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
P04070,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
P50914,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [55]:
adjacency_matrix.to_csv('./drug_target_data/protein_adjacency_matrix.csv')

## Drug Descriptors

In [5]:
def read_smi(file_path):
    smi = []
    with open(file_path, "r") as f:
        for line in f:
            smi.append(line.replace('\n', '').replace('\t', ''))
    return pd.DataFrame(data={"SMILES":smi})

In [2]:
descriptors_df = pd.read_csv('./drug_target_data/drug_descriptors/approved_drug_descriptors.csv')

In [3]:
descriptors_df

Unnamed: 0,DrugBank ID,Name,SMILES,ALogPS_logP,ALogPS_logS,nA:(CDK2),nR:(CDK2),nN:(CDK2),nD:(CDK2),nC:(CDK2),...,SYMS4:(Mersy),SYMS5X:(Mersy),SYMS5Y:(Mersy),SYMS5Z:(Mersy),SYMS5:(Mersy),SYMS6X:(Mersy),SYMS6Y:(Mersy),SYMS6Z:(Mersy),SYMS6:(Mersy),CHIR:(Mersy)
0,DB00007,leuprolide,CCNC(=O)[C@@H]1CCCN1C(=O)[C@@H](NC(=O)[C@@H](N...,1.04,-4.55,9.0,1.0,0.0,0.0,0.0,...,0.300,0.204,0.260,0.402,0.294,0.214,0.260,0.401,0.297,0.652
1,DB00014,goserelin,OC[C@@H](C(=O)N[C@H](C(=O)N[C@@H](C(=O)N[C@H](...,0.30,-4.65,9.0,0.0,0.0,0.0,0.0,...,0.283,0.247,0.246,0.339,0.279,0.255,0.260,0.340,0.286,0.648
2,DB00035,desmopressin,NC(=O)C[C@@H]1NC(=O)[C@H](CCC(=O)N)NC(=O)[C@H]...,-1.01,-3.99,7.0,1.0,1.0,0.0,1.0,...,0.297,0.272,0.314,0.305,0.297,0.272,0.308,0.309,0.296,0.670
3,DB00091,cyclosporine,C/C=C/C[C@H]([C@H]([C@H]1C(=O)N[C@@H](CC)C(=O)...,4.12,-5.09,10.0,0.0,0.0,0.0,0.0,...,0.448,0.453,0.503,0.374,0.446,0.454,0.529,0.388,0.460,0.489
4,DB00104,octreotide,NCCCC[C@@H]1NC(=O)[C@H](NC(=O)[C@H](Cc2ccccc2)...,0.42,-4.93,7.0,0.0,0.0,0.0,2.0,...,0.348,0.350,0.364,0.323,0.346,0.362,0.353,0.326,0.347,0.610
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2299,DB15598,ferric maltol,O=c1ccoc(c1[O-])C.O=c1ccoc(c1[O-])C.O=c1ccoc(c...,-0.24,0.03,0.0,0.0,0.0,0.0,0.0,...,0.706,0.781,0.605,0.655,0.690,0.792,0.617,0.672,0.703,0.210
2300,DB15617,ferric derisomaltose,OC[C@@H]([C@H]([C@@H]([C@@H](CO[C@H]1O[C@H](CO...,-3.17,-0.25,0.0,0.0,0.0,0.0,0.0,...,0.451,0.483,0.392,0.481,0.454,0.493,0.397,0.482,0.459,0.504
2301,DB15678,calcium undecylenate,C=CCCCCCCCCC(=O)[O-].C=CCCCCCCCCC(=O)[O-].[Ca+2],3.84,-3.99,0.0,0.0,0.0,0.0,0.0,...,0.676,0.585,0.566,0.756,0.647,0.601,0.582,0.766,0.661,0.237
2302,DB15685,selpercatinib,N#Cc1cnn2c1c(cc(c2)OCC(O)(C)C)c1ccc(nc1)N1CC2C...,3.03,-4.24,0.0,0.0,0.0,0.0,0.0,...,0.326,0.315,0.258,0.407,0.330,0.294,0.260,0.398,0.320,0.624


No need to run the cell below if you have already ran it before and written the descriptors to a file.

In [35]:
# dropping useless columns
descriptors_df.drop(columns=['CASRN', 'EXTERNALID', 'N', 'NAME', 'NAME.1', 'ARTICLEID', 'PUBMEDID', 'PAGE', 'TABLE'], inplace=True)
# counting the number of columns that are dropped due to errors
num_dropped = descriptors_df.loc[descriptors_df['ERROR'] != '-'].shape[0]
print(f'Dropping {num_dropped} drugs due to errors in desc calculation')
# removing drugs that gave an error
descriptors_df = descriptors_df.loc[descriptors_df['ERROR'] == '-'].drop(columns=['ERROR'])
descriptors_df.dropna(inplace=True)
descriptors_df.reset_index(inplace=True, drop=True)
descriptors_df

Dropping 176 drugs due to errors in desc calculation


Unnamed: 0,SMILES,ALogPS_logP,ALogPS_logS,nA:(CDK2),nR:(CDK2),nN:(CDK2),nD:(CDK2),nC:(CDK2),nF:(CDK2),nQ:(CDK2),...,SYMS4:(Mersy),SYMS5X:(Mersy),SYMS5Y:(Mersy),SYMS5Z:(Mersy),SYMS5:(Mersy),SYMS6X:(Mersy),SYMS6Y:(Mersy),SYMS6Z:(Mersy),SYMS6:(Mersy),CHIR:(Mersy)
0,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...,1.04,-4.55,9.0,1.0,0.0,0.0,0.0,2.0,0.0,...,0.3,0.204,0.26,0.402,0.294,0.214,0.26,0.401,0.297,0.652
1,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,0.3,-4.65,9.0,0.0,0.0,0.0,0.0,2.0,0.0,...,0.283,0.247,0.246,0.339,0.279,0.255,0.26,0.34,0.286,0.648
2,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...,-1.01,-3.99,7.0,1.0,1.0,0.0,1.0,4.0,1.0,...,0.297,0.272,0.314,0.305,0.297,0.272,0.308,0.309,0.296,0.67
3,CC[C@@H]1NC(=O)[C@H]([C@H](O)[C@H](C)C\C=C\C)N...,4.12,-5.09,10.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.448,0.453,0.503,0.374,0.446,0.454,0.529,0.388,0.46,0.489
4,C[C@@H](O)[C@@H](CO)NC(=O)[C@@H]1CSSC[C@H](NC(...,0.42,-4.93,7.0,0.0,0.0,0.0,2.0,4.0,0.0,...,0.348,0.35,0.364,0.323,0.346,0.362,0.353,0.326,0.347,0.61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2439,[Fe+3].CC1=C([O-])C(=O)C=CO1.CC1=C([O-])C(=O)C...,-0.24,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.706,0.781,0.605,0.655,0.69,0.792,0.617,0.672,0.703,0.21
2440,[Fe+3].OC[C@H](O)[C@@H](O)[C@H](O)[C@H](O)CO[C...,-3.17,-0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.451,0.483,0.392,0.481,0.454,0.493,0.397,0.482,0.459,0.504
2441,[Ca++].[O-]C(=O)CCCCCCCCC=C.[O-]C(=O)CCCCCCCCC=C,3.84,-3.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.676,0.585,0.566,0.756,0.647,0.601,0.582,0.766,0.661,0.237
2442,COC1=NC=C(CN2C3CC2CN(C3)C2=CC=C(C=N2)C2=CC(OCC...,3.03,-4.24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.326,0.315,0.258,0.407,0.33,0.294,0.26,0.398,0.32,0.624


Writing SMILES in `descriptor_df` to .smi file to convert the SMILES to canonical SMILES (.can). Adding the SMILES back into the dataframe after

In [18]:
with open('./drug_target_data/drug_structures/descriptor_smiles.smi', 'w') as f:
    for i, r in descriptors_df.iterrows():
        f.write(r['SMILES'] + '\n')

Reading canonical SMILES after conversion and adding them back into the descriptors dataframe (they are in the same order)

In [6]:
descriptor_unique_smiles = read_smi('./drug_target_data/drug_structures/descriptor_smiles.can')
descriptors_df['SMILES'] = descriptor_unique_smiles['SMILES']

Reading drug SMILES data from DrugBank to add drug names to the descriptor DataFrame

In [29]:
def read_smi2(file_path):
    ids = []
    smi = []
    name = []
    with open(file_path, "r") as f:
        for line in f:
            line = line.replace('\n', '').split(';')
            name.append(line[1].lower())
            line[0] = line[0].split('\t')
            smi.append(line[0][0])
            ids.append(line[0][1])
    return pd.DataFrame(data={'DrugBank ID':ids, "Name":name, "SMILES":smi})

In [37]:
smi_df = read_smi2('./drug_target_data/drug_structures/approved_drug.can')
# removing drugs that are not in the descriptor DataFrame
print(f"Removing {smi_df.shape[0]-smi_df.loc[smi_df['SMILES'].isin(descriptors_df['SMILES'])].shape[0]} drugs from smi_df because they are not in the descriptors DataFrame")
smi_df = smi_df.loc[smi_df['SMILES'].isin(descriptors_df['SMILES'])]

Removing 164 drugs from smi_df because they are not in the descriptors DataFrame


In [41]:
descriptors_df = pd.merge(smi_df, descriptors_df, how='right', on=['SMILES'])
print(f"Merged the dataframes to add drug names to the descriptors. Removing {descriptors_df['DrugBank ID'].isna().sum()} drugs that don't have IDs (only SMILES)")
descriptors_df.dropna(subset=['DrugBank ID'], inplace=True)

Merged the dataframes to add drug names to the descriptors. Removing 140 drugs that don't have IDs (only SMILES)


In [48]:
smi_df

Unnamed: 0,DrugBank ID,Name,SMILES
1,DB00007,leuprolide,CCNC(=O)[C@@H]1CCCN1C(=O)[C@@H](NC(=O)[C@@H](N...
2,DB00014,goserelin,OC[C@@H](C(=O)N[C@H](C(=O)N[C@@H](C(=O)N[C@H](...
4,DB00035,desmopressin,NC(=O)C[C@@H]1NC(=O)[C@H](CCC(=O)N)NC(=O)[C@H]...
8,DB00091,cyclosporine,C/C=C/C[C@H]([C@H]([C@H]1C(=O)N[C@@H](CC)C(=O)...
9,DB00104,octreotide,NCCCC[C@@H]1NC(=O)[C@H](NC(=O)[C@H](Cc2ccccc2)...
...,...,...,...
2462,DB15598,ferric maltol,O=c1ccoc(c1[O-])C.O=c1ccoc(c1[O-])C.O=c1ccoc(c...
2463,DB15617,ferric derisomaltose,OC[C@@H]([C@H]([C@@H]([C@@H](CO[C@H]1O[C@H](CO...
2464,DB15678,calcium undecylenate,C=CCCCCCCCCC(=O)[O-].C=CCCCCCCCCC(=O)[O-].[Ca+2]
2466,DB15685,selpercatinib,N#Cc1cnn2c1c(cc(c2)OCC(O)(C)C)c1ccc(nc1)N1CC2C...


In [45]:
descriptors_df

Unnamed: 0,DrugBank ID,Name,SMILES,ALogPS_logP,ALogPS_logS,nA:(CDK2),nR:(CDK2),nN:(CDK2),nD:(CDK2),nC:(CDK2),...,SYMS4:(Mersy),SYMS5X:(Mersy),SYMS5Y:(Mersy),SYMS5Z:(Mersy),SYMS5:(Mersy),SYMS6X:(Mersy),SYMS6Y:(Mersy),SYMS6Z:(Mersy),SYMS6:(Mersy),CHIR:(Mersy)
0,DB00007,leuprolide,CCNC(=O)[C@@H]1CCCN1C(=O)[C@@H](NC(=O)[C@@H](N...,1.04,-4.55,9.0,1.0,0.0,0.0,0.0,...,0.3,0.204,0.26,0.402,0.294,0.214,0.26,0.401,0.297,0.652
1,DB00014,goserelin,OC[C@@H](C(=O)N[C@H](C(=O)N[C@@H](C(=O)N[C@H](...,0.3,-4.65,9.0,0.0,0.0,0.0,0.0,...,0.283,0.247,0.246,0.339,0.279,0.255,0.26,0.34,0.286,0.648
2,DB00035,desmopressin,NC(=O)C[C@@H]1NC(=O)[C@H](CCC(=O)N)NC(=O)[C@H]...,-1.01,-3.99,7.0,1.0,1.0,0.0,1.0,...,0.297,0.272,0.314,0.305,0.297,0.272,0.308,0.309,0.296,0.67
3,DB00091,cyclosporine,C/C=C/C[C@H]([C@H]([C@H]1C(=O)N[C@@H](CC)C(=O)...,4.12,-5.09,10.0,0.0,0.0,0.0,0.0,...,0.448,0.453,0.503,0.374,0.446,0.454,0.529,0.388,0.46,0.489
4,DB00104,octreotide,NCCCC[C@@H]1NC(=O)[C@H](NC(=O)[C@H](Cc2ccccc2)...,0.42,-4.93,7.0,0.0,0.0,0.0,2.0,...,0.348,0.35,0.364,0.323,0.346,0.362,0.353,0.326,0.347,0.61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2439,DB15598,ferric maltol,O=c1ccoc(c1[O-])C.O=c1ccoc(c1[O-])C.O=c1ccoc(c...,-0.24,0.03,0.0,0.0,0.0,0.0,0.0,...,0.706,0.781,0.605,0.655,0.69,0.792,0.617,0.672,0.703,0.21
2440,DB15617,ferric derisomaltose,OC[C@@H]([C@H]([C@@H]([C@@H](CO[C@H]1O[C@H](CO...,-3.17,-0.25,0.0,0.0,0.0,0.0,0.0,...,0.451,0.483,0.392,0.481,0.454,0.493,0.397,0.482,0.459,0.504
2441,DB15678,calcium undecylenate,C=CCCCCCCCCC(=O)[O-].C=CCCCCCCCCC(=O)[O-].[Ca+2],3.84,-3.99,0.0,0.0,0.0,0.0,0.0,...,0.676,0.585,0.566,0.756,0.647,0.601,0.582,0.766,0.661,0.237
2442,DB15685,selpercatinib,N#Cc1cnn2c1c(cc(c2)OCC(O)(C)C)c1ccc(nc1)N1CC2C...,3.03,-4.24,0.0,0.0,0.0,0.0,0.0,...,0.326,0.315,0.258,0.407,0.33,0.294,0.26,0.398,0.32,0.624


In [46]:
# writing the cleaned descriptors DataFrame to the original file
descriptors_df.to_csv('./drug_target_data/drug_descriptors/approved_drug_descriptors.csv', index=False)