In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Load the two raw tables: one with a SMILES string per inhibitor,
# one with the measured values per inhibitor.
smiles_df = pd.read_csv('./raw_data/chemical_data_smiles.csv')
data_df = pd.read_csv('./raw_data/chemical_data.csv')

# Preview the SMILES table
smiles_df.head()

Unnamed: 0,inhibitor_name,smiles
0,Clemastine,C[C@@](C1=CC=CC=C1)(C2=CC=C(C=C2)Cl)OCC[C@H]3C...
1,Amiodarone,CCCCC1=C(C2=CC=CC=C2O1)C(=O)C3=CC(=C(C(=C3)I)O...
2,Trimeprazine,CC(CN1C2=CC=CC=C2SC3=CC=CC=C31)CN(C)C
3,Bosutinib,CN1CCN(CC1)CCCOC2=C(C=C3C(=C2)N=CC(=C3NC4=CC(=...
4,Toremifene,CN(C)CCOC1=CC=C(C=C1)/C(=C(/CCCl)\C2=CC=CC=C2)...


In [3]:
# Discard the EC90 column. Re-binding the name (instead of inplace=True) together
# with errors='ignore' lets this cell be re-run without a KeyError once the
# column has already been removed.
data_df = data_df.drop(columns=['EC90 (µM)'], errors='ignore')

# Positional rename: this relies on the frame holding exactly these four columns
# in this order; a different column count raises a length-mismatch error here.
data_df.columns = ['inhibitor_name', 'CC50_µM', 'EC50_µM', 'SI']

data_df.head()

Unnamed: 0,inhibitor_name,CC50_µM,EC50_µM,SI
0,Clemastine,5.9,0.81,7.96
1,Amiodarone,5.95,0.21,10.63
2,Trimeprazine,13.82,0.97,14.21
3,Bosutinib,17.12,1.3,13.58
4,Toremifene,17.14,1.92,8.92


In [4]:
def string_parser(x):
    """Convert one raw table value to ``float``.

    Recognised annotations and how they are reduced to a single number:

    * ``a~b``      -> ``b`` (the text after the first ``~``)
    * ``a ± b``    -> ``a`` (the text before ``±``)
    * ``≥a``, ``>a``, ``≤a``, ``<a`` -> ``a`` (the comparison sign is dropped)

    Annotations may be combined (e.g. ``'>100 ± 5'`` -> ``100.0``). Values that
    are not strings (numbers already parsed by pandas, including NaN) are
    passed straight to ``float``.

    Parameters
    ----------
    x : str or number
        The raw cell value.

    Returns
    -------
    float
        The numeric value with all annotations removed.

    Raises
    ------
    ValueError
        If the remaining text is not a valid float literal.
    TypeError
        If ``x`` is neither a string nor convertible by ``float`` (e.g. ``None``).
    """
    # Numeric input has nothing to strip; `'~' in x` would raise on it.
    if not isinstance(x, str):
        return float(x)

    text = x
    if '~' in text:
        text = text.split('~')[1]
    # Strip the uncertainty before looking for a comparison sign so that
    # combined forms such as '>100 ± 5' reduce to '>100' and then to '100'.
    if '±' in text:
        text = text.split('±')[0]
    for marker in ('≥', '>', '≤', '<'):
        if marker in text:
            text = text.split(marker)[1]
            break
    # float() tolerates the surrounding whitespace left over by the splits.
    return float(text)

In [5]:
# Convert EC50, CC50 and SI to float after parsing their string annotations.
# Columns that are already numeric (for example when this cell is re-run) are
# not sent through string_parser again, since there is nothing left to parse.
for column in ('EC50_µM', 'CC50_µM', 'SI'):
    values = data_df[column]
    if not pd.api.types.is_numeric_dtype(values):
        values = values.apply(string_parser)
    data_df[column] = values.astype(float)

# Preview the converted values
data_df.head()

Unnamed: 0,inhibitor_name,CC50_µM,EC50_µM,SI
0,Clemastine,5.9,0.81,7.96
1,Amiodarone,5.95,0.21,10.63
2,Trimeprazine,13.82,0.97,14.21
3,Bosutinib,17.12,1.3,13.58
4,Toremifene,17.14,1.92,8.92


In [6]:
# Attach the SMILES string to each measurement row, put the columns in output
# order, and keep only the first row per inhibitor. The steps are chained
# without inplace=True so no column-subset frame is mutated in place, which
# pandas can flag with SettingWithCopyWarning.
full_df = (
    data_df
    .merge(smiles_df, on='inhibitor_name')
    .loc[:, ['inhibitor_name', 'smiles', 'CC50_µM', 'EC50_µM', 'SI']]
    .drop_duplicates(subset=['inhibitor_name'])
)

full_df.head()

Unnamed: 0,inhibitor_name,smiles,CC50_µM,EC50_µM,SI
0,Clemastine,C[C@@](C1=CC=CC=C1)(C2=CC=C(C=C2)Cl)OCC[C@H]3C...,5.9,0.81,7.96
2,Amiodarone,CCCCC1=C(C2=CC=CC=C2O1)C(=O)C3=CC(=C(C(=C3)I)O...,5.95,0.21,10.63
4,Trimeprazine,CC(CN1C2=CC=CC=C2SC3=CC=CC=C31)CN(C)C,13.82,0.97,14.21
6,Bosutinib,CN1CCN(CC1)CCCOC2=C(C=C3C(=C2)N=CC(=C3NC4=CC(=...,17.12,1.3,13.58
8,Toremifene,CN(C)CCOC1=CC=C(C=C1)/C(=C(/CCCl)\C2=CC=CC=C2)...,17.14,1.92,8.92


In [7]:
# Write the merged table without the index column. The output folder is created
# first so the write does not fail when ./processed_data does not exist yet
# (e.g. on a fresh checkout).
output_path = Path('./processed_data/sars-cov-2_inhibitors_nature.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
full_df.to_csv(output_path, index=False)