In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns

In [2]:
# Read the two raw inputs: the inhibitor assay table and the table that maps
# each inhibitor name to a SMILES string.
pubs_data = pd.read_csv('./raw_data/SARS-CoV-2_3CLpro_inhibitors.csv')
pubs_smiles_data = pd.read_csv('./raw_data/sars-cov-2_smiles.csv')

# Quick sanity check: (rows, columns) followed by the first five records.
print(pubs_data.shape)
pubs_data.head(n=5)

(39, 9)


Unnamed: 0,compound name,3CLpro inhibition IC50(μM),max resp (%),SARS-CoV-2 CPE EC50(μM),efficacy (%),Vero E6 cytotoxicity CC50(μM),cytotoxicity efficacy (%),stage of compound,SI_μM
0,walrycin B,0.26,86.6,3.55,51.43,4.25,99.67,research,
1,hydroxocobalamin,3.29,89.56,,0.0,,0.0,US FDA approved,
2,suramin sodium,6.5,99.49,,0.0,,0.0,clinical,
3,Z-DEVD-FMK,6.81,90.48,,0.0,,0.0,research,
4,LLL-12,9.84,82.98,,0.0,1.77,100.0,research,


In [3]:
# Same sanity check for the SMILES table: (rows, columns), then the first five rows.
print(pubs_smiles_data.shape)
pubs_smiles_data.head(n=5)

(39, 2)


Unnamed: 0,Inhibitor,SMILES
0,walrycin B,CN1C2=NC(=O)N(C(=O)C2=NC(=N1)C3=CC=C(C=C3)C(F)...
1,hydroxocobalamin,CC1=CC2=C(C=C1C)N(C=N2)C3C(C(C(O3)CO)OP(=O)([O...
2,suramin sodium,CC1=C(C=C(C=C1)C(=O)NC2=C3C(=CC(=CC3=C(C=C2)S(...
3,Z-DEVD-FMK,CC(C)C(C(=O)NC(CC(=O)OC)C(=O)CF)NC(=O)C(CCC(=O...
4,LLL-12,C1=CC2=C(C(=C1)O)C(=O)C3=C(C2=O)C(=CC=C3)S(=O)...


In [4]:
# Join each assay record to its SMILES string by compound name. pd.merge defaults
# to an inner join, so a compound missing from either table is silently dropped.
# The columns not needed downstream are removed in the same chain (no inplace
# mutation), leaving exactly five columns.
full_df = pd.merge(
    left=pubs_data,
    right=pubs_smiles_data,
    left_on='compound name',
    right_on='Inhibitor',
).drop(
    columns=[
        'max resp (%)',
        'efficacy (%)',
        'SARS-CoV-2 CPE EC50(μM)',
        'cytotoxicity efficacy (%)',
        'stage of compound',
        'Inhibitor',
    ]
)

# NOTE: the relabelling below is positional, so it relies on the five surviving
# columns arriving in exactly this order (name, IC50, CC50, SI, SMILES). Pandas
# raises if the number of labels does not match the number of columns.
new_col_names = ['compound_name', 'IC50_μM', 'CC50_μM', 'SI', 'smiles']
full_df.columns = new_col_names

# Keep only the rows that have a CC50 value or an SI value (or both).
# .copy() makes the result an independent frame, so later column assignments
# do not write into a slice of the merged frame (SettingWithCopyWarning).
full_df = full_df[full_df['CC50_μM'].notnull() | full_df['SI'].notnull()].copy()

full_df.head()

Unnamed: 0,compound_name,IC50_μM,CC50_μM,SI,smiles
0,walrycin B,0.26,4.25,,CN1C2=NC(=O)N(C(=O)C2=NC(=N1)C3=CC=C(C=C3)C(F)...
4,LLL-12,9.84,1.77,,C1=CC2=C(C(=C1)O)C(=O)C3=C(C2=O)C(=CC=C3)S(=O)...
6,DA-3003–1,2.63,7.74,,O=C(C(Cl)=C1NCCN2CCOCC2)C3=C(N=CC=C3)C1=O
8,fascaplysin,9.96,1.26,,C1=CC=C2C(=C1)C3=C(N2)C4=[N+](C=C3)C5=CC=CC=C5...
9,MG-115,12.7,1.13,,CCCC(C=O)NC(=O)C(CC(C)C)NC(=O)C(CC(C)C)NC(=O)O...


In [5]:
# Fill missing SI values with the ratio CC50 / IC50 and round every SI value
# (pre-existing ones included) to two decimals. Where CC50 is missing the ratio
# is NaN, so an existing SI value is left as it is.
# .assign builds a new frame instead of writing a column into `full_df`, which
# at this point is a boolean-filtered slice; that avoids chained-assignment
# warnings and keeps the cell safe to re-run.
full_df = full_df.assign(
    SI=full_df['SI'].fillna(full_df['CC50_μM'] / full_df['IC50_μM']).round(2)
)

# Put the identifying columns first, followed by the numeric measurements.
full_df = full_df[['compound_name', 'smiles', 'IC50_μM', 'CC50_μM', 'SI']]

full_df.head()

Unnamed: 0,compound_name,smiles,IC50_μM,CC50_μM,SI
0,walrycin B,CN1C2=NC(=O)N(C(=O)C2=NC(=N1)C3=CC=C(C=C3)C(F)...,0.26,4.25,16.35
4,LLL-12,C1=CC2=C(C(=C1)O)C(=O)C3=C(C2=O)C(=CC=C3)S(=O)...,9.84,1.77,0.18
6,DA-3003–1,O=C(C(Cl)=C1NCCN2CCOCC2)C3=C(N=CC=C3)C1=O,2.63,7.74,2.94
8,fascaplysin,C1=CC=C2C(=C1)C3=C(N2)C4=[N+](C=C3)C5=CC=CC=C5...,9.96,1.26,0.13
9,MG-115,CCCC(C=O)NC(=O)C(CC(C)C)NC(=O)C(CC(C)C)NC(=O)O...,12.7,1.13,0.09


In [6]:
# Persist the cleaned table; index=False leaves the row index out of the file.
full_df.to_csv(
    './processed_data/sars-cov-2_inhibitors_pubs.csv',
    index=False,
)