# **Обработка данных о соединениях-ингибиторах с сайта**
<https://www.sciencedirect.com/science/article/pii/S2211383521002483>

In [7]:
from pathlib import Path

import pandas as pd

In [8]:
# Locations of the raw activity tables and of their companion SMILES tables.
table1_filepath = './raw_data/table1.csv'
table2_filepath = './raw_data/table2.csv'
table3_filepath = './raw_data/table3.csv'
table1_smiles_filepath = './raw_data/table1_smiles.csv'
table2_smiles_filepath = './raw_data/table2_smiles.csv'
table3_smiles_filepath = './raw_data/table3_smiles.csv'

# Unified headers applied to every table once it is loaded.
new_column_names = ['inhibitor_name', 'IC50_μmol/L', 'CC50_μmol/L']
new_smiles_names = ['inhibitor_name', 'smiles']

# Read the three activity tables (default comma separator).
table1_df = pd.read_csv(table1_filepath)
table2_df = pd.read_csv(table2_filepath)
table3_df = pd.read_csv(table3_filepath)

# Read the SMILES tables; the first two are ';'-separated, the third uses ','.
table1_smiles_df = pd.read_csv(table1_smiles_filepath, sep=';')
table2_smiles_df = pd.read_csv(table2_smiles_filepath, sep=';')
table3_smiles_df = pd.read_csv(table3_smiles_filepath, sep=',')

# Keep only the name and the two measurement columns of each activity table.
# Tables 1 and 2 share the same source headers; table 3 uses its own.
source_columns = ['Inhibitor', 'Activity IC50 (μmol/L)', 'Toxicity CC50 (μmol/L)']
table1_df = table1_df[source_columns]
table2_df = table2_df[source_columns]
# NOTE(review): table 3's 'EC50' column is relabelled as 'CC50_μmol/L' below and
# later used in the CC50 / IC50 ratio — confirm that this mapping is intended.
table3_df = table3_df[['Compound', 'IC50', 'EC50']]

# Rename positionally so that all tables can be stacked later on.
for activity_table in (table1_df, table2_df, table3_df):
    activity_table.columns = new_column_names
for smiles_table in (table1_smiles_df, table2_smiles_df, table3_smiles_df):
    smiles_table.columns = new_smiles_names

table1_df.head()

Unnamed: 0,inhibitor_name,IC50_μmol/L,CC50_μmol/L
0,Salvianolic acid C (Sal-C),3.41,≥100
1,Arbidol,4.11,31.79
2,DRI-C23041,5.6,≥135
3,Cepharanthine,1.41,11.22
4,In vitro,–,Preclinical


In [9]:
def string_parser(x):
    """Convert one raw table cell into a float.

    Supported inputs:
      * a non-string value (e.g. a cell pandas already parsed as a number,
        including NaN) -> ``float(x)``;
      * ``"a~b"``  -> the part following ``~``;
      * ``"a±b"``  -> the part preceding ``±``;
      * ``"≥a"``, ``">a"``, ``"≤a"``, ``"<a"`` -> the bound ``a``;
      * a plain numeric string -> its float value.

    Args:
        x: Raw cell value (string or number).

    Returns:
        float: The parsed number.

    Raises:
        ValueError: If the text left after removing the marker is not numeric.
        TypeError: If ``x`` is neither a string nor convertible by ``float``.
    """
    # Columns containing only numbers are read by pandas as floats, and missing
    # cells arrive as float NaN; the `in` checks below would raise on them.
    if not isinstance(x, str):
        return float(x)

    text = x.strip()
    if '~' in text:
        return float(text.split('~')[1])
    if '±' in text:
        return float(text.split('±')[0])
    # Bounds are reduced to the bound value itself; '<' is handled alongside
    # the other three comparison markers.
    for marker in ('≥', '>', '≤', '<'):
        if marker in text:
            return float(text.split(marker)[1])
    return float(text)

In [10]:
# Stack the three activity tables on top of each other (row labels are kept as-is).
res_df = pd.concat([table1_df, table2_df, table3_df], axis=0)

# Drop rows whose IC50 or CC50 cell holds a placeholder instead of a measurement.
ic50_raw = res_df['IC50_μmol/L']
cc50_raw = res_df['CC50_μmol/L']
ic50_present = (ic50_raw != '–') & (ic50_raw != 'Preclinical')
cc50_present = (cc50_raw != 'Preclinical') & (cc50_raw != '–')
res_filtered_df = res_df[ic50_present & cc50_present].copy()

# Parse the textual measurements into floats, one column at a time.
for measure_column in ('IC50_μmol/L', 'CC50_μmol/L'):
    res_filtered_df[measure_column] = (
        res_filtered_df[measure_column].apply(string_parser).astype(float)
    )

# SI is the CC50 / IC50 ratio, kept with two decimal places.
si_unrounded = res_filtered_df['CC50_μmol/L'] / res_filtered_df['IC50_μmol/L']
res_filtered_df['SI'] = si_unrounded.map('{:.2f}'.format).astype(float)

# Preview the parsed rows.
res_filtered_df.head()

Unnamed: 0,inhibitor_name,IC50_μmol/L,CC50_μmol/L,SI
0,Salvianolic acid C (Sal-C),3.41,100.0,29.33
1,Arbidol,4.11,31.79,7.73
2,DRI-C23041,5.6,135.0,24.11
3,Cepharanthine,1.41,11.22,7.96
6,Abemaciclib,3.16,7.08,2.24


In [11]:
# Combine the SMILES tables and keep only rows that actually carry a SMILES string.
res_smiles = pd.concat([table1_smiles_df, table2_smiles_df, table3_smiles_df], axis=0)
res_smiles = res_smiles[res_smiles['smiles'].notna()]

# Right join on the shared name column: every SMILES row is kept, and rows with
# no matching activity record end up with NaN in the measurement columns.
full_df = pd.merge(
    left=res_filtered_df,
    right=res_smiles,
    how='right',
    on='inhibitor_name',
)

output_columns = ['inhibitor_name', 'smiles', 'IC50_μmol/L', 'CC50_μmol/L', 'SI']
full_df = full_df[output_columns]

# Report the number of missing values per column before discarding rows.
print(full_df.isnull().sum())

# Keep only the compounds for which SI is available.
full_df = full_df[full_df['SI'].notna()]
full_df.head()

inhibitor_name     0
smiles             0
IC50_μmol/L       18
CC50_μmol/L       18
SI                18
dtype: int64


Unnamed: 0,inhibitor_name,smiles,IC50_μmol/L,CC50_μmol/L,SI
0,Salvianolic acid C (Sal-C),C1=CC(=C(C=C1CC(C(=O)O)OC(=O)C=CC2=C(C(=C(C=C2...,3.41,100.0,29.33
1,Arbidol,CCOC(=O)C1=C(N(C2=CC(=C(C(=C21)CN(C)C)O)Br)C)C...,4.11,31.79,7.73
2,Cepharanthine,CN1CCC2=CC3=C(C4=C2C1CC5=CC=C(C=C5)OC6=C(C=CC(...,1.41,11.22,7.96
3,Abemaciclib,CCN1CCN(CC1)CC2=CN=C(C=C2)NC3=NC=C(C(=N3)C4=CC...,3.16,7.08,2.24
4,Osimertinib,CN1C=C(C2=CC=CC=C21)C3=NC(=NC=C3)NC4=C(C=C(C(=...,3.98,10.0,2.51


## **Выгрузка обработанных данных**

In [12]:
# Save the processed DataFrame to CSV.
# The target folder is created first: to_csv does not create missing
# directories and raises OSError when ./processed_data is absent
# (for example on a fresh clone of the repository).
output_path = Path('./processed_data/sars-cov-2_inhibitors_sciencedirect.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
full_df.to_csv(output_path, index=False)
