In [1]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem, MACCSkeys
from rdkit.Chem import rdFingerprintGenerator

In [2]:
# Load the serialized model from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files that come from a trusted source.
model = joblib.load("./sars-cov-2_SI_predictive_model.joblib")

# Read the input table; the bare name on the last line displays it in the cell output.
data = pd.read_csv(filepath_or_buffer="./input/data.csv")
data

Unnamed: 0,IC50 mg/ml,CC50 mg/ml,SI,Cell,Virus,Strain,DOI,smiles
0,0.015,0.1,6.67,Vero,Ebola,Zaire,10.1038/s41586-020-2577-6,C1=CC=C(C=C1)C(=O)O
1,0.025,0.15,6.0,A549,SARS-CoV-2,Wuhan,10.1007/s11427-021-1911-6,C1CCC(CC1)C(=O)O
2,0.035,0.3,8.57,Vero,HIV-1,NL4-3,10.1038/s41564-020-0701-7,C1=CC=C(C=C1)O
3,0.05,0.25,5.0,A549,Zika,MR766,10.1093/cid/ciaa345,C1=CC=C(C=C1)CC(=O)O
4,0.03,0.18,6.0,Vero,Influenza,H1N1,10.1093/jac/dkz177,C1CC(CCC1)O
5,0.045,0.22,4.89,A549,MERS-CoV,EMC/2012,10.1016/j.cell.2020.09.011,C1=CC(=CC=C1)N
6,0.02,0.12,6.0,Vero,Dengue,DENV-2,10.1016/j.virol.2021.03.001,C1=CC=C(C=C1)Cl
7,0.04,0.26,6.5,A549,SARS-CoV,Frankfurt,10.1038/s41587-020-0603-8,C1CCC(CC1)N
8,0.033,0.2,6.06,Vero,H1N1,Puerto Rico,10.1016/j.cmet.2020.09.018,C1=CC=C(C=C1)F
9,0.05,0.28,5.6,A549,Chikungunya,La Reunion,10.1016/j.jmb.2020.05.017,C1=CC=C(C=C1)Br


In [3]:
def extract_morgan_fingerprint(smiles, radius=2, nBits=2048):
    """Compute a Morgan fingerprint for a SMILES string as a NumPy array.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.
    radius : int, optional
        Radius passed to the Morgan fingerprint generator (default 2).
    nBits : int, optional
        Fingerprint length, forwarded to the generator as ``fpSize``
        (default 2048).

    Returns
    -------
    numpy.ndarray or None
        The bit-vector fingerprint converted with ``np.array``, or ``None``
        when RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals an unparsable SMILES by returning None.
        return None

    # Use the explicitly imported rdFingerprintGenerator module (the same one
    # used for the RDKit path fingerprints in this notebook) instead of
    # reaching the factory through the AllChem namespace.
    fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=nBits)

    return np.array(fpgen.GetFingerprint(mol))

def extract_features(smiles):
    """Compute descriptors and an RDKit fingerprint for one SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    dict or None
        ``None`` when RDKit cannot parse ``smiles``; otherwise a dict with the
        keys ``'MolecularWeight'``, ``'LogP'``, ``'HBD'``, ``'HBA'``,
        ``'TPSA'`` (values from the matching ``Descriptors`` functions) and
        ``'fingerprint'`` (the bit vector from an RDKit fingerprint generator
        built with ``maxPath=5``).
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return None

    features = {
        'MolecularWeight': Descriptors.MolWt(molecule),   # molecular weight
        'LogP': Descriptors.MolLogP(molecule),            # partition coefficient
        'HBD': Descriptors.NumHDonors(molecule),          # hydrogen-bond donors
        'HBA': Descriptors.NumHAcceptors(molecule),       # hydrogen-bond acceptors
        'TPSA': Descriptors.TPSA(molecule),               # topological polar surface area
    }

    # The fingerprint is added last and returned as the generator's native
    # bit-vector object, not as a NumPy array.
    fp_generator = rdFingerprintGenerator.GetRDKitFPGenerator(maxPath=5)
    features['fingerprint'] = fp_generator.GetFingerprint(molecule)

    return features

def dataframe_processing(df):
    """Build the model input matrix from the ``smiles`` column of ``df``.

    Only the ``smiles`` column is read; any other columns (assay values,
    metadata, ...) are ignored and do not need to be present, so the function
    works on prediction inputs that carry no labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``smiles`` column holding one SMILES string per row.

    Returns
    -------
    numpy.ndarray
        One row per input row, in input order: the fingerprint bits returned
        by ``extract_features`` followed by the five descriptor columns
        ``MolecularWeight, LogP, HBD, HBA, TPSA``.

    Raises
    ------
    KeyError
        If ``df`` has no ``smiles`` column.
    ValueError
        If any SMILES is missing or cannot be parsed. Rows are never dropped
        silently, because the predictions are matched to the input by
        position.
    """
    descriptor_keys = ('MolecularWeight', 'LogP', 'HBD', 'HBA', 'TPSA')

    features = []
    invalid_rows = []
    for position, smiles in enumerate(df['smiles']):
        # Missing values (None/NaN) are not strings and must not reach RDKit.
        row_features = extract_features(smiles) if isinstance(smiles, str) else None
        if row_features is None:
            invalid_rows.append((position, smiles))
        else:
            features.append(row_features)

    if invalid_rows:
        preview = invalid_rows[:10]
        raise ValueError(
            f"Cannot build features: {len(invalid_rows)} SMILES value(s) are "
            f"missing or unparsable (row position, value): {preview}"
        )

    # extract_features already computed the fingerprint for each molecule,
    # so the SMILES are parsed and fingerprinted exactly once per row.
    fingerprints = np.array([row['fingerprint'] for row in features])
    properties = np.array([[row[key] for key in descriptor_keys] for row in features])

    return np.hstack((fingerprints, properties))
    
    

In [4]:
# Turn the loaded table into the model's input matrix, then predict.
# `y` is consumed by the next cell, which writes it to disk.
X = dataframe_processing(data)
y = model.predict(X)

In [5]:
# Wrap the predictions in a single-column DataFrame.
df = pd.DataFrame(y, columns=['SI_prediction'])

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
output_path = Path('./output/predictions.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

# Save the predictions to a CSV file without the index column.
df.to_csv(output_path, index=False)