In [4]:
import pandas as pd
import numpy as np

In [5]:
# Load the semicolon-delimited 'bace2-IC50.csv' file, keeping only the
# structure column and the two activity columns; then preview the first rows.
ic50_columns = ["Smiles", "Standard Type", "Standard Value"]
df = pd.read_csv(
    'bace2-IC50.csv',
    sep=';',
    usecols=ic50_columns,
)
df.head()

Unnamed: 0,Smiles,Standard Type,Standard Value
0,CCC(=O)c1cc(C2(c3cccc(-c4cccnc4F)c3)N=C(N)N(C)...,IC50,70.0
1,CN1C(=O)C(c2ccsc2)(c2ccc(F)c(-c3cccnc3F)c2)N=C1N,IC50,410.0
2,CCn1cc(C2(c3ccc(F)c(-c4cccnc4F)c3)N=C(N)N(C)C2...,IC50,870.0
3,CC(C)CCNc1ccc(F)c([C@]2(C)COCC(N)=N2)c1,IC50,605.0
4,CC1(c2cccc(Nc3cccc4oc(C5CC5)nc34)c2)COCC(N)=N1,IC50,300.0


In [6]:
# Load the semicolon-delimited 'bace2-Ki.csv' file with the same three
# columns as the IC50 table so the two frames can be stacked later.
ki_columns = ["Smiles", "Standard Type", "Standard Value"]
df1 = pd.read_csv(
    'bace2-Ki.csv',
    sep=';',
    usecols=ki_columns,
)
df1.head()

Unnamed: 0,Smiles,Standard Type,Standard Value
0,CC1(C)C(=N)N[C@](C)(c2cc(NC(=O)c3ccc(Cl)cn3)cc...,Ki,1.0
1,CC(C)c1ccc(-c2nnc(-c3cc(Cl)c([C@]4(C)CS(=O)(=O...,Ki,889.4
2,COc1cccc(-c2cccc(C3(C4CC4)NC(=N)N(C)C3=O)c2)c1,Ki,7.8
3,COCC(=O)Nc1ccc(F)c([C@]2(C)CS(=O)(=O)C3(CCN(C)...,Ki,56.0
4,CC1(C)C(=N)N[C@@]2(c3cc(NC(=O)c4ccc(Cl)cn4)ccc...,Ki,1.0


In [7]:
# Stack the IC50 rows (df) on top of the Ki rows (df1) with a fresh 0..n-1
# index, then persist the combined table without writing the index column.
# NOTE: this rebinds `df`, so re-running this cell without re-running the two
# load cells appends df1 a second time.
frames_to_stack = [df, df1]
df = pd.concat(frames_to_stack, ignore_index=True)
df.to_csv("combined_data.csv", index=False)

In [8]:
# Read the combined table back from disk and display its (rows, columns).
combined_csv_path = "combined_data.csv"
df2 = pd.read_csv(combined_csv_path)
df2.shape

(1654, 3)

In [9]:
from rdkit import Chem
from rdkit.Chem import AllChem, MACCSkeys, RDKFingerprint

In [10]:
USE_MORGAN = True
MORGAN_RADIUS = 2     # radius argument passed to the Morgan fingerprint
MORGAN_NBITS = 2048   # length of the bit vector; one output column per bit

# The column names are identical for every molecule, so build them once
# instead of formatting 2048 strings per row.
morgan_columns = [f'Morgan_{j}' for j in range(MORGAN_NBITS)]

# One dict per successfully parsed molecule: {"SMILES": ..., "Morgan_0": 0/1, ...}
fingerprints_list = []

# Only the SMILES column is needed, so iterate over it directly rather than
# building a full row Series per record with iterrows().
for smiles in df2['Smiles']:
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit could not parse this SMILES, so the molecule is skipped.
        # NOTE: every skipped molecule makes fingerprints_list shorter than
        # df2, so the result must be joined back on the "SMILES" key, not by
        # row position.
        continue

    fp_data = {"SMILES": smiles}

    if USE_MORGAN:
        morgan_fp = AllChem.GetMorganFingerprintAsBitVect(
            mol, radius=MORGAN_RADIUS, nBits=MORGAN_NBITS
        )
        # Start with every bit at 0 (in column order) and flip only the set
        # bits; this avoids 2048 individual bit look-ups per molecule.
        fp_data.update(dict.fromkeys(morgan_columns, 0))
        for j in morgan_fp.GetOnBits():
            fp_data[morgan_columns[j]] = 1

    fingerprints_list.append(fp_data)





In [11]:
# Turn the per-molecule dicts into a table: one row per dict, one column per
# key ("SMILES" plus the Morgan_* bit columns).
fingerprints_df = pd.DataFrame(data=fingerprints_list)

In [12]:
# Attach the fingerprints to the activity rows by SMILES key rather than by
# row position. The fingerprint loop skips SMILES that RDKit cannot parse, so
# fingerprints_df may have fewer rows than df2; a positional
# pd.concat(axis=1) would then pair activity values with the wrong
# molecule's fingerprint without raising any error.
#
# The fingerprint is computed from the SMILES string alone, so repeated
# SMILES carry identical bit columns and can be de-duplicated before joining.
fingerprints_unique = fingerprints_df.drop_duplicates(subset="SMILES")

merge_df = df2.merge(
    fingerprints_unique,
    left_on="Smiles",
    right_on="SMILES",
    how="inner",              # rows without a fingerprint are dropped; df2 order is kept
    validate="many_to_one",   # guards against duplicated fingerprint keys
)

# The structure strings are not needed once the bit columns are attached.
merge_df = merge_df.drop(columns=["SMILES", "Smiles"])
merge_df.to_csv("fingerprints_data.csv", index=False)

In [13]:
# Preview the first five rows of the merged activity + fingerprint table.
merge_df.head(n=5)

Unnamed: 0,Standard Type,Standard Value,Morgan_0,Morgan_1,Morgan_2,Morgan_3,Morgan_4,Morgan_5,Morgan_6,Morgan_7,...,Morgan_2038,Morgan_2039,Morgan_2040,Morgan_2041,Morgan_2042,Morgan_2043,Morgan_2044,Morgan_2045,Morgan_2046,Morgan_2047
0,IC50,70.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,IC50,410.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,IC50,870.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,IC50,605.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,IC50,300.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Display the (rows, columns) of the merged table as a quick size check.
merge_df.shape

(1654, 2050)