In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the IC50 export. The file is semicolon-delimited, and only the structure,
# measurement-type and measured-value columns are read.
ic50_columns = ["Smiles", "Standard Type", "Standard Value"]
df = pd.read_csv(
    'bace2-IC50.csv',
    sep=';',
    usecols=ic50_columns,
)
df.head()

Unnamed: 0,Smiles,Standard Type,Standard Value
0,CCC(=O)c1cc(C2(c3cccc(-c4cccnc4F)c3)N=C(N)N(C)...,IC50,70.0
1,CN1C(=O)C(c2ccsc2)(c2ccc(F)c(-c3cccnc3F)c2)N=C1N,IC50,410.0
2,CCn1cc(C2(c3ccc(F)c(-c4cccnc4F)c3)N=C(N)N(C)C2...,IC50,870.0
3,CC(C)CCNc1ccc(F)c([C@]2(C)COCC(N)=N2)c1,IC50,605.0
4,CC1(c2cccc(Nc3cccc4oc(C5CC5)nc34)c2)COCC(N)=N1,IC50,300.0


In [3]:
# Load the Ki export with the same delimiter and the same three columns as the
# IC50 table, so the two frames can be stacked row-wise.
ki_columns = ["Smiles", "Standard Type", "Standard Value"]
df1 = pd.read_csv(
    'bace2-Ki.csv',
    sep=';',
    usecols=ki_columns,
)
df1.head()

Unnamed: 0,Smiles,Standard Type,Standard Value
0,CC1(C)C(=N)N[C@](C)(c2cc(NC(=O)c3ccc(Cl)cn3)cc...,Ki,1.0
1,CC(C)c1ccc(-c2nnc(-c3cc(Cl)c([C@]4(C)CS(=O)(=O...,Ki,889.4
2,COc1cccc(-c2cccc(C3(C4CC4)NC(=N)N(C)C3=O)c2)c1,Ki,7.8
3,COCC(=O)Nc1ccc(F)c([C@]2(C)CS(=O)(=O)C3(CCN(C)...,Ki,56.0
4,CC1(C)C(=N)N[C@@]2(c3cc(NC(=O)c4ccc(Cl)cn4)ccc...,Ki,1.0


In [4]:
# Stack the IC50 rows (df) on top of the Ki rows (df1) with a fresh 0..n-1 index,
# then persist the combined table without writing the index column.
# NOTE: `df` is rebound to the combined frame, so running this cell a second time
# without re-running the IC50 load appends the Ki rows again.
frames_to_stack = [df, df1]
df = pd.concat(frames_to_stack, axis=0, ignore_index=True)
df.to_csv(path_or_buf="combined_data.csv", index=False)

In [5]:
# Read the combined table back from disk and show its (rows, columns) shape.
df2 = pd.read_csv(filepath_or_buffer="combined_data.csv")
df2.shape

(1654, 3)

In [6]:
from rdkit import Chem
from rdkit.Chem import AllChem

In [7]:
# Morgan (circular) fingerprints: one fixed-length bit vector per row of df2.
# The radius and bit length are named once so the fallback vector and the column
# labels cannot drift away from the fingerprint size.
MORGAN_RADIUS = 2
MORGAN_N_BITS = 2048

smiles = df2['Smiles'].tolist()

mgfingerprints = []
for smi in smiles:
    # Non-string values (e.g. NaN read from a blank CSV field) cannot be parsed;
    # treat them like an unparsable SMILES instead of letting the parser raise.
    mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
    if mol is not None:
        morgan_fp = AllChem.GetMorganFingerprintAsBitVect(
            mol, radius=MORGAN_RADIUS, nBits=MORGAN_N_BITS
        )
        arr = np.array(morgan_fp)
    else:
        # Keep row alignment with df2: an unparsable structure gets an all-zero vector.
        arr = np.zeros(MORGAN_N_BITS, dtype=int)
    mgfingerprints.append(arr)

# reshape keeps the matrix 2-D (0 x MORGAN_N_BITS) even when there are no rows.
mgfingerprints = np.array(mgfingerprints).reshape(-1, MORGAN_N_BITS)
mgfingerprint_df = pd.DataFrame(
    mgfingerprints,
    columns=[f"Morgan_{i}" for i in range(MORGAN_N_BITS)],
)

# Append the Morgan_* columns to the right of the original columns, row for row.
df3 = pd.concat([df2.reset_index(drop=True), mgfingerprint_df], axis=1)



In [8]:
from skfp.fingerprints import MACCSFingerprint, E3FPFingerprint
from skfp.preprocessing import MolFromSmilesTransformer, ConformerGenerator
from sklearn.pipeline import make_pipeline, make_union

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Add MACCS and E3FP fingerprint columns to df3.
# Both feature blocks are computed first and attached in a single concat, after
# removing any MACCS_*/e3fp_* columns left by a previous run of this cell, so
# re-running the cell does not duplicate columns.
smiles = df3['Smiles'].tolist()

# MACCS fingerprints, computed directly from the SMILES strings.
maccs_fp = MACCSFingerprint()
maccs_fps = maccs_fp.transform(smiles)
maccsfingerprint_df = pd.DataFrame(
    maccs_fps,
    columns=[f"MACCS_{i}" for i in range(maccs_fps.shape[1])],
)

# E3FP fingerprints: SMILES -> molecule objects -> molecules with generated
# conformers -> fingerprint matrix.
mol_transformer = MolFromSmilesTransformer()
mols = mol_transformer.transform(smiles)

conf_gen = ConformerGenerator()
mols_with_conf = conf_gen.transform(mols)

e3fp_fp = E3FPFingerprint()
e3fp_fps = e3fp_fp.transform(mols_with_conf)
e3fpfingerprint_df = pd.DataFrame(
    e3fp_fps,
    columns=[f"e3fp_{i}" for i in range(e3fp_fps.shape[1])],
)

# Keep only the columns this cell does not own, then append the fresh blocks in
# the same order as before: existing columns, MACCS_*, e3fp_*.
stale_prefixes = ("MACCS_", "e3fp_")
kept_columns = [col for col in df3.columns if not str(col).startswith(stale_prefixes)]
df3 = pd.concat(
    [df3[kept_columns].reset_index(drop=True), maccsfingerprint_df, e3fpfingerprint_df],
    axis=1,
)

[11:22:13] UFFTYPER: Unrecognized atom type: S_5+4 (26)
[11:22:14] UFFTYPER: Unrecognized atom type: S_5+4 (2)
[11:22:34] UFFTYPER: Unrecognized atom type: S_5+4 (2)
[11:22:36] UFFTYPER: Unrecognized atom type: S_5+4 (3)
[11:22:39] UFFTYPER: Unrecognized atom type: S_5+4 (2)
[11:22:39] UFFTYPER: Unrecognized atom type: S_5+4 (2)
[11:22:40] UFFTYPER: Unrecognized atom type: S_5+4 (3)
[11:22:40] UFFTYPER: Unrecognized atom type: S_5+4 (2)
[11:22:45] UFFTYPER: Unrecognized atom type: S_5+4 (5)
[11:22:46] UFFTYPER: Unrecognized atom type: S_5+4 (26)
[11:22:46] UFFTYPER: Unrecognized atom type: S_5+4 (2)
[11:22:55] UFFTYPER: Unrecognized atom type: S_5+4 (2)
[11:22:55] UFFTYPER: Unrecognized atom type: S_5+4 (2)
[11:22:55] UFFTYPER: Unrecognized atom type: S_5+4 (2)
[11:22:57] UFFTYPER: Unrecognized atom type: S_5+4 (2)
[11:22:58] UFFTYPER: Unrecognized atom type: S_5+4 (2)
[11:22:58] UFFTYPER: Unrecognized atom type: S_5+4 (3)
[11:22:58] UFFTYPER: Unrecognized atom type: S_5+4 (26)
[11:22:

In [10]:
# Persist the full feature table (original columns plus all fingerprint columns)
# without writing the row index.
df3.to_csv(path_or_buf="fingerprints_data.csv", index=False)

In [11]:
# Preview the first five rows of the assembled feature table.
df3.head(n=5)

Unnamed: 0,Smiles,Standard Type,Standard Value,Morgan_0,Morgan_1,Morgan_2,Morgan_3,Morgan_4,Morgan_5,Morgan_6,...,e3fp_1014,e3fp_1015,e3fp_1016,e3fp_1017,e3fp_1018,e3fp_1019,e3fp_1020,e3fp_1021,e3fp_1022,e3fp_1023
0,CCC(=O)c1cc(C2(c3cccc(-c4cccnc4F)c3)N=C(N)N(C)...,IC50,70.0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,CN1C(=O)C(c2ccsc2)(c2ccc(F)c(-c3cccnc3F)c2)N=C1N,IC50,410.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,CCn1cc(C2(c3ccc(F)c(-c4cccnc4F)c3)N=C(N)N(C)C2...,IC50,870.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CC(C)CCNc1ccc(F)c([C@]2(C)COCC(N)=N2)c1,IC50,605.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CC1(c2cccc(Nc3cccc4oc(C5CC5)nc34)c2)COCC(N)=N1,IC50,300.0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [12]:
# Sanity check: (rows, columns) of the final feature table. The row count should
# match df2, since every fingerprint block is appended column-wise.
df3.shape

(1654, 3241)