# RDKitでフィンガープリントを算出する

必要なライブラリをインポートする

In [24]:
from rdkit import Chem
from rdkit.Chem import AllChem, Draw
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

関数を定義

In [47]:
def mol2morganfp(mol, radius, nBits):
    """Compute a Morgan fingerprint bit vector for a molecule.

    Args:
        mol: RDKit molecule to fingerprint.
        radius: Forwarded as ``radius`` to
            ``AllChem.GetMorganFingerprintAsBitVect``.
        nBits: Forwarded as ``nBits`` (length of the bit vector).

    Returns:
        A ``(fingerprint, bit_info)`` tuple. ``bit_info`` is the dict handed
        to RDKit as ``bitInfo``; it starts out empty and is filled in by the
        call (per the RDKit docs: bit id -> (atom index, radius) pairs).
    """
    # RDKit writes into this dict in place, so it must exist before the call.
    bit_info = {}
    options = {"radius": radius, "nBits": nBits, "bitInfo": bit_info}
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, **options)
    return fingerprint, bit_info


def fp2img(mol, radius, nBits):
    """Render a molecule followed by one image per set fingerprint bit.

    Args:
        mol: RDKit molecule to draw.
        radius: Passed through to ``mol2morganfp``.
        nBits: Passed through to ``mol2morganfp``.

    Returns:
        A list whose first element is ``Draw.MolToImage(mol)`` and whose
        remaining elements are ``Draw.DrawMorganBit`` results, one per on-bit
        in the order reported by ``GetOnBits()``.
    """
    fingerprint, bit_info = mol2morganfp(mol, radius, nBits)
    on_bits = list(fingerprint.GetOnBits())

    # The whole-molecule image always comes first, then the per-bit drawings.
    images = [Draw.MolToImage(mol)]
    images.extend(
        Draw.DrawMorganBit(mol, bitInfo=bit_info, bitId=on_bit)
        for on_bit in on_bits
    )
    return images

サンプルデータの読み込み

In [46]:
# Load the sample SMILES strings from the "SMILES" column of the CSV.
smiles = pd.read_csv("water_solubility.csv").SMILES.tolist()
# Build molecules, parsing each SMILES exactly once (the previous form parsed
# every valid string twice: once for the filter and once for the value).
# Entries for which MolFromSmiles returns None are dropped.
mols = [mol for mol in map(Chem.MolFromSmiles, smiles) if mol is not None]

RDKit ERROR: [23:02:21] Explicit valence for atom # 6 O, 3, is greater than permitted


In [65]:
# One NumPy vector per molecule from its radius-2, 1024-bit Morgan
# fingerprint; the bit-info half of each result is not needed here.
morgan_vec = [
    np.array(fp)
    for fp, _bit_info in (mol2morganfp(mol, 2, 1024) for mol in mols)
]
# One row per molecule, one column per fingerprint position.
df_morgan = pd.DataFrame(morgan_vec)
# Put the SMILES regenerated from each molecule in the first column.
df_morgan.insert(loc=0, column="SMILES", value=list(map(Chem.MolToSmiles, mols)))

In [66]:
df_morgan

Unnamed: 0,SMILES,0,1,2,3,4,5,6,7,8,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,CC(N)=O,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,CNN,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,CC(=O)O,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,C1CCNC1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NC(=O)NO,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1284,Clc1ccc(Cl)c(-c2c(Cl)ccc(Cl)c2Cl)c1Cl,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1285,c1ccc2c(c1)ccc1cc3c(ccc4ccccc43)cc12,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1286,c1cc2ccc3ccc4ccc5cccc6c(c1)c2c3c4c56,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1287,Clc1cc(-c2c(Cl)c(Cl)c(Cl)c(Cl)c2Cl)c(Cl)c(Cl)c1Cl,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
