In [4]:
from rdkit.Chem import AllChem, Descriptors, Draw
from rdkit import Chem
from rdkit import DataStructs
import numpy as np
import os
import pickle
from openbabel import openbabel as ob
from openbabel import pybel as pb

In [5]:
def generate_morgan_fp(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 3, nBits=2048)  # Morgan fingerprint radius=2, 1024-bit vector
    return fp

In [6]:
def calculate_tanimoto_similarity(fp1, fp2):
    if fp1 is None or fp2 is None:
        return None
    similarity = DataStructs.TanimotoSimilarity(fp1, fp2)
    return similarity



In [None]:
# zinc fingerprint
zinc_dir = 'zinc'
zinc_sub_dir = os.listdir(zinc_dir)

f_out = open(os.path.join('anshul', 'fingerprints', 'morgan', 'zinc.txt'), 'w')

for sub_dir_name in zinc_sub_dir:
    files_name = os.listdir(os.path.join(zinc_dir, sub_dir_name))

    for f_name in files_name:
        f = open(os.path.join(zinc_dir, sub_dir_name, f_name), 'r')
        lines = f.readlines()

        for i in range(len(lines)):
            line = lines[i]
            if len(line.split()) > 1:
                try: 
                    fp = generate_morgan_fp(line.split()[0])
                    if fp != None:
                        f_out.write(f"{sub_dir_name}_{f_name}_{i}:::{pickle.dumps(fp)}\n")
                except Exception as e:
                    print(e)

        f_out.flush()

f_out.close()         



In [7]:
# drugbank fingerprint

db_dir = 'drugs21'
db_files = os.listdir(db_dir)
db_files = [name for name in db_files if name.endswith("mol2")]



result = []

for f_name in db_files:
    genrtor = pb.readfile('mol2', os.path.join(db_dir, f_name))
    count = 0

    for mol in genrtor:
        try: 
            fp = generate_morgan_fp(mol.write('smi'))
            if fp != None:
                result.append((f"{f_name}_{count}", fp))
            
            count += 1
        except Exception as e:
            count += 1
            print(e)


f_out = open(os.path.join('anshul', 'fingerprints', 'morgan', 'drugbank.txt'), 'wb')
f_out.write(pickle.dumps(result))
f_out.close()         



[11:30:51] Explicit valence for atom # 4 N, 4, is greater than permitted
[11:30:51] Explicit valence for atom # 12 N, 4, is greater than permitted
[11:30:51] Explicit valence for atom # 7 N, 4, is greater than permitted
  Failed to kekulize aromatic bonds in MOL2 file (title is 4628823)

[11:30:51] Explicit valence for atom # 19 N, 4, is greater than permitted
[11:30:51] Explicit valence for atom # 9 N, 4, is greater than permitted
[11:30:51] Explicit valence for atom # 3 N, 4, is greater than permitted
[11:30:51] Can't kekulize mol.  Unkekulized atoms: 4 5 6 8 10
  Failed to kekulize aromatic bonds in MOL2 file (title is 5168640)

[11:30:51] Explicit valence for atom # 19 N, 4, is greater than permitted
[11:30:51] Explicit valence for atom # 16 N, 4, is greater than permitted
[11:30:51] Explicit valence for atom # 29 N, 4, is greater than permitted
[11:30:51] Can't kekulize mol.  Unkekulized atoms: 30 31 32 33 34
  Failed to kekulize aromatic bonds in MOL2 file (title is 165491)

[11:

In [9]:
# pdb(rcsb) fingerprint

pdb_dir = 'pdb/ligands_pdbqt'
pdb_files = os.listdir(pdb_dir)
pdb_files = [name for name in pdb_files if name.endswith("pdbqt")]

result = []
for f_name in pdb_files:
    genrtor = pb.readfile('pdbqt', os.path.join(pdb_dir, f_name))
    count = 0

    for mol in genrtor:
        try: 
            fp = generate_morgan_fp(mol.write('smi'))
            if fp != None:
                result.append((f"{f_name}_{count}", fp))            
            count += 1
        except Exception as e:
            count += 1
            print(e)



f_out = open(os.path.join('anshul', 'fingerprints', 'morgan', 'pdb.txt'), 'wb')
f_out.write(pickle.dumps(result))
f_out.close()         



[11:33:41] Explicit valence for atom # 1 N, 4, is greater than permitted
[11:33:41] Explicit valence for atom # 10 N, 4, is greater than permitted
[11:33:41] Explicit valence for atom # 27 N, 4, is greater than permitted
[11:33:41] Explicit valence for atom # 6 N, 4, is greater than permitted
[11:33:41] Explicit valence for atom # 33 N, 4, is greater than permitted
[11:33:41] Explicit valence for atom # 6 N, 4, is greater than permitted
[11:33:41] Explicit valence for atom # 8 N, 4, is greater than permitted
[11:33:41] Explicit valence for atom # 2 N, 4, is greater than permitted
[11:33:41] Explicit valence for atom # 12 N, 4, is greater than permitted
  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders (title is E9D)

[11:33:41] Explicit valence for atom # 9 N, 4, is greater than permitted
[11:33:41] Explicit valence for atom # 0 C, 5, is greater than permitted
[11:33:41] Explicit valence for atom # 4 C, 5, is greater than permitted
  Failed to kekulize aromatic bonds in 

In [None]:
# 

In [20]:
x1 = generate_morgan_fp("S1(=O)(=O)CC(CC1)NCCn1c(c2cn(nc2)CC)cnc1S1(=O)(=O)CC(CC1)NCCn1c(c2cn(nc2)CC)cnc1S1(=O)(=O)CC(CC1)NCCn1c(c2cn(nc2)CC)cnc1S1(=O)(=O)CC(CC1)NCCn1c(c2cn(nc2)CC)cnc1S1(=O)(=O)CC(CC1)NCCn1c(c2cn(nc2)CC)cnc1S1(=O)(=O)CC(CC1)NCCn1c(c2cn(nc2)CC)cnc1S1(=O)(=O)CC(CC1)NCCn1c(c2cn(nc2)CC)cnc1S1(=O)(=O)CC(CC1)NCCn1c(c2cn(nc2)CC)cnc1S1(=O)(=O)CC(CC1)NCCn1c(c2cn(nc2)CC)cnc1S1(=O)(=O)CC(CC1)NCCn1c(c2cn(nc2)CC)cnc1S1(=O)(=O)CC(CC1)NCCn1c(c2cn(nc2)CC)cnc1S1(=O)(=O)CC(CC1)NCCn1c(c2cn(nc2)CC)cnc1S1(=O)(=O)CC(CC1)NCCn1c(c2cn(nc2)CC)cnc1S1(=O)(=O)CC(CC1)NCCn1c(c2cn(nc2)CC)cnc1S1(=O)(=O)CC(CC1)NCCn1c(c2cn(nc2)CC)cnc1S1(=O)(=O)CC(CC1)NCCn1c(c2cn(nc2)CC)cnc1S1(=O)(=O)CC(CC1)NCCn1c(c2cn(nc2)CC)cnc1")
#x2 = generate_morgan_fp("N1C(CC(CC1(C)C)CC(=O)O)(C)CN1C(CC(CC1(C)C)CC(=O)O)(C)CN1C(CC(CC1(C)C)CC(=O)O)(C)CN1C(CC(CC1(C)C)CC(=O)O)(C)CN1C(CC(CC1(C)C)CC(=O)O)(C)CN1C(CC(CC1(C)C)CC(=O)O)(C)CN1C(CC(CC1(C)C)CC(=O)O)(C)CN1C(CC(CC1(C)C)CC(=O)O)(C)CN1C(CC(CC1(C)C)CC(=O)O)(C)CN1C(CC(CC1(C)C)CC(=O)O)(C)CN1C(CC(CC1(C)C)CC(=O)O)(C)CN1C(CC(CC1(C)C)CC(=O)O)(C)CN1C(CC(CC1(C)C)CC(=O)O)(C)CN1C(CC(CC1(C)C)CC(=O)O)(C)CN1C(CC(CC1(C)C)CC(=O)O)(C)CN1C(CC(CC1(C)C)CC(=O)O)(C)CN1C(CC(CC1(C)C)CC(=O)O)(C)CN1C(CC(CC1(C)C)CC(=O)O)(C)CN1C(CC(CC1(C)C)CC(=O)O)(C)CN1C(CC(CC1(C)C)CC(=O)O)(C)CN1C(CC(CC1(C)C)CC(=O)O)(C)CN1C(CC(CC1(C)C)CC(=O)O)(C)CN1C(CC(CC1(C)C)CC(=O)O)(C)CN1C(CC(CC1(C)C)CC(=O)O)(C)CN1C(CC(CC1(C)C)CC(=O)O)(C)CN1C(CC(CC1(C)C)CC(=O)O)(C)CN1C(CC(CC1(C)C)CC(=O)O)(C)CN1C(CC(CC1(C)C)CC(=O)O)(C)C")

[14:12:57] Explicit valence for atom # 22 S, 7, is greater than permitted


In [25]:
x2 = generate_morgan_fp("N1C(CC(CC1(C)C)CC(=O)O)(C)C")

In [26]:
sum(x2.ToList())

23

In [28]:
pickle.dumps(x2)

b'\x80\x04\x95g\x00\x00\x00\x00\x00\x00\x00\x8c\x1erdkit.DataStructs.cDataStructs\x94\x8c\x0fExplicitBitVect\x94\x93\x94C%\xe0\xff\xff\xff\x00\x04\x00\x00\x17\x00\x00\x00:\x06\x04V\x12~\xbe\x8c\x86r1\x00z\x0c*\x18(\x10\xa8h@@\x9a\x1c\x08\x94\x85\x94R\x94}\x94\x85\x94b.'

In [30]:
import time

t1 = time.time()

for i in range(100000):
    #calculate_tanimoto_similarity(x1, x2)
    generate_morgan_fp("S1(=O)(=O)CC(CC1)NCCn1c(c2cn(nc2)CC)cnc1")

print(time.time() - t1)

39.952165842056274


In [9]:
l = [0, 1, 0, 1]

print(str(l))

[0, 1, 0, 1]


In [7]:
os.listdir()

['anshul',
 'drugs21',
 'zinc_smiles',
 'realdb',
 'pdb',
 'pubchem',
 'impact',
 'prateek',
 'anshul_2']

In [19]:
sum(x1.ToList())

49

Exception: target argument is not specified