# Project: Decoding Molecules From Fingerprints.
## Group Members:
### Qi Chen, e-mail: gusqichr@student.gu.se
### Nils Dunlop, e-mail: gusdunlni@student.gu.se
### Francisco Alejandro Erazo Piza, e-mail: guserafr@student.gu.se
***

In [12]:
import sentencepiece as spm
import os
import zipfile
import pandas as pd
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem import PandasTools
import numpy as np
from joblib import Parallel, delayed
from rdkit.DataStructs import TanimotoSimilarity
from rdkit.Chem import Draw
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import colors as mcolors
import numpy
from PIL import Image
import subprocess
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator


In [13]:
# Set user directory
USER_DIR = os.path.expanduser('~')

# Set the project root directory
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Define the input and output directories
INPUT_DIR = os.path.join(PROJECT_ROOT, 'data/processed_chunks')


In [14]:
df = pd.read_parquet(os.path.join(INPUT_DIR, 'processed_molecule_fingerprints_part_0.parquet'))

In [15]:
df.head()

Unnamed: 0,ChEMBL ID,smiles,FingerprintBits,SparseFingerprintBits,Converted_SMILES,Device_Used
0,CHEMBL3897759,Cc1ccc2c(NCc3ccc(NC(=O)C4CCN(Cc5ccc(F)c(F)c5)C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",80 191 216 235 255 263 310 363 378 380 467 531...,Cc1ccc2c(NCc3ccc(NC(=O)C4CCN(Cc5ccc(F)c(F)c5)C...,cuda
1,CHEMBL3935110,Cc1ccc2c(NCc3ccc(NC(=O)C4CCN(Cc5cc(F)ccc5F)CC4...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",80 191 216 235 249 255 263 310 363 378 380 467...,Cc1ccc2c(NCc3ccc(NC(=O)C4CCN(Cc5cc(F)ccc5F)CC4...,cuda
2,CHEMBL3347413,N#Cc1cc(Cl)cc(Oc2cc(CCc3cccnc3)[nH]c(=O)c2Cl)c1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",24 80 102 216 240 301 305 314 322 378 437 499 ...,N#Cc1cc(Cl)cc(Oc2cc(CCc3cccnc3)[nH]c(=O)c2Cl)c1,cuda
3,CHEMBL1739263,N=C(N)c1ccc(-c2sc(-c3ccc(C(=N)N)cc3)c3c2OCCO3)cc1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",82 140 162 338 441 621 623 656 675 807 854 876...,N=C(N)c1ccc(-c2sc(-c3ccc(C(=N)N)cc3)c3c2OCCO3)cc1,cuda
4,CHEMBL3917493,O=C(C=Cc1cccs1)c1cccc(NC(=O)c2cc(-c3ccncc3)[nH...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, ...",12 14 31 63 74 119 184 191 241 378 383 391 486...,O=C(C=Cc1cccs1)c1cccc(NC(=O)c2cc(-c3ccncc3)[nH...,cuda


In [16]:
# Define a function to calculate Tanimoto similarity with debugging
def calculate_tanimoto_safe(smiles1, smiles2):
    try:
        # Skip if either SMILES is marked as 'Invalid SMILES string'
        if smiles1 == 'Invalid SMILES string' or smiles2 == 'Invalid SMILES string':
            return None
        
        # Convert SMILES to molecule objects
        mol1 = Chem.MolFromSmiles(smiles1)
        mol2 = Chem.MolFromSmiles(smiles2)
        
        # Debugging information
        if mol1 is None:
            print(f"Failed to parse SMILES1: {smiles1}")
        if mol2 is None:
            print(f"Failed to parse SMILES2: {smiles2}")
        
        # Skip if either molecule failed to parse
        if mol1 is None or mol2 is None:
            return None
        
        # Generate fingerprints for Tanimoto calculation
        morgan_gen = GetMorganGenerator(radius=2, fpSize=2048)
        fp1 = morgan_gen.GetFingerprint(mol1)
        fp2 = morgan_gen.GetFingerprint(mol2)
        
        # Debugging information
        if fp1 is None or fp2 is None:
            print(f"Failed to generate fingerprints for SMILES1: {smiles1}, SMILES2: {smiles2}")
        
        # Calculate Tanimoto similarity
        return TanimotoSimilarity(fp1, fp2)
    
    except Exception as e:
        # Print the exception to help diagnose
        print(f"Error calculating Tanimoto similarity: {e}")
        return None


In [17]:
# Apply the safe calculation function with debugging
df['Tanimoto_Similarity'] = df.apply(
    lambda row: calculate_tanimoto_safe(row['smiles'], row['Converted_SMILES']),
    axis=1
)

In [18]:
df.head()

Unnamed: 0,ChEMBL ID,smiles,FingerprintBits,SparseFingerprintBits,Converted_SMILES,Device_Used,Tanimoto_Similarity
0,CHEMBL3897759,Cc1ccc2c(NCc3ccc(NC(=O)C4CCN(Cc5ccc(F)c(F)c5)C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",80 191 216 235 255 263 310 363 378 380 467 531...,Cc1ccc2c(NCc3ccc(NC(=O)C4CCN(Cc5ccc(F)c(F)c5)C...,cuda,1.0
1,CHEMBL3935110,Cc1ccc2c(NCc3ccc(NC(=O)C4CCN(Cc5cc(F)ccc5F)CC4...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",80 191 216 235 249 255 263 310 363 378 380 467...,Cc1ccc2c(NCc3ccc(NC(=O)C4CCN(Cc5cc(F)ccc5F)CC4...,cuda,1.0
2,CHEMBL3347413,N#Cc1cc(Cl)cc(Oc2cc(CCc3cccnc3)[nH]c(=O)c2Cl)c1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",24 80 102 216 240 301 305 314 322 378 437 499 ...,N#Cc1cc(Cl)cc(Oc2cc(CCc3cccnc3)[nH]c(=O)c2Cl)c1,cuda,1.0
3,CHEMBL1739263,N=C(N)c1ccc(-c2sc(-c3ccc(C(=N)N)cc3)c3c2OCCO3)cc1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",82 140 162 338 441 621 623 656 675 807 854 876...,N=C(N)c1ccc(-c2sc(-c3ccc(C(=N)N)cc3)c3c2OCCO3)cc1,cuda,1.0
4,CHEMBL3917493,O=C(C=Cc1cccs1)c1cccc(NC(=O)c2cc(-c3ccncc3)[nH...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, ...",12 14 31 63 74 119 184 191 241 378 383 391 486...,O=C(C=Cc1cccs1)c1cccc(NC(=O)c2cc(-c3ccncc3)[nH...,cuda,1.0


In [19]:
average_similarity = df['Tanimoto_Similarity'].mean()
print(f"Average Tanimoto Similarity: {average_similarity}")

Average Tanimoto Similarity: 0.9831205878804832


In [20]:
tanimoto_matches = (df['Tanimoto_Similarity'] == 1.0).sum()
total_compounds = len(df)
print(f"Tanimoto Accuracy: {tanimoto_matches / total_compounds:.2%}")

Tanimoto Accuracy: 91.70%


In [21]:
def compare_canonical_smiles(smiles1, smiles2):
    try:
        mol1 = Chem.MolFromSmiles(smiles1)
        mol2 = Chem.MolFromSmiles(smiles2)
        
        if mol1 is None:
            return {'match': False, 'reason': f"Invalid SMILES 1: {smiles1}"}
        if mol2 is None:
            return {'match': False, 'reason': f"Invalid SMILES 2: {smiles2}"}
        
        canonical_smiles1 = Chem.MolToSmiles(mol1, isomericSmiles=True, canonical=True)
        canonical_smiles2 = Chem.MolToSmiles(mol2, isomericSmiles=True, canonical=True)
        
        if canonical_smiles1 == canonical_smiles2:
            return {'match': True, 'canonical_smiles': canonical_smiles1}
        else:
            return {'match': False, 'reason': "Canonical SMILES do not match",
                    'canonical_smiles1': canonical_smiles1,
                    'canonical_smiles2': canonical_smiles2}
    except Exception as e:
        return {'match': False, 'reason': f"Error during comparison: {str(e)}"}

In [22]:
# Compare canonical SMILES
df['Canonical_Match'] = df.apply(
    lambda row: compare_canonical_smiles(row['smiles'], row['Converted_SMILES']), 
    axis=1
)

[15:37:52] SMILES Parse Error: syntax error while parsing: Invalid
[15:37:52] SMILES Parse Error: Failed parsing SMILES 'Invalid' for input: 'Invalid'
[15:37:52] SMILES Parse Error: syntax error while parsing: Invalid
[15:37:52] SMILES Parse Error: Failed parsing SMILES 'Invalid' for input: 'Invalid'
[15:37:52] SMILES Parse Error: syntax error while parsing: Invalid
[15:37:52] SMILES Parse Error: Failed parsing SMILES 'Invalid' for input: 'Invalid'
[15:37:52] SMILES Parse Error: syntax error while parsing: Invalid
[15:37:52] SMILES Parse Error: Failed parsing SMILES 'Invalid' for input: 'Invalid'
[15:37:52] SMILES Parse Error: syntax error while parsing: Invalid
[15:37:52] SMILES Parse Error: Failed parsing SMILES 'Invalid' for input: 'Invalid'


In [23]:
df.head()

Unnamed: 0,ChEMBL ID,smiles,FingerprintBits,SparseFingerprintBits,Converted_SMILES,Device_Used,Tanimoto_Similarity,Canonical_Match
0,CHEMBL3897759,Cc1ccc2c(NCc3ccc(NC(=O)C4CCN(Cc5ccc(F)c(F)c5)C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",80 191 216 235 255 263 310 363 378 380 467 531...,Cc1ccc2c(NCc3ccc(NC(=O)C4CCN(Cc5ccc(F)c(F)c5)C...,cuda,1.0,"{'match': True, 'canonical_smiles': 'Cc1ccc2c(..."
1,CHEMBL3935110,Cc1ccc2c(NCc3ccc(NC(=O)C4CCN(Cc5cc(F)ccc5F)CC4...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",80 191 216 235 249 255 263 310 363 378 380 467...,Cc1ccc2c(NCc3ccc(NC(=O)C4CCN(Cc5cc(F)ccc5F)CC4...,cuda,1.0,"{'match': True, 'canonical_smiles': 'Cc1ccc2c(..."
2,CHEMBL3347413,N#Cc1cc(Cl)cc(Oc2cc(CCc3cccnc3)[nH]c(=O)c2Cl)c1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",24 80 102 216 240 301 305 314 322 378 437 499 ...,N#Cc1cc(Cl)cc(Oc2cc(CCc3cccnc3)[nH]c(=O)c2Cl)c1,cuda,1.0,"{'match': True, 'canonical_smiles': 'N#Cc1cc(C..."
3,CHEMBL1739263,N=C(N)c1ccc(-c2sc(-c3ccc(C(=N)N)cc3)c3c2OCCO3)cc1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",82 140 162 338 441 621 623 656 675 807 854 876...,N=C(N)c1ccc(-c2sc(-c3ccc(C(=N)N)cc3)c3c2OCCO3)cc1,cuda,1.0,"{'match': True, 'canonical_smiles': 'N=C(N)c1c..."
4,CHEMBL3917493,O=C(C=Cc1cccs1)c1cccc(NC(=O)c2cc(-c3ccncc3)[nH...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, ...",12 14 31 63 74 119 184 191 241 378 383 391 486...,O=C(C=Cc1cccs1)c1cccc(NC(=O)c2cc(-c3ccncc3)[nH...,cuda,1.0,"{'match': True, 'canonical_smiles': 'O=C(C=Cc1..."


In [24]:
canonical_matches = df['Canonical_Match'].apply(lambda x: x['match']).sum()
print(f"Canonical SMILES Match Accuracy: {canonical_matches / total_compounds:.2%}")
print(f"Number of Canonical SMILES not matching: {df['Canonical_Match'].apply(lambda x: not x['match']).sum()}")
print(f"Number of Canonical SMILES matching: {df['Canonical_Match'].apply(lambda x: x['match']).sum()}")

Canonical SMILES Match Accuracy: 88.70%
Number of Canonical SMILES not matching: 113
Number of Canonical SMILES matching: 887
