# Project: Decoding Molecules From Fingerprints.
## Group Members:
### Qi Chen, e-mail: gusqichr@student.gu.se
### Nils Dunlop, e-mail: gusdunlni@student.gu.se
### Francisco Alejandro Erazo Piza, e-mail: guserafr@student.gu.se
***

In [48]:
import os
import pandas as pd
from rdkit.Chem import PandasTools
import numpy as np
from joblib import Parallel, delayed
from rdkit.DataStructs import TanimotoSimilarity
from rdkit.Chem import Draw
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import colors as mcolors
import numpy
from PIL import Image
import subprocess
from rdkit.Chem import AllChem
from typing import List, Tuple
import subprocess
from concurrent.futures import ThreadPoolExecutor, as_completed
from rdkit import Chem
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from sklearn.manifold import MDS
from sklearn.manifold import TSNE
# Lift the column-width limit (None = unlimited) so long SMILES strings and
# fingerprint lists are not truncated when frames are displayed.
pd.set_option('display.max_colwidth', None)
# None leaves the overall display width for pandas to determine.
pd.set_option('display.width', None)

### Define directories
***

In [49]:
# Home directory of the current user
USER_DIR = os.path.expanduser('~')

# The project root is the parent of the current working directory
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Directory holding the bit-flipping input data
INPUT_DIR = os.path.join(PROJECT_ROOT, 'data/bit_flipping_nn')



### Load Parecoxib, Celecoxib, Cimicoxib, Deracoxib, and Anitrazafen Data
***

In [50]:
def _load_flipped_frame(compound_label):
    """Read '<compound>_flipped_df.parquet' from INPUT_DIR, print the frame's shape and return it."""
    file_name = f'{compound_label.lower()}_flipped_df.parquet'
    frame = pd.read_parquet(os.path.join(INPUT_DIR, file_name))
    print(f"{compound_label} shape: {frame.shape}")
    return frame


# Load the bit-flipping results, one parquet file per compound
parecoxib_df = _load_flipped_frame('Parecoxib')
celecoxib_df = _load_flipped_frame('Celecoxib')
cimicoxib_df = _load_flipped_frame('Cimicoxib')
deracoxib_df = _load_flipped_frame('Deracoxib')
anitrazafen_df = _load_flipped_frame('Anitrazafen')


Parecoxib shape: (2048, 9)
Celecoxib shape: (2048, 9)
Cimicoxib shape: (2048, 9)
Deracoxib shape: (2048, 9)
Anitrazafen shape: (2048, 9)


In [51]:
# Preview the first rows of the Parecoxib frame (head() defaults to five rows)
parecoxib_df.head()

Unnamed: 0,title,SMILES,FingerprintBits,SparseFingerprintBits,FingerprintBitsFlipped,SparseFingerprintBitsFlipped,FlippedBitPosition,Generated_SMILES,Device_Used
0,Parecoxib,CCC(=O)NS(=O)(=O)c1ccc(cc1)c2c(C)onc2c3ccccc3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",41 80 179 248 294 307 319 323 331 350 361 378 389 461 471 476 502 624 650 656 715 736 807 835 883 896 898 1045 1057 1088 1141 1152 1160 1199 1366 1380 1476 1542 1722 1746 1747 1750 1873 1917,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]","[0, 41, 80, 179, 248, 294, 307, 319, 323, 331, 350, 361, 378, 389, 461, 471, 476, 502, 624, 650, 656, 715, 736, 807, 835, 883, 896, 898, 1045, 1057, 1088, 1141, 1152, 1160, 1199, 1366, 1380, 1476, 1542, 1722, 1746, 1747, 1750, 1873, 1917]",0,CCC(=O)NS(=O)(=O)c1ccc(-c2c(-c3ccccc3)noc2C)cc1,cuda
1,Parecoxib,CCC(=O)NS(=O)(=O)c1ccc(cc1)c2c(C)onc2c3ccccc3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",41 80 179 248 294 307 319 323 331 350 361 378 389 461 471 476 502 624 650 656 715 736 807 835 883 896 898 1045 1057 1088 1141 1152 1160 1199 1366 1380 1476 1542 1722 1746 1747 1750 1873 1917,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]","[1, 41, 80, 179, 248, 294, 307, 319, 323, 331, 350, 361, 378, 389, 461, 471, 476, 502, 624, 650, 656, 715, 736, 807, 835, 883, 896, 898, 1045, 1057, 1088, 1141, 1152, 1160, 1199, 1366, 1380, 1476, 1542, 1722, 1746, 1747, 1750, 1873, 1917]",1,CCC(=O)NS(=O)(=O)c1ccc(-c2c(-c3ccccc3)noc2C)cc1,cuda
2,Parecoxib,CCC(=O)NS(=O)(=O)c1ccc(cc1)c2c(C)onc2c3ccccc3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",41 80 179 248 294 307 319 323 331 350 361 378 389 461 471 476 502 624 650 656 715 736 807 835 883 896 898 1045 1057 1088 1141 1152 1160 1199 1366 1380 1476 1542 1722 1746 1747 1750 1873 1917,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]","[2, 41, 80, 179, 248, 294, 307, 319, 323, 331, 350, 361, 378, 389, 461, 471, 476, 502, 624, 650, 656, 715, 736, 807, 835, 883, 896, 898, 1045, 1057, 1088, 1141, 1152, 1160, 1199, 1366, 1380, 1476, 1542, 1722, 1746, 1747, 1750, 1873, 1917]",2,CCC(=O)NS(=O)(=O)c1ccc(-c2c(-c3ccccc3)noc2C)cc1,cuda
3,Parecoxib,CCC(=O)NS(=O)(=O)c1ccc(cc1)c2c(C)onc2c3ccccc3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",41 80 179 248 294 307 319 323 331 350 361 378 389 461 471 476 502 624 650 656 715 736 807 835 883 896 898 1045 1057 1088 1141 1152 1160 1199 1366 1380 1476 1542 1722 1746 1747 1750 1873 1917,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]","[3, 41, 80, 179, 248, 294, 307, 319, 323, 331, 350, 361, 378, 389, 461, 471, 476, 502, 624, 650, 656, 715, 736, 807, 835, 883, 896, 898, 1045, 1057, 1088, 1141, 1152, 1160, 1199, 1366, 1380, 1476, 1542, 1722, 1746, 1747, 1750, 1873, 1917]",3,CCC(=O)NS(=O)(=O)c1ccc(-c2c(-c3ccccc3)noc2C)cc1,cuda
4,Parecoxib,CCC(=O)NS(=O)(=O)c1ccc(cc1)c2c(C)onc2c3ccccc3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",41 80 179 248 294 307 319 323 331 350 361 378 389 461 471 476 502 624 650 656 715 736 807 835 883 896 898 1045 1057 1088 1141 1152 1160 1199 1366 1380 1476 1542 1722 1746 1747 1750 1873 1917,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]","[4, 41, 80, 179, 248, 294, 307, 319, 323, 331, 350, 361, 378, 389, 461, 471, 476, 502, 624, 650, 656, 715, 736, 807, 835, 883, 896, 898, 1045, 1057, 1088, 1141, 1152, 1160, 1199, 1366, 1380, 1476, 1542, 1722, 1746, 1747, 1750, 1873, 1917]",4,CCC(=O)NS(=O)(=O)c1ccc(-c2c(-c3ccccc3)noc2C)cc1,cuda


### Validate generated SMILES
***

In [52]:
# Function to validate generated SMILES
def smiles_to_mol(smiles):
    """
    Parse a SMILES string with RDKit and report whether parsing succeeded.

    Args:
        smiles: SMILES string to parse. Non-string values (e.g. None or a
            float NaN coming from a missing DataFrame cell) are treated as
            invalid instead of being handed to RDKit.

    Returns:
        tuple: (is_valid, mol). `mol` is whatever Chem.MolFromSmiles returned
        (None when RDKit could not parse the string); `is_valid` is True
        exactly when `mol` is not None.
    """
    # A missing cell is not a SMILES string; report it as invalid rather than
    # letting the RDKit call fail on a non-string argument.
    if not isinstance(smiles, str):
        return False, None

    mol = Chem.MolFromSmiles(smiles)
    return mol is not None, mol

In [53]:
# Add a validity flag and the parsed molecule to every compound frame
for _frame in (parecoxib_df, celecoxib_df, cimicoxib_df, deracoxib_df, anitrazafen_df):
    # Each entry is an (is_valid, mol) pair; zip(*...) splits the pairs into two columns
    _parsed = _frame['Generated_SMILES'].apply(smiles_to_mol)
    _frame['Valid_SMILES'], _frame['Molecule'] = zip(*_parsed)

[19:21:27] SMILES Parse Error: syntax error while parsing: Invalid
[19:21:27] SMILES Parse Error: Failed parsing SMILES 'Invalid' for input: 'Invalid'


In [54]:
def calculate_valid_smiles_percentage(df_dict):
    """
    Summarise the 'Valid_SMILES' column of each frame.

    Args:
        df_dict: Mapping of compound name to a DataFrame that has a
            'Valid_SMILES' column.

    Returns:
        pandas.DataFrame with one row per compound and the columns
        'Compound', 'Valid_SMILES' (column sum) and 'Percentage'
        (column mean multiplied by 100).
    """
    summary_rows = [
        {
            'Compound': name,
            'Valid_SMILES': frame['Valid_SMILES'].sum(),
            'Percentage': frame['Valid_SMILES'].mean() * 100,
        }
        for name, frame in df_dict.items()
    ]
    return pd.DataFrame(summary_rows)

In [55]:
# Map each compound name to its frame, then summarise SMILES validity per compound
_compound_names = ('Parecoxib', 'Celecoxib', 'Cimicoxib', 'Deracoxib', 'Anitrazafen')
_compound_frames = (parecoxib_df, celecoxib_df, cimicoxib_df, deracoxib_df, anitrazafen_df)
df_dict = dict(zip(_compound_names, _compound_frames))

valid_smiles_percentage = calculate_valid_smiles_percentage(df_dict)
valid_smiles_percentage

Unnamed: 0,Compound,Valid_SMILES,Percentage
0,Parecoxib,2048,100.0
1,Celecoxib,2048,100.0
2,Cimicoxib,2048,100.0
3,Deracoxib,2047,99.951172
4,Anitrazafen,2048,100.0


### Tanimoto Similarity
***

In [56]:
def calculate_tanimoto_safe(fp_bits1, fp_bits2):
    """
    Calculate Tanimoto similarity between two fingerprint bit vectors.

    Every entry is interpreted as an on/off bit: any non-zero value counts as
    "set". This keeps the result correct for float vectors and for vectors
    holding values other than 0/1, where a raw bitwise AND/OR would either
    fail or miscount.

    Args:
        fp_bits1: First fingerprint (array-like of numeric or boolean values).
        fp_bits2: Second fingerprint, same length as `fp_bits1`.

    Returns:
        float: |A AND B| / |A OR B|; 0.0 (with a printed warning) when no bit
        is set in either vector.
        None: when the lengths differ or the inputs cannot be processed; a
        message is printed in both cases.
    """
    try:
        # Reduce both inputs to boolean masks (non-zero -> True)
        bits_a = np.asarray(fp_bits1).astype(bool)
        bits_b = np.asarray(fp_bits2).astype(bool)

        # Verify that both fingerprints have the same length
        if len(bits_a) != len(bits_b):
            print(f"Fingerprint lengths don't match: {len(bits_a)} vs {len(bits_b)}")
            return None

        # Bits set in both vectors (AND) and in at least one vector (OR)
        intersection = np.count_nonzero(bits_a & bits_b)
        union = np.count_nonzero(bits_a | bits_b)

        # Avoid division by zero
        if union == 0:
            print("Warning: Union of fingerprints is zero")
            return 0.0

        return float(intersection / union)

    except Exception as e:
        # Deliberately broad: the "safe" contract is to report the problem and
        # return None so that a row-wise DataFrame.apply keeps going.
        print(f"Error calculating Tanimoto similarity: {e}")
        return None

In [57]:
# Display the Parecoxib frame, now including the 'Valid_SMILES' and 'Molecule' columns
parecoxib_df

Unnamed: 0,title,SMILES,FingerprintBits,SparseFingerprintBits,FingerprintBitsFlipped,SparseFingerprintBitsFlipped,FlippedBitPosition,Generated_SMILES,Device_Used,Valid_SMILES,Molecule
0,Parecoxib,CCC(=O)NS(=O)(=O)c1ccc(cc1)c2c(C)onc2c3ccccc3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",41 80 179 248 294 307 319 323 331 350 361 378 389 461 471 476 502 624 650 656 715 736 807 835 883 896 898 1045 1057 1088 1141 1152 1160 1199 1366 1380 1476 1542 1722 1746 1747 1750 1873 1917,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]","[0, 41, 80, 179, 248, 294, 307, 319, 323, 331, 350, 361, 378, 389, 461, 471, 476, 502, 624, 650, 656, 715, 736, 807, 835, 883, 896, 898, 1045, 1057, 1088, 1141, 1152, 1160, 1199, 1366, 1380, 1476, 1542, 1722, 1746, 1747, 1750, 1873, 1917]",0,CCC(=O)NS(=O)(=O)c1ccc(-c2c(-c3ccccc3)noc2C)cc1,cuda,True,<rdkit.Chem.rdchem.Mol object at 0x7f3bf436c120>
1,Parecoxib,CCC(=O)NS(=O)(=O)c1ccc(cc1)c2c(C)onc2c3ccccc3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",41 80 179 248 294 307 319 323 331 350 361 378 389 461 471 476 502 624 650 656 715 736 807 835 883 896 898 1045 1057 1088 1141 1152 1160 1199 1366 1380 1476 1542 1722 1746 1747 1750 1873 1917,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]","[1, 41, 80, 179, 248, 294, 307, 319, 323, 331, 350, 361, 378, 389, 461, 471, 476, 502, 624, 650, 656, 715, 736, 807, 835, 883, 896, 898, 1045, 1057, 1088, 1141, 1152, 1160, 1199, 1366, 1380, 1476, 1542, 1722, 1746, 1747, 1750, 1873, 1917]",1,CCC(=O)NS(=O)(=O)c1ccc(-c2c(-c3ccccc3)noc2C)cc1,cuda,True,<rdkit.Chem.rdchem.Mol object at 0x7f3bf436c3c0>
2,Parecoxib,CCC(=O)NS(=O)(=O)c1ccc(cc1)c2c(C)onc2c3ccccc3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",41 80 179 248 294 307 319 323 331 350 361 378 389 461 471 476 502 624 650 656 715 736 807 835 883 896 898 1045 1057 1088 1141 1152 1160 1199 1366 1380 1476 1542 1722 1746 1747 1750 1873 1917,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]","[2, 41, 80, 179, 248, 294, 307, 319, 323, 331, 350, 361, 378, 389, 461, 471, 476, 502, 624, 650, 656, 715, 736, 807, 835, 883, 896, 898, 1045, 1057, 1088, 1141, 1152, 1160, 1199, 1366, 1380, 1476, 1542, 1722, 1746, 1747, 1750, 1873, 1917]",2,CCC(=O)NS(=O)(=O)c1ccc(-c2c(-c3ccccc3)noc2C)cc1,cuda,True,<rdkit.Chem.rdchem.Mol object at 0x7f3bf436c2e0>
3,Parecoxib,CCC(=O)NS(=O)(=O)c1ccc(cc1)c2c(C)onc2c3ccccc3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",41 80 179 248 294 307 319 323 331 350 361 378 389 461 471 476 502 624 650 656 715 736 807 835 883 896 898 1045 1057 1088 1141 1152 1160 1199 1366 1380 1476 1542 1722 1746 1747 1750 1873 1917,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]","[3, 41, 80, 179, 248, 294, 307, 319, 323, 331, 350, 361, 378, 389, 461, 471, 476, 502, 624, 650, 656, 715, 736, 807, 835, 883, 896, 898, 1045, 1057, 1088, 1141, 1152, 1160, 1199, 1366, 1380, 1476, 1542, 1722, 1746, 1747, 1750, 1873, 1917]",3,CCC(=O)NS(=O)(=O)c1ccc(-c2c(-c3ccccc3)noc2C)cc1,cuda,True,<rdkit.Chem.rdchem.Mol object at 0x7f3bf436c270>
4,Parecoxib,CCC(=O)NS(=O)(=O)c1ccc(cc1)c2c(C)onc2c3ccccc3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",41 80 179 248 294 307 319 323 331 350 361 378 389 461 471 476 502 624 650 656 715 736 807 835 883 896 898 1045 1057 1088 1141 1152 1160 1199 1366 1380 1476 1542 1722 1746 1747 1750 1873 1917,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]","[4, 41, 80, 179, 248, 294, 307, 319, 323, 331, 350, 361, 378, 389, 461, 471, 476, 502, 624, 650, 656, 715, 736, 807, 835, 883, 896, 898, 1045, 1057, 1088, 1141, 1152, 1160, 1199, 1366, 1380, 1476, 1542, 1722, 1746, 1747, 1750, 1873, 1917]",4,CCC(=O)NS(=O)(=O)c1ccc(-c2c(-c3ccccc3)noc2C)cc1,cuda,True,<rdkit.Chem.rdchem.Mol object at 0x7f3bf436c350>
...,...,...,...,...,...,...,...,...,...,...,...
2043,Parecoxib,CCC(=O)NS(=O)(=O)c1ccc(cc1)c2c(C)onc2c3ccccc3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",41 80 179 248 294 307 319 323 331 350 361 378 389 461 471 476 502 624 650 656 715 736 807 835 883 896 898 1045 1057 1088 1141 1152 1160 1199 1366 1380 1476 1542 1722 1746 1747 1750 1873 1917,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]","[41, 80, 179, 248, 294, 307, 319, 323, 331, 350, 361, 378, 389, 461, 471, 476, 502, 624, 650, 656, 715, 736, 807, 835, 883, 896, 898, 1045, 1057, 1088, 1141, 1152, 1160, 1199, 1366, 1380, 1476, 1542, 1722, 1746, 1747, 1750, 1873, 1917, 2043]",2043,CCC(=O)NS(=O)(=O)c1ccc(-c2c(-c3ccccc3)noc2C)cc1,cuda,True,<rdkit.Chem.rdchem.Mol object at 0x7f3bb39c89e0>
2044,Parecoxib,CCC(=O)NS(=O)(=O)c1ccc(cc1)c2c(C)onc2c3ccccc3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",41 80 179 248 294 307 319 323 331 350 361 378 389 461 471 476 502 624 650 656 715 736 807 835 883 896 898 1045 1057 1088 1141 1152 1160 1199 1366 1380 1476 1542 1722 1746 1747 1750 1873 1917,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]","[41, 80, 179, 248, 294, 307, 319, 323, 331, 350, 361, 378, 389, 461, 471, 476, 502, 624, 650, 656, 715, 736, 807, 835, 883, 896, 898, 1045, 1057, 1088, 1141, 1152, 1160, 1199, 1366, 1380, 1476, 1542, 1722, 1746, 1747, 1750, 1873, 1917, 2044]",2044,CCC(=O)NS(=O)(=O)c1ccc(-c2c(-c3ccccc3)noc2C)cc1,cuda,True,<rdkit.Chem.rdchem.Mol object at 0x7f3bb39c8a50>
2045,Parecoxib,CCC(=O)NS(=O)(=O)c1ccc(cc1)c2c(C)onc2c3ccccc3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",41 80 179 248 294 307 319 323 331 350 361 378 389 461 471 476 502 624 650 656 715 736 807 835 883 896 898 1045 1057 1088 1141 1152 1160 1199 1366 1380 1476 1542 1722 1746 1747 1750 1873 1917,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]","[41, 80, 179, 248, 294, 307, 319, 323, 331, 350, 361, 378, 389, 461, 471, 476, 502, 624, 650, 656, 715, 736, 807, 835, 883, 896, 898, 1045, 1057, 1088, 1141, 1152, 1160, 1199, 1366, 1380, 1476, 1542, 1722, 1746, 1747, 1750, 1873, 1917, 2045]",2045,CCC(=O)NS(=O)(=O)c1ccc(-c2c(-c3ccccc3)noc2C)cc1,cuda,True,<rdkit.Chem.rdchem.Mol object at 0x7f3bb39c8ac0>
2046,Parecoxib,CCC(=O)NS(=O)(=O)c1ccc(cc1)c2c(C)onc2c3ccccc3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",41 80 179 248 294 307 319 323 331 350 361 378 389 461 471 476 502 624 650 656 715 736 807 835 883 896 898 1045 1057 1088 1141 1152 1160 1199 1366 1380 1476 1542 1722 1746 1747 1750 1873 1917,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]","[41, 80, 179, 248, 294, 307, 319, 323, 331, 350, 361, 378, 389, 461, 471, 476, 502, 624, 650, 656, 715, 736, 807, 835, 883, 896, 898, 1045, 1057, 1088, 1141, 1152, 1160, 1199, 1366, 1380, 1476, 1542, 1722, 1746, 1747, 1750, 1873, 1917, 2046]",2046,CCC(=O)NS(=O)(=O)c1ccc(-c2c(-c3ccccc3)noc2C)cc1,cuda,True,<rdkit.Chem.rdchem.Mol object at 0x7f3bb39c8b30>


In [58]:
# Tanimoto similarity between each row's original and bit-flipped fingerprint
for _frame in (parecoxib_df, celecoxib_df, cimicoxib_df, deracoxib_df, anitrazafen_df):
    _frame['Tanimoto'] = _frame.apply(
        lambda row: calculate_tanimoto_safe(row['FingerprintBits'], row['FingerprintBitsFlipped']),
        axis=1,
    )


In [59]:
# Mean of the 'Tanimoto' column for each compound
average_similarity_parecoxib = parecoxib_df['Tanimoto'].mean()
average_similarity_celecoxib = celecoxib_df['Tanimoto'].mean()
average_similarity_cimicoxib = cimicoxib_df['Tanimoto'].mean()
average_similarity_deracoxib = deracoxib_df['Tanimoto'].mean()
average_similarity_anitrazafen = anitrazafen_df['Tanimoto'].mean()

# Printed in the order: Parecoxib, Celecoxib, Cimicoxib, Deracoxib, Anitrazafen
for _average in (
    average_similarity_parecoxib,
    average_similarity_celecoxib,
    average_similarity_cimicoxib,
    average_similarity_deracoxib,
    average_similarity_anitrazafen,
):
    print(f"Average Tanimoto Similarity: {_average}")

Average Tanimoto Similarity: 0.9777669270833333
Average Tanimoto Similarity: 0.9755978467987809
Average Tanimoto Similarity: 0.9791564941406248
Average Tanimoto Similarity: 0.9795818718112244
Average Tanimoto Similarity: 0.9629448784722224


### Compare canonical SMILES
***

In [60]:
def compare_canonical_smiles(smiles1, smiles2):
    """
    Check whether two SMILES strings produce the same canonical isomeric SMILES.

    Args:
        smiles1: First SMILES string.
        smiles2: Second SMILES string.

    Returns:
        dict: always contains a boolean 'match'. On a match it also holds
        'canonical_smiles'. Otherwise it holds a 'reason', plus
        'canonical_smiles1' / 'canonical_smiles2' when both inputs parsed but
        their canonical forms differ. Any exception raised along the way is
        reported as a non-match with the error text in 'reason'.
    """
    try:
        # Both inputs are parsed before either result is inspected
        first_mol = Chem.MolFromSmiles(smiles1)
        second_mol = Chem.MolFromSmiles(smiles2)

        if first_mol is None:
            return {'match': False, 'reason': f"Invalid SMILES 1: {smiles1}"}
        if second_mol is None:
            return {'match': False, 'reason': f"Invalid SMILES 2: {smiles2}"}

        first_canonical, second_canonical = (
            Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True)
            for mol in (first_mol, second_mol)
        )

        if first_canonical != second_canonical:
            return {
                'match': False,
                'reason': "Canonical SMILES do not match",
                'canonical_smiles1': first_canonical,
                'canonical_smiles2': second_canonical,
            }
        return {'match': True, 'canonical_smiles': first_canonical}
    except Exception as err:
        return {'match': False, 'reason': f"Error during comparison: {str(err)}"}

In [61]:
# Compare each generated SMILES with the compound's reference SMILES
for _frame in (parecoxib_df, celecoxib_df, cimicoxib_df, deracoxib_df, anitrazafen_df):
    _frame['Canonical_Match'] = _frame.apply(
        lambda row: compare_canonical_smiles(row['SMILES'], row['Generated_SMILES']),
        axis=1,
    )

[19:21:29] SMILES Parse Error: syntax error while parsing: Invalid
[19:21:29] SMILES Parse Error: Failed parsing SMILES 'Invalid' for input: 'Invalid'


In [62]:
def _count_canonical_matches(frame):
    """Sum the 'match' values stored in the frame's 'Canonical_Match' dicts."""
    return frame['Canonical_Match'].apply(lambda result: result['match']).sum()


canonical_matches_parecoxib = _count_canonical_matches(parecoxib_df)
canonical_matches_celecoxib = _count_canonical_matches(celecoxib_df)
canonical_matches_cimicoxib = _count_canonical_matches(cimicoxib_df)
canonical_matches_deracoxib = _count_canonical_matches(deracoxib_df)
canonical_matches_anitrazafen = _count_canonical_matches(anitrazafen_df)

# One four-line report per compound: header, accuracy, mismatch count, match count
for _label, _frame, _matches in (
    ("Parecoxib", parecoxib_df, canonical_matches_parecoxib),
    ("Celecoxib", celecoxib_df, canonical_matches_celecoxib),
    ("Cimicoxib", cimicoxib_df, canonical_matches_cimicoxib),
    ("Deracoxib", deracoxib_df, canonical_matches_deracoxib),
    ("Anitrazafen", anitrazafen_df, canonical_matches_anitrazafen),
):
    _mismatches = _frame['Canonical_Match'].apply(lambda result: not result['match']).sum()
    print(f"**** {_label} ****")
    print(f"Canonical SMILES Match Accuracy: {_matches / len(_frame):.2%}")
    print(f"Number of Canonical SMILES not matching: {_mismatches}")
    print(f"Number of Canonical SMILES matching: {_matches}")


**** Parecoxib ****
Canonical SMILES Match Accuracy: 99.37%
Number of Canonical SMILES not matching: 13
Number of Canonical SMILES matching: 2035
**** Celecoxib ****
Canonical SMILES Match Accuracy: 99.90%
Number of Canonical SMILES not matching: 2
Number of Canonical SMILES matching: 2046
**** Cimicoxib ****
Canonical SMILES Match Accuracy: 99.85%
Number of Canonical SMILES not matching: 3
Number of Canonical SMILES matching: 2045
**** Deracoxib ****
Canonical SMILES Match Accuracy: 99.90%
Number of Canonical SMILES not matching: 2
Number of Canonical SMILES matching: 2046
**** Anitrazafen ****
Canonical SMILES Match Accuracy: 99.32%
Number of Canonical SMILES not matching: 14
Number of Canonical SMILES matching: 2034


### Compare raw SMILES
***

In [63]:
def _report_raw_smiles_matches(label, frame):
    """
    Print raw (string-equality) SMILES match statistics for one compound.

    Args:
        label: Compound name shown in the report header.
        frame: DataFrame with 'SMILES' and 'Generated_SMILES' columns.

    Returns:
        The per-row result of `SMILES == Generated_SMILES`, as produced by
        DataFrame.apply(axis=1).
    """
    matches = frame.apply(lambda row: row['SMILES'] == row['Generated_SMILES'], axis=1)
    print(f"**** {label} ****")
    print(f"Raw SMILES Match Accuracy: {matches.mean():.2%}")
    print(f"Number of Raw SMILES not matching: {(~matches).sum()}")
    print(f"Number of Raw SMILES matching: {matches.sum()}")
    return matches


# Every compound gets the same four-line report. The Anitrazafen report used
# to omit the final "matching" line; going through the helper keeps them uniform.
raw_smiles_matches_parecoxib = _report_raw_smiles_matches("Parecoxib", parecoxib_df)
raw_smiles_matches_celecoxib = _report_raw_smiles_matches("Celecoxib", celecoxib_df)
raw_smiles_matches_cimicoxib = _report_raw_smiles_matches("Cimicoxib", cimicoxib_df)
raw_smiles_matches_deracoxib = _report_raw_smiles_matches("Deracoxib", deracoxib_df)
raw_smiles_matches_anitrazafen = _report_raw_smiles_matches("Anitrazafen", anitrazafen_df)


**** Parecoxib ****
Raw SMILES Match Accuracy: 0.00%
Number of Raw SMILES not matching: 2048
Number of Raw SMILES matching: 0
**** Celecoxib ****
Raw SMILES Match Accuracy: 0.00%
Number of Raw SMILES not matching: 2048
Number of Raw SMILES matching: 0
**** Cimicoxib ****
Raw SMILES Match Accuracy: 0.00%
Number of Raw SMILES not matching: 2048
Number of Raw SMILES matching: 0
**** Deracoxib ****
Raw SMILES Match Accuracy: 0.00%
Number of Raw SMILES not matching: 2048
Number of Raw SMILES matching: 0
**** Anitrazafen ****
Raw SMILES Match Accuracy: 0.00%
Number of Raw SMILES not matching: 2048


### Plot the nearest neighbors
***

In [64]:
def calculate_fingerprint_similarity_matrix(original_bits, flipped_bits):
    """Return the pairwise Tanimoto similarity matrix of a fingerprint and its variants.

    Row/column 0 corresponds to ``original_bits``; row/column ``k + 1``
    corresponds to ``flipped_bits[k]``.

    Args:
        original_bits: Sequence of bits (for example a ``"0101"`` string).
            Every element must be convertible with ``int``; any non-zero
            value counts as a set bit.
        flipped_bits: Iterable of fingerprints in the same format and with the
            same length as ``original_bits``.

    Returns:
        A ``(n, n)`` float array with ``n = len(flipped_bits) + 1``. Entry
        ``[i, j]`` is ``|A & B| / |A | B|``, or 0 when neither fingerprint has
        a set bit.

    Raises:
        ValueError: If a bit cannot be parsed by ``int`` or the fingerprints
            do not all have the same length.
    """
    fingerprints = [original_bits, *flipped_bits]

    # Parse every fingerprint exactly once. Re-parsing both operands inside a
    # nested i/j loop costs O(n^2 * bits) int() calls for no benefit.
    bits = np.array(
        [[int(bit) != 0 for bit in fingerprint] for fingerprint in fingerprints],
        dtype=bool,
    )

    # float64 keeps the matrix product on the fast path; the values are small
    # integer counts, so the arithmetic stays exact.
    dense = bits.astype(np.float64)
    intersection = dense @ dense.T
    set_bit_counts = dense.sum(axis=1)
    # |A | B| = |A| + |B| - |A & B|
    union = set_bit_counts[:, None] + set_bit_counts[None, :] - intersection

    # Pairs with an empty union (both fingerprints all-zero) keep similarity 0.
    similarity_matrix = np.zeros_like(intersection)
    np.divide(intersection, union, out=similarity_matrix, where=union > 0)
    return similarity_matrix

def plot_mds_similarity(df, title_column='title'):
    """Plot a 2D MDS projection of the original fingerprint and its bit-flipped variants.

    The original fingerprint is read from the first row of ``FingerprintBits``
    and every row of ``FingerprintBitsFlipped`` is one variant. Pairwise
    Tanimoto similarities are turned into dissimilarities (``1 - similarity``),
    embedded in two dimensions with MDS and written as a PNG into
    ``INPUT_DIR``.

    Side effect: when ``df`` has no ``original_index`` column, one is added in
    place as a copy of the index; its values label the plotted points.

    Args:
        df: DataFrame providing ``FingerprintBits``, ``FingerprintBitsFlipped``
            and ``title_column``.
        title_column: Column whose first value names the molecule in the plot
            title and in the output filename.

    Returns:
        True when the image was saved; False for empty input or when any step
        fails (the error is printed, not raised).
    """
    if df.empty:
        print(f"Warning: Empty DataFrame for {title_column}")
        return False

    fig = None
    title = 'unknown'
    try:
        # Store original index
        if 'original_index' not in df.columns:
            df['original_index'] = df.index

        # Resolve the title once so the error path never has to look it up again.
        title = df[title_column].iloc[0]

        # Get original and flipped fingerprint bits
        original_bits = df['FingerprintBits'].iloc[0]
        flipped_bits = df['FingerprintBitsFlipped'].tolist()

        if not flipped_bits:
            print(f"Warning: No flipped bits found for {title}")
            return False

        # Calculate similarity matrix (row/column 0 is the original fingerprint)
        similarity_matrix = calculate_fingerprint_similarity_matrix(original_bits, flipped_bits)

        # Apply MDS on the dissimilarities
        mds = MDS(n_components=2, dissimilarity='precomputed', random_state=42)
        coords = mds.fit_transform(1 - similarity_matrix)

        # Similarity of each flipped fingerprint to the original one
        similarities_to_original = similarity_matrix[0, 1:]

        # Work on an explicit figure so cleanup only ever touches this plot.
        fig, ax = plt.subplots(figsize=(14, 10))

        # Plot flipped fingerprints
        scatter = ax.scatter(coords[1:, 0], coords[1:, 1],
                             c=similarities_to_original,
                             cmap='viridis',
                             alpha=0.6,
                             s=100)

        # Plot original fingerprint
        ax.scatter(coords[0, 0], coords[0, 1],
                   c='red',
                   s=200,
                   label='Original',
                   edgecolors='black')

        # Add color bar and labels
        cbar = fig.colorbar(scatter, ax=ax)
        cbar.set_label('Tanimoto Similarity to Original', rotation=270, labelpad=20)

        # First point is the original; the rest follow the DataFrame row order.
        labels = ["Original", *(str(index) for index in df['original_index'])]
        for label, (x, y) in zip(labels, coords):
            ax.annotate(label,
                        (x, y),
                        xytext=(5, 5),
                        textcoords='offset points',
                        fontsize=8)

        ax.set_title(f"2D Projection of Single-Bit Flips for {title}")
        ax.set_xlabel("MDS Dimension 1")
        ax.set_ylabel("MDS Dimension 2")
        ax.legend()
        fig.tight_layout()

        filename = f'{title.lower()}_single_bit_flips_mds_similarity.png'
        fig.savefig(os.path.join(INPUT_DIR, filename))

        return True

    except Exception as e:
        print(f"Error processing {title}: {str(e)}")
        return False
    finally:
        # Close only the figure created here, on success and failure alike.
        if fig is not None:
            plt.close(fig)
    
def process_and_plot(df):
    """Create the MDS plot for ``df`` and print a one-line status message.

    Any exception raised along the way is printed instead of propagated.
    """
    try:
        if plot_mds_similarity(df):
            message = f"MDS plot saved for {df['title'].iloc[0]} with single-bit flips."
        else:
            message = "Failed to create MDS plot for single-bit flips."
        print(message)
    except Exception as exc:
        print(f"Error in process_and_plot: {exc}")

In [65]:
# Generate the MDS plot for each molecule, in the same order as before.
for _molecule_df in (parecoxib_df, celecoxib_df, cimicoxib_df, deracoxib_df, anitrazafen_df):
    process_and_plot(_molecule_df)

MDS plot saved for Parecoxib with single-bit flips.
MDS plot saved for Celecoxib with single-bit flips.
MDS plot saved for Cimicoxib with single-bit flips.
MDS plot saved for Deracoxib with single-bit flips.
MDS plot saved for Anitrazafen with single-bit flips.


In [66]:
def plot_tsne_similarity(df, title_column='title'):
    """Plot a 2D t-SNE projection of the original fingerprint and its bit-flipped variants.

    The original fingerprint is read from the first row of ``FingerprintBits``
    and every row of ``FingerprintBitsFlipped`` is one variant. Pairwise
    Tanimoto similarities are turned into dissimilarities (``1 - similarity``),
    embedded in two dimensions with t-SNE and written as a PNG into
    ``INPUT_DIR``.

    Side effect: when ``df`` has no ``original_index`` column, one is added in
    place as a copy of the index; its values label the plotted points.

    Args:
        df: DataFrame providing ``FingerprintBits``, ``FingerprintBitsFlipped``
            and ``title_column``.
        title_column: Column whose first value names the molecule in the plot
            title and in the output filename.

    Returns:
        True when the image was saved; False for empty input or when any step
        fails (the error is printed, not raised).
    """
    if df.empty:
        print(f"Warning: Empty DataFrame for {title_column}")
        return False

    fig = None
    title = 'unknown'
    try:
        # Store original index
        if 'original_index' not in df.columns:
            df['original_index'] = df.index

        # Resolve the title once so the error paths never have to look it up again.
        title = df[title_column].iloc[0]

        # Get original and flipped fingerprint bits
        original_bits = df['FingerprintBits'].iloc[0]
        flipped_bits = df['FingerprintBitsFlipped'].tolist()

        if not flipped_bits:
            print(f"Warning: No flipped bits found for {title}")
            return False

        # Calculate similarity matrix using the same method as MDS
        similarity_matrix = calculate_fingerprint_similarity_matrix(original_bits, flipped_bits)

        # t-SNE needs perplexity < n_samples. Cap at 30 for large inputs and
        # fall back to n_samples - 1 for small ones; a floor of 5 would make
        # every input with five or fewer points fail.
        n_samples = len(flipped_bits) + 1  # +1 for original
        perplexity = min(30, n_samples - 1)

        # Apply t-SNE
        try:
            tsne = TSNE(n_components=2,
                        metric='precomputed',
                        random_state=42,
                        perplexity=perplexity,
                        init='random',
                        learning_rate='auto')
            coords = tsne.fit_transform(1 - similarity_matrix)
        except ValueError as e:
            print(f"t-SNE error for {title}: {str(e)}")
            return False

        # Similarity of each flipped fingerprint to the original one
        similarities_to_original = similarity_matrix[0, 1:]

        # Work on an explicit figure so cleanup only ever touches this plot.
        fig, ax = plt.subplots(figsize=(14, 10))

        # Plot flipped fingerprints
        scatter = ax.scatter(coords[1:, 0], coords[1:, 1],
                             c=similarities_to_original,
                             cmap='viridis',
                             alpha=0.6,
                             s=100)

        # Plot original fingerprint
        ax.scatter(coords[0, 0], coords[0, 1],
                   c='red',
                   s=200,
                   label='Original',
                   edgecolors='black')

        # Add color bar and labels
        cbar = fig.colorbar(scatter, ax=ax)
        cbar.set_label('Tanimoto Similarity to Original', rotation=270, labelpad=20)

        # First point is the original; the rest follow the DataFrame row order.
        labels = ["Original", *(str(index) for index in df['original_index'])]
        for label, (x, y) in zip(labels, coords):
            ax.annotate(label,
                        (x, y),
                        xytext=(5, 5),
                        textcoords='offset points',
                        fontsize=8)

        ax.set_title(f"2D Projection of Single-Bit Flips for {title}")
        ax.set_xlabel("t-SNE Dimension 1")
        ax.set_ylabel("t-SNE Dimension 2")
        ax.legend()
        fig.tight_layout()

        filename = f'{title.lower()}_single_bit_flips_tsne_similarity.png'
        fig.savefig(os.path.join(INPUT_DIR, filename))

        return True

    except Exception as e:
        print(f"Error processing {title}: {str(e)}")
        return False
    finally:
        # Close only the figure created here, on success and failure alike.
        if fig is not None:
            plt.close(fig)

def process_and_plot_tsne(df):
    """Create the t-SNE plot for ``df`` and print a one-line status message.

    Any exception raised along the way is printed instead of propagated.
    """
    try:
        if plot_tsne_similarity(df):
            message = f"t-SNE plot saved for {df['title'].iloc[0]} with single-bit flips."
        else:
            message = "Failed to create t-SNE plot for single-bit flips."
        print(message)
    except Exception as exc:
        print(f"Error in process_and_plot_tsne: {exc}")

In [67]:
# Generate the t-SNE plot for each molecule, in the same order as before.
for _molecule_df in (parecoxib_df, celecoxib_df, cimicoxib_df, deracoxib_df, anitrazafen_df):
    process_and_plot_tsne(_molecule_df)

t-SNE plot saved for Parecoxib with single-bit flips.
t-SNE plot saved for Celecoxib with single-bit flips.
t-SNE plot saved for Cimicoxib with single-bit flips.
t-SNE plot saved for Deracoxib with single-bit flips.
t-SNE plot saved for Anitrazafen with single-bit flips.
