# Project: Decoding Molecules From Fingerprints.
## Group Members:
### Qi Chen, e-mail: gusqichr@student.gu.se
### Nils Dunlop, e-mail: gusdunlni@student.gu.se
### Francisco Alejandro Erazo Piza, e-mail: guserafr@student.gu.se
***

In [32]:
import os
import pandas as pd
from rdkit.Chem import PandasTools
import numpy as np
from joblib import Parallel, delayed
from rdkit.DataStructs import TanimotoSimilarity
from rdkit.Chem import Draw
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import colors as mcolors
import numpy
from PIL import Image
import subprocess
from rdkit.Chem import AllChem
from typing import List, Tuple
import subprocess
from concurrent.futures import ThreadPoolExecutor, as_completed
from rdkit import Chem
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from sklearn.manifold import MDS
from sklearn.manifold import TSNE
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

### Define directories
***

In [33]:
# Home directory of the current user
USER_DIR = os.path.expanduser('~')

# The project root is the parent of the current working directory
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Directory with the bit-flipping results: the parquet inputs are read from
# here and the generated plots are written here as well
INPUT_DIR = os.path.join(PROJECT_ROOT, 'data/bit_flipping_nn_train_molecule')



### Load the bit-flipped results for CHEMBL3897759 and CHEMBL3347413
***

In [34]:
# Load the bit-flipped results of each compound from INPUT_DIR and report
# the size of every table right after it has been read
chembl_3897759 = pd.read_parquet(
    os.path.join(INPUT_DIR, 'chembl_3897759_flipped_df.parquet')
)
print(f"chembl_3897759 shape: {chembl_3897759.shape}")

chembl_3347413 = pd.read_parquet(
    os.path.join(INPUT_DIR, 'chembl_3347413_flipped_df.parquet')
)
print(f"chembl_3347413 shape: {chembl_3347413.shape}")


chembl_3897759 shape: (2048, 9)
chembl_3347413 shape: (2048, 9)


In [35]:
# Preview the first five rows of the CHEMBL3897759 table
chembl_3897759.head(n=5)

Unnamed: 0,ChEMBL ID,SMILES,FingerprintBits,SparseFingerprintBits,FingerprintBitsFlipped,SparseFingerprintBitsFlipped,FlippedBitPosition,Generated_SMILES,Device_Used
0,CHEMBL3897759,Cc1ccc2c(NCc3ccc(NC(=O)C4CCN(Cc5ccc(F)c(F)c5)CC4)cc3)nc(N(C)C)nc2c1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",80 191 216 235 255 263 310 363 378 380 467 531 548 638 650 689 699 708 744 784 785 807 827 843 875 881 926 933 935 974 1013 1019 1057 1077 1097 1106 1152 1163 1261 1275 1309 1325 1357 1380 1416 1422 1480 1539 1693 1700 1722 1733 1750 1754 1765 1771 1791 1816 1823 1837 1840 1855 1873 1917 1928 2004 2016,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]","[0, 80, 191, 216, 235, 255, 263, 310, 363, 378, 380, 467, 531, 548, 638, 650, 689, 699, 708, 744, 784, 785, 807, 827, 843, 875, 881, 926, 933, 935, 974, 1013, 1019, 1057, 1077, 1097, 1106, 1152, 1163, 1261, 1275, 1309, 1325, 1357, 1380, 1416, 1422, 1480, 1539, 1693, 1700, 1722, 1733, 1750, 1754, 1765, 1771, 1791, 1816, 1823, 1837, 1840, 1855, 1873, 1917, 1928, 2004, 2016]",0,Cc1ccc2c(NCc3ccc(NC(=O)C4CCN(Cc5ccc(F)c(F)c5)CC4)cc3)nc(N(C)C)nc2c1,cuda
1,CHEMBL3897759,Cc1ccc2c(NCc3ccc(NC(=O)C4CCN(Cc5ccc(F)c(F)c5)CC4)cc3)nc(N(C)C)nc2c1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",80 191 216 235 255 263 310 363 378 380 467 531 548 638 650 689 699 708 744 784 785 807 827 843 875 881 926 933 935 974 1013 1019 1057 1077 1097 1106 1152 1163 1261 1275 1309 1325 1357 1380 1416 1422 1480 1539 1693 1700 1722 1733 1750 1754 1765 1771 1791 1816 1823 1837 1840 1855 1873 1917 1928 2004 2016,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]","[1, 80, 191, 216, 235, 255, 263, 310, 363, 378, 380, 467, 531, 548, 638, 650, 689, 699, 708, 744, 784, 785, 807, 827, 843, 875, 881, 926, 933, 935, 974, 1013, 1019, 1057, 1077, 1097, 1106, 1152, 1163, 1261, 1275, 1309, 1325, 1357, 1380, 1416, 1422, 1480, 1539, 1693, 1700, 1722, 1733, 1750, 1754, 1765, 1771, 1791, 1816, 1823, 1837, 1840, 1855, 1873, 1917, 1928, 2004, 2016]",1,Cc1ccc2c(NCc3ccc(NC(=O)C4CCN(Cc5ccc(F)c(F)c5)CC4)cc3)nc(N(C)C)nc2c1,cuda
2,CHEMBL3897759,Cc1ccc2c(NCc3ccc(NC(=O)C4CCN(Cc5ccc(F)c(F)c5)CC4)cc3)nc(N(C)C)nc2c1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",80 191 216 235 255 263 310 363 378 380 467 531 548 638 650 689 699 708 744 784 785 807 827 843 875 881 926 933 935 974 1013 1019 1057 1077 1097 1106 1152 1163 1261 1275 1309 1325 1357 1380 1416 1422 1480 1539 1693 1700 1722 1733 1750 1754 1765 1771 1791 1816 1823 1837 1840 1855 1873 1917 1928 2004 2016,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]","[2, 80, 191, 216, 235, 255, 263, 310, 363, 378, 380, 467, 531, 548, 638, 650, 689, 699, 708, 744, 784, 785, 807, 827, 843, 875, 881, 926, 933, 935, 974, 1013, 1019, 1057, 1077, 1097, 1106, 1152, 1163, 1261, 1275, 1309, 1325, 1357, 1380, 1416, 1422, 1480, 1539, 1693, 1700, 1722, 1733, 1750, 1754, 1765, 1771, 1791, 1816, 1823, 1837, 1840, 1855, 1873, 1917, 1928, 2004, 2016]",2,Cc1ccc2c(NCc3ccc(NC(=O)C4CCN(Cc5ccc(F)c(F)c5)CC4)cc3)nc(N(C)C)nc2c1,cuda
3,CHEMBL3897759,Cc1ccc2c(NCc3ccc(NC(=O)C4CCN(Cc5ccc(F)c(F)c5)CC4)cc3)nc(N(C)C)nc2c1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",80 191 216 235 255 263 310 363 378 380 467 531 548 638 650 689 699 708 744 784 785 807 827 843 875 881 926 933 935 974 1013 1019 1057 1077 1097 1106 1152 1163 1261 1275 1309 1325 1357 1380 1416 1422 1480 1539 1693 1700 1722 1733 1750 1754 1765 1771 1791 1816 1823 1837 1840 1855 1873 1917 1928 2004 2016,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]","[3, 80, 191, 216, 235, 255, 263, 310, 363, 378, 380, 467, 531, 548, 638, 650, 689, 699, 708, 744, 784, 785, 807, 827, 843, 875, 881, 926, 933, 935, 974, 1013, 1019, 1057, 1077, 1097, 1106, 1152, 1163, 1261, 1275, 1309, 1325, 1357, 1380, 1416, 1422, 1480, 1539, 1693, 1700, 1722, 1733, 1750, 1754, 1765, 1771, 1791, 1816, 1823, 1837, 1840, 1855, 1873, 1917, 1928, 2004, 2016]",3,Cc1ccc2c(NCc3ccc(NC(=O)C4CCN(Cc5ccc(F)c(F)c5)CC4)cc3)nc(N(C)C)nc2c1,cuda
4,CHEMBL3897759,Cc1ccc2c(NCc3ccc(NC(=O)C4CCN(Cc5ccc(F)c(F)c5)CC4)cc3)nc(N(C)C)nc2c1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",80 191 216 235 255 263 310 363 378 380 467 531 548 638 650 689 699 708 744 784 785 807 827 843 875 881 926 933 935 974 1013 1019 1057 1077 1097 1106 1152 1163 1261 1275 1309 1325 1357 1380 1416 1422 1480 1539 1693 1700 1722 1733 1750 1754 1765 1771 1791 1816 1823 1837 1840 1855 1873 1917 1928 2004 2016,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]","[4, 80, 191, 216, 235, 255, 263, 310, 363, 378, 380, 467, 531, 548, 638, 650, 689, 699, 708, 744, 784, 785, 807, 827, 843, 875, 881, 926, 933, 935, 974, 1013, 1019, 1057, 1077, 1097, 1106, 1152, 1163, 1261, 1275, 1309, 1325, 1357, 1380, 1416, 1422, 1480, 1539, 1693, 1700, 1722, 1733, 1750, 1754, 1765, 1771, 1791, 1816, 1823, 1837, 1840, 1855, 1873, 1917, 1928, 2004, 2016]",4,Cc1ccc2c(NCc3ccc(NC(=O)C4CCN(Cc5ccc(F)c(F)c5)CC4)cc3)nc(N(C)C)nc2c1,cuda


### Validate generated SMILES
***

In [36]:
def smiles_to_mol(smiles):
    """Parse a SMILES string with RDKit and report whether parsing worked.

    Returns:
        A ``(is_valid, mol)`` tuple. ``mol`` is whatever
        ``Chem.MolFromSmiles`` returned, and ``is_valid`` is True exactly
        when that result is not None.
    """
    parsed = Chem.MolFromSmiles(smiles)
    is_valid = parsed is not None
    return is_valid, parsed

In [37]:
# Parse every generated SMILES and store the validity flag and the parsed
# molecule as two new columns on each compound's DataFrame
for _frame in (chembl_3897759, chembl_3347413):
    _frame['Valid_SMILES'], _frame['Molecule'] = zip(
        *_frame['Generated_SMILES'].apply(smiles_to_mol)
    )


In [38]:
def calculate_valid_smiles_percentage(df_dict):
    """Summarise the share of valid generated SMILES for each compound.

    Args:
        df_dict: Mapping of compound name to a DataFrame that has a
            ``Valid_SMILES`` column (boolean flags in this notebook).

    Returns:
        A DataFrame with one row per compound and the columns ``Compound``,
        ``Valid_SMILES`` (sum of the flags) and ``Percentage`` (mean of the
        flags multiplied by 100).
    """
    summary_rows = [
        {
            'Compound': name,
            'Valid_SMILES': frame['Valid_SMILES'].sum(),
            'Percentage': frame['Valid_SMILES'].mean() * 100,
        }
        for name, frame in df_dict.items()
    ]
    return pd.DataFrame(summary_rows)

In [39]:
# Pair each compound name with its DataFrame, then summarise SMILES validity;
# the bare name on the last line makes the notebook display the table
df_dict = dict(
    chembl_3897759=chembl_3897759,
    chembl_3347413=chembl_3347413,
)
valid_smiles_percentage = calculate_valid_smiles_percentage(df_dict)
valid_smiles_percentage

Unnamed: 0,Compound,Valid_SMILES,Percentage
0,chembl_3897759,2048,100.0
1,chembl_3347413,2048,100.0


### Tanimoto Similarity
***

In [40]:
def calculate_tanimoto_safe(fp_bits1, fp_bits2):
    """
    Calculate Tanimoto similarity between two fingerprint bit vectors.

    Any non-zero entry counts as a set bit, so integer, boolean and float
    0/1 vectors are all accepted. (A bitwise ``&``/``|`` would raise on
    float input and give bitwise rather than set semantics for values
    other than 0 and 1.)

    Args:
        fp_bits1: First fingerprint as a sequence or array of bits.
        fp_bits2: Second fingerprint; must have the same length as the first.

    Returns:
        The similarity ``|A and B| / |A or B|`` as a Python float, ``0.0``
        when neither fingerprint has a set bit, or ``None`` when the lengths
        differ or an error occurs (a message is printed in both cases).
    """
    try:
        bits1 = np.asarray(fp_bits1)
        bits2 = np.asarray(fp_bits2)

        # Verify that both fingerprints have the same length
        if len(bits1) != len(bits2):
            print(f"Fingerprint lengths don't match: {len(bits1)} vs {len(bits2)}")
            return None

        # Count the bits set in both (intersection) and in either (union)
        intersection = np.count_nonzero(np.logical_and(bits1, bits2))
        union = np.count_nonzero(np.logical_or(bits1, bits2))

        # Avoid division by zero
        if union == 0:
            print("Warning: Union of fingerprints is zero")
            return 0.0

        return float(intersection / union)

    except Exception as e:
        # Best-effort helper: report the problem and let the caller continue
        print(f"Error calculating Tanimoto similarity: {e}")
        return None

In [41]:
# Per row: similarity between the original fingerprint and its flipped variant
for _frame in (chembl_3897759, chembl_3347413):
    _frame['Tanimoto'] = _frame.apply(
        lambda row: calculate_tanimoto_safe(row['FingerprintBits'], row['FingerprintBitsFlipped']),
        axis=1,
    )


In [42]:
# Mean of the per-row Tanimoto similarities, first for CHEMBL3897759 ...
average_similarity_chembl_3897759 = chembl_3897759['Tanimoto'].mean()
print(f"Average Tanimoto Similarity: {average_similarity_chembl_3897759}")

# ... then for CHEMBL3347413
average_similarity_chembl_3347413 = chembl_3347413['Tanimoto'].mean()
print(f"Average Tanimoto Similarity: {average_similarity_chembl_3347413}")


Average Tanimoto Similarity: 0.9852869370404407
Average Tanimoto Similarity: 0.9814724392361109


### Compare canonical SMILES
***

In [43]:
def compare_canonical_smiles(smiles1, smiles2):
    """Check whether two SMILES strings share the same canonical SMILES.

    Both strings are parsed with RDKit and written back as canonical,
    isomeric SMILES before being compared.

    Returns:
        A dict that always has a boolean ``'match'`` key. On a match it also
        carries ``'canonical_smiles'``; otherwise it carries a ``'reason'``
        and, when both inputs parsed, ``'canonical_smiles1'`` and
        ``'canonical_smiles2'``. Any exception is reported through
        ``'reason'`` instead of being raised.
    """
    try:
        first_mol = Chem.MolFromSmiles(smiles1)
        second_mol = Chem.MolFromSmiles(smiles2)

        # Guard clauses: an unparsable input can never match
        if first_mol is None:
            return {'match': False, 'reason': f"Invalid SMILES 1: {smiles1}"}
        if second_mol is None:
            return {'match': False, 'reason': f"Invalid SMILES 2: {smiles2}"}

        first_canonical, second_canonical = (
            Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True)
            for mol in (first_mol, second_mol)
        )

        if first_canonical != second_canonical:
            return {
                'match': False,
                'reason': "Canonical SMILES do not match",
                'canonical_smiles1': first_canonical,
                'canonical_smiles2': second_canonical,
            }
        return {'match': True, 'canonical_smiles': first_canonical}
    except Exception as error:
        return {'match': False, 'reason': f"Error during comparison: {str(error)}"}

In [44]:
# Per row: compare the reference SMILES with the generated SMILES and keep
# the full comparison result (a dict) in a new column
for _frame in (chembl_3897759, chembl_3347413):
    _frame['Canonical_Match'] = _frame.apply(
        lambda row: compare_canonical_smiles(row['SMILES'], row['Generated_SMILES']),
        axis=1,
    )


In [45]:
# Number of rows whose comparison result reports a canonical match
canonical_matches_chembl_3897759 = chembl_3897759['Canonical_Match'].apply(lambda x: x['match']).sum()
canonical_matches_chembl_3347413 = chembl_3347413['Canonical_Match'].apply(lambda x: x['match']).sum()

# Print the same three-line report for each compound
for _header, _frame, _n_match in (
    ("**** Chembl_3897759 ****", chembl_3897759, canonical_matches_chembl_3897759),
    ("**** Chembl_3347413 ****", chembl_3347413, canonical_matches_chembl_3347413),
):
    _n_miss = _frame['Canonical_Match'].apply(lambda x: not x['match']).sum()
    print(_header)
    print(f"Canonical SMILES Match Accuracy: {_n_match / len(_frame):.2%}")
    print(f"Number of Canonical SMILES not matching: {_n_miss}")
    print(f"Number of Canonical SMILES matching: {_n_match}")



**** Chembl_3897759 ****
Canonical SMILES Match Accuracy: 99.76%
Number of Canonical SMILES not matching: 5
Number of Canonical SMILES matching: 2043
**** Chembl_3347413 ****
Canonical SMILES Match Accuracy: 99.02%
Number of Canonical SMILES not matching: 20
Number of Canonical SMILES matching: 2028


### Compare raw SMILES
***

In [46]:
# Row-wise exact string comparison of the reference and generated SMILES
raw_smiles_matches_chembl_3897759 = chembl_3897759.apply(
    lambda row: row['SMILES'] == row['Generated_SMILES'], axis=1
)
raw_smiles_matches_chembl_3347413 = chembl_3347413.apply(
    lambda row: row['SMILES'] == row['Generated_SMILES'], axis=1
)

# Print the same three-line report for each compound
for _header, _matches in (
    ("**** Chembl_3897759 ****", raw_smiles_matches_chembl_3897759),
    ("**** Chembl_3347413 ****", raw_smiles_matches_chembl_3347413),
):
    print(_header)
    print(f"Raw SMILES Match Accuracy: {_matches.mean():.2%}")
    print(f"Number of Raw SMILES not matching: {(~_matches).sum()}")
    print(f"Number of Raw SMILES matching: {_matches.sum()}")


**** Chembl_3897759 ****
Raw SMILES Match Accuracy: 99.76%
Number of Raw SMILES not matching: 5
Number of Raw SMILES matching: 2043
**** Chembl_3347413 ****
Raw SMILES Match Accuracy: 99.02%
Number of Raw SMILES not matching: 20
Number of Raw SMILES matching: 2028


### Plot 2D projections (MDS and t-SNE) of the single-bit-flip fingerprints
***

In [47]:
def calculate_fingerprint_similarity_matrix(original_bits, flipped_bits):
    """Calculate similarity matrix comparing original and flipped fingerprint bits.

    Args:
        original_bits: The reference fingerprint, as a sequence of bits
            (ints, bools or the characters of a bit string).
        flipped_bits: List of fingerprints in the same format and of the
            same length as ``original_bits``.

    Returns:
        A float array of shape ``(n, n)`` with ``n = len(flipped_bits) + 1``
        holding the pairwise Tanimoto similarities. Row/column 0 is the
        original fingerprint, followed by the flipped ones in input order.
        Pairs whose union is empty get a similarity of 0.

    Raises:
        ValueError: If the fingerprints do not all have the same length.
    """
    all_bits = [original_bits, *flipped_bits]

    # Convert every fingerprint exactly once (not once per pair); any
    # non-zero entry counts as a set bit
    bit_matrix = np.array(
        [[int(b) for b in bits] for bits in all_bits], dtype=bool
    ).astype(np.float64)

    # Number of set bits per fingerprint and, via a matrix product, the
    # number of bits shared by each pair. All counts are small integers,
    # so the float arithmetic is exact.
    on_counts = bit_matrix.sum(axis=1)
    intersection = bit_matrix @ bit_matrix.T
    union = on_counts[:, None] + on_counts[None, :] - intersection

    # Tanimoto = intersection / union; entries with an empty union stay 0
    similarity_matrix = np.zeros_like(intersection)
    np.divide(intersection, union, out=similarity_matrix, where=union > 0)

    return similarity_matrix

def plot_mds_similarity(df, title_column='ChEMBL ID'):
    """Plot MDS similarity using original and flipped fingerprint bits.

    The first row's ``FingerprintBits`` is taken as the original fingerprint
    and every row's ``FingerprintBitsFlipped`` as a variant. Their pairwise
    Tanimoto similarities are converted to dissimilarities
    (``1 - similarity``), embedded in 2D with MDS and saved as a PNG file
    in ``INPUT_DIR``.

    Side effect: an ``original_index`` column (a copy of the index) is added
    to ``df`` when it is missing; it supplies the point labels.

    Args:
        df: DataFrame with ``FingerprintBits``, ``FingerprintBitsFlipped``
            and ``title_column`` columns.
        title_column: Column whose first value names the compound in the
            plot title and in the output file name.

    Returns:
        True when the plot was saved, False otherwise (empty input, no
        flipped fingerprints, or an error while computing or plotting).
    """
    if df.empty:
        print(f"Warning: Empty DataFrame for {title_column}")
        return False

    try:
        # Store original index
        if 'original_index' not in df.columns:
            df['original_index'] = df.index

        # Get original and flipped fingerprint bits
        original_bits = df['FingerprintBits'].iloc[0]
        flipped_bits = df['FingerprintBitsFlipped'].tolist()

        if not flipped_bits:
            print(f"Warning: No flipped bits found for {df[title_column].iloc[0]}")
            return False

        # Calculate similarity matrix
        similarity_matrix = calculate_fingerprint_similarity_matrix(original_bits, flipped_bits)

        # Apply MDS on the dissimilarities (1 - similarity)
        mds = MDS(n_components=2, dissimilarity='precomputed', random_state=42)
        coords = mds.fit_transform(1 - similarity_matrix)

        # Row 0 of the matrix is the original fingerprint; the remaining
        # entries are its similarities to each flipped fingerprint
        similarities_to_original = similarity_matrix[0, 1:]

        # Create plot
        plt.figure(figsize=(14, 10))

        # Plot flipped fingerprints
        scatter = plt.scatter(coords[1:, 0], coords[1:, 1],
                              c=similarities_to_original,
                              cmap='viridis',
                              alpha=0.6,
                              s=100)

        # Plot original fingerprint
        plt.scatter(coords[0, 0], coords[0, 1],
                    c='red',
                    s=200,
                    label='Original',
                    edgecolors='black')

        # Add color bar and labels
        cbar = plt.colorbar(scatter)
        cbar.set_label('Tanimoto Similarity to Original', rotation=270, labelpad=20)

        # Point 0 is the original; the others are labelled with their row index
        labels = ["Original", *(f"{idx}" for idx in df['original_index'])]
        for label, point in zip(labels, coords):
            plt.annotate(label,
                         (point[0], point[1]),
                         xytext=(5, 5),
                         textcoords='offset points',
                         fontsize=8)

        plt.title(f"2D Projection of Single-Bit Flips for {df[title_column].iloc[0]}")
        plt.xlabel("MDS Dimension 1")
        plt.ylabel("MDS Dimension 2")
        plt.legend()
        plt.tight_layout()

        filename = f'{df[title_column].iloc[0].lower()}_single_bit_flips_mds_similarity.png'
        plt.savefig(os.path.join(INPUT_DIR, filename))
        plt.close()

        return True

    except Exception as e:
        # Report the failure, release the figure and signal it to the caller
        print(f"Error processing {df[title_column].iloc[0] if not df.empty else 'unknown'}: {str(e)}")
        plt.close()
        return False
    
def process_and_plot(df):
    """Create the MDS plot for ``df`` and print whether it succeeded.

    Exceptions are caught and printed rather than raised, so the notebook
    can carry on with the next compound.
    """
    try:
        if plot_mds_similarity(df):
            print(f"MDS plot saved for {df['ChEMBL ID'].iloc[0]} with single-bit flips.")
            return
        print(f"Failed to create MDS plot for single-bit flips.")
    except Exception as error:
        print(f"Error in process_and_plot: {str(error)}")

In [48]:
# Create the MDS plot for each compound in turn
for _frame in (chembl_3897759, chembl_3347413):
    process_and_plot(_frame)


MDS plot saved for CHEMBL3897759 with single-bit flips.
MDS plot saved for CHEMBL3347413 with single-bit flips.


In [49]:
def plot_tsne_similarity(df, title_column='ChEMBL ID'):
    """Plot t-SNE similarity using original and flipped fingerprint bits.

    The first row's ``FingerprintBits`` is taken as the original fingerprint
    and every row's ``FingerprintBitsFlipped`` as a variant. Their pairwise
    Tanimoto similarities are converted to dissimilarities
    (``1 - similarity``), embedded in 2D with t-SNE and saved as a PNG file
    in ``INPUT_DIR``.

    Side effect: an ``original_index`` column (a copy of the index) is added
    to ``df`` when it is missing; it supplies the point labels.

    Args:
        df: DataFrame with ``FingerprintBits``, ``FingerprintBitsFlipped``
            and ``title_column`` columns.
        title_column: Column whose first value names the compound in the
            plot title and in the output file name.

    Returns:
        True when the plot was saved, False otherwise (empty input, no
        flipped fingerprints, or an error while computing or plotting).
    """
    if df.empty:
        print(f"Warning: Empty DataFrame for {title_column}")
        return False

    try:
        # Store original index
        if 'original_index' not in df.columns:
            df['original_index'] = df.index

        # Get original and flipped fingerprint bits
        original_bits = df['FingerprintBits'].iloc[0]
        flipped_bits = df['FingerprintBitsFlipped'].tolist()

        if not flipped_bits:
            print(f"Warning: No flipped bits found for {df[title_column].iloc[0]}")
            return False

        # Calculate similarity matrix using the same method as MDS
        similarity_matrix = calculate_fingerprint_similarity_matrix(original_bits, flipped_bits)

        # Adjust perplexity based on number of samples: t-SNE requires
        # perplexity < n_samples, so cap it at n_samples - 1 and never
        # exceed the usual 30. (A lower bound of 5 would make every input
        # with fewer than 6 samples fail.)
        n_samples = len(flipped_bits) + 1  # +1 for original
        perplexity = min(30, n_samples - 1)

        # Apply t-SNE on the dissimilarities (1 - similarity)
        try:
            tsne = TSNE(n_components=2,
                        metric='precomputed',
                        random_state=42,
                        perplexity=perplexity,
                        init='random',
                        learning_rate='auto')
            coords = tsne.fit_transform(1 - similarity_matrix)
        except ValueError as e:
            print(f"t-SNE error for {df[title_column].iloc[0]}: {str(e)}")
            return False

        # Row 0 of the matrix is the original fingerprint; the remaining
        # entries are its similarities to each flipped fingerprint
        similarities_to_original = similarity_matrix[0, 1:]

        # Create plot
        plt.figure(figsize=(14, 10))

        # Plot flipped fingerprints
        scatter = plt.scatter(coords[1:, 0], coords[1:, 1],
                              c=similarities_to_original,
                              cmap='viridis',
                              alpha=0.6,
                              s=100)

        # Plot original fingerprint
        plt.scatter(coords[0, 0], coords[0, 1],
                    c='red',
                    s=200,
                    label='Original',
                    edgecolors='black')

        # Add color bar and labels
        cbar = plt.colorbar(scatter)
        cbar.set_label('Tanimoto Similarity to Original', rotation=270, labelpad=20)

        # Point 0 is the original; the others are labelled with their row index
        labels = ["Original", *(f"{idx}" for idx in df['original_index'])]
        for label, point in zip(labels, coords):
            plt.annotate(label,
                         (point[0], point[1]),
                         xytext=(5, 5),
                         textcoords='offset points',
                         fontsize=8)

        plt.title(f"2D Projection of Single-Bit Flips for {df[title_column].iloc[0]}")
        plt.xlabel("t-SNE Dimension 1")
        plt.ylabel("t-SNE Dimension 2")
        plt.legend()
        plt.tight_layout()

        filename = f'{df[title_column].iloc[0].lower()}_single_bit_flips_tsne_similarity.png'
        plt.savefig(os.path.join(INPUT_DIR, filename))
        plt.close()

        return True

    except Exception as e:
        # Report the failure, release the figure and signal it to the caller
        print(f"Error processing {df[title_column].iloc[0] if not df.empty else 'unknown'}: {str(e)}")
        plt.close()
        return False

def process_and_plot_tsne(df):
    """Create the t-SNE plot for ``df`` and print whether it succeeded.

    Exceptions are caught and printed rather than raised, so the notebook
    can carry on with the next compound.
    """
    try:
        if plot_tsne_similarity(df):
            print(f"t-SNE plot saved for {df['ChEMBL ID'].iloc[0]} with single-bit flips.")
            return
        print(f"Failed to create t-SNE plot for single-bit flips.")
    except Exception as error:
        print(f"Error in process_and_plot_tsne: {str(error)}")

In [50]:
# Create the t-SNE plot for each compound in turn
for _frame in (chembl_3897759, chembl_3347413):
    process_and_plot_tsne(_frame)

t-SNE plot saved for CHEMBL3897759 with single-bit flips.
t-SNE plot saved for CHEMBL3347413 with single-bit flips.
