In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from itertools import combinations
import plotly.express as px

Compute activity cliffs (pairs or groups of structurally similar compounds that are active against the same target but have large differences in potency) from an xls file containing a set of molecules. The columns "Compound", "standardize_smiles", "PubMedID" and "pIC50" are mandatory.
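The program uses RDKit to compute the Morgan fingerprint of each molecule. For each pair of molecules that share the same PubMedID, a disparity value is calculated as disparity = pIC50_diff / (1 - tanimoto), where pIC50_diff is the absolute difference between the two pIC50 values and tanimoto is the Tanimoto similarity of the two fingerprints. When tanimoto equals 1 the division is undefined, so the disparity is set to pIC50_diff.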

The program needs the pandas, numpy, tqdm, rdkit and plotly libraries; `combinations` comes from `itertools` in the Python standard library.

The example below uses the input file https://www.mdpi.com/article/10.3390/ijms23010259/s1 from Macip G, Garcia-Segura P, Mestres-Truyol J, Saldivar-Espinoza B, Pujadas G, Garcia-Vallvé S. A Review of the Current Landscape of SARS-CoV-2 Main Protease Inhibitors: Have We Hit the Bullseye Yet? Int J Mol Sci. 2021 Dec 27;23(1):259. doi: 10.3390/ijms23010259.

In [2]:
# Definitions of some functions

def fp_as_bitvect(mol, n_bits=2048, radius=2):
    """Generates a Morgan fingerprint as an ExplicitBitVect

    Args:
        mol: RDKit molecule to fingerprint.
        n_bits: Length of the bit vector requested from RDKit (default 2048).
        radius: Morgan radius passed to RDKit. Defaults to 2, the value that
            used to be hard-coded, so existing callers get the same result.

    Returns:
        The result of AllChem.GetMorganFingerprintAsBitVect for these arguments.
    """
    return AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)

def compute_tanimoto(fp1, fp2):
    """Return the Tanimoto similarity of two fingerprints.

    Thin wrapper: both fingerprints are forwarded to
    DataStructs.TanimotoSimilarity and its result is returned unchanged.
    """
    similarity = DataStructs.TanimotoSimilarity(fp1, fp2)
    return similarity

def generate_pairs(df):
    """Generates all possible pairs of compounds with the same PubMedID

    Args:
        df: DataFrame with the columns 'PubMedID', 'Compound', 'pIC50' and
            'fingerprints'. Rows whose fingerprint is missing (None/NaN) are
            left out of the pairing instead of crashing the similarity call.

    Returns:
        A DataFrame with one row per pair and the columns 'PubMedID',
        'Compound1', 'pIC50_1', 'Compound2', 'pIC50_2', 'pIC50_diff',
        'Tanimoto' and 'Disparity'. The columns are present even when no
        pair could be formed, so callers can sort/filter an empty result.
    """
    columns = [
        'PubMedID', 'Compound1', 'pIC50_1', 'Compound2', 'pIC50_2',
        'pIC50_diff', 'Tanimoto', 'Disparity',
    ]
    pair_data = []

    # Iterate through PubMedID groups
    for pubmed_id, group in df.groupby('PubMedID'):
        # A missing fingerprint cannot be compared, so drop those rows first
        usable = group[group['fingerprints'].notna()]
        # Generate all possible combinations (pairs)
        for row1, row2 in combinations(usable.itertuples(index=False), 2):
            tanimoto = compute_tanimoto(row1.fingerprints, row2.fingerprints)
            pIC50_diff = abs(row1.pIC50 - row2.pIC50)
            if tanimoto == 1:
                # 1 - tanimoto would be zero: fall back to the raw difference
                disparity = pIC50_diff
            else:
                disparity = pIC50_diff / (1 - tanimoto)
            pair_data.append({
                'PubMedID': pubmed_id,
                'Compound1': row1.Compound,
                'pIC50_1': row1.pIC50,
                'Compound2': row2.Compound,
                'pIC50_2': row2.pIC50,
                'pIC50_diff': pIC50_diff,
                'Tanimoto': tanimoto,
                'Disparity': disparity
            })

    # Explicit columns keep the schema stable when pair_data is empty
    return pd.DataFrame(pair_data, columns=columns)

In [3]:
# Read input file
df = pd.read_excel('M-pro_Inhibitors.xls')


def _smiles_to_fingerprint(smiles):
    """Return the Morgan fingerprint for a SMILES string, or None if unusable.

    The SMILES is parsed only once. Non-string cells (e.g. NaN from an empty
    spreadsheet cell) and SMILES that RDKit cannot parse both yield None.
    """
    if not isinstance(smiles, str):
        return None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    return fp_as_bitvect(mol)


# Calculate fingerprints
df["fingerprints"] = df["standardize_smiles"].apply(_smiles_to_fingerprint)

# Score every pair within each publication and rank by disparity, highest first
result_df = generate_pairs(df)
result_df = result_df.sort_values(by='Disparity', ascending=False)
result_df.to_csv('disparity_results.csv', index=False)
print(result_df)

                      PubMedID     Compound1   pIC50_1     Compound2  \
3166                  34198327   Z-SAVLD-FMK  7.769551  Z-ASAVLD-FMK   
3165                  34198327    Z-AVLD-FMK  9.045757  Z-ASAVLD-FMK   
51                    32798789   184904-82-3  7.346787  2488719-74-8   
2524                  33891389  2596275-64-6  4.454693  2596275-66-8   
2997                  34118724  2648754-66-7  7.301030  1681060-26-3   
...                        ...           ...       ...           ...   
4705  10.1021/acsptsci.0c00216     CCG-50014  6.823909         ML311   
2925                  33915460    4299-09-06  6.387216  1909243-65-7   
3012                  34118724  2648754-66-7  7.301030  2648754-86-1   
3005                  34118724  2648754-66-7  7.301030  2648754-73-6   
2786                  33915460   153871-75-1  6.886057   443286-41-7   

       pIC50_2  pIC50_diff  Tanimoto  Disparity  
3166  6.585027    1.184524  0.968254  37.312519  
3165  6.585027    2.460731  0.91935

In [4]:
# Interactive scatter plot: similarity on x, activity difference on y.
# Hovering a point shows both compound names and the pair's disparity.
fig = px.scatter(
    result_df,
    x='Tanimoto',
    y='pIC50_diff',
    title='Activity difference vs Tanimoto Similarity',
    labels=dict(
        Tanimoto='Tanimoto Similarity',
        pIC50_diff='Activity difference (ΔpIC50)',
    ),
    hover_data=['Compound1', 'Compound2', 'Disparity'],
)
# Chart height; change this value to resize the figure
fig.update_layout(height=700)
# Render the figure
fig.show()