# Example Tanimoto Similarity Calculation with Morgan Fingerprints

This script performs a simple similarity comparison using the Tanimoto coefficient on Morgan fingerprints. It uses the full data set that was used in modeling, in which trisubstituted alkenes are classified as `TriQ2`, `TriQ3`, or `TriQ4`. The original database labels all of them only as `Tri`, a label that is not used in this script.

All you need to do is provide a name, a SMILES string, and an alkene type from this list:
`Mono`, `Gem`, `Cis`, `Trans`, `TriQ2`, `TriQ3`, `TriQ4`, `Tetra`

## Results give 2.0 for an exact SMILES match within the database; otherwise they are on a 0-1 scale based on Tanimoto similarity

Note 1: Dependencies can be installed with the command `pip install numpy pandas rdkit`

Note 2: The original TriQ2, TriQ3, and TriQ4 assignments are based on the full descriptor calculation, which is unavailable in basic SMILES analysis. You can either implement the full workflow, or try all 3 trisubstituted types if you are unsure of the category.

# Input for Similarity Calculation

You can fill in the names, SMILES strings, and alkene types here. The names must be consistent between the two dictionaries; the sanity-check cell further down verifies this.

In [7]:
from pathlib import Path
import pandas as pd

#Query molecules: display name -> SMILES string
name_smi_dict = {
    'Example1':'c1ccccc1C=C',
    'Example2':'c1ccccc1C(C)=C',
    'Example3':'CC1=CC(/C=C/C2=CC(C)=CC=C2)=CC(C)=C1'
}
#Alkene type of each query molecule; the keys must match those of name_smi_dict
name_type_dict = {
    'Example1':'Mono',
    'Example2':'Gem',
    'Example3':'Trans'
}

#This should be the directory containing the `<alkene type>_Train_Test_Split.hdf5` files
dir_path = Path('./Train_Test_Splits')

#This should be the path to the database
#(the script reads its 'Reactant ID', 'Reactant SMILES' and 'ee (%)' columns)
db_df = pd.read_csv("./SAD_Database.csv")

#This should be the name of the output file
out_path = './ClosestAlkenes.csv'

# Required Functions and Imports 

In [8]:
import numpy as np
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
from rdkit import Chem
from rdkit.Chem.PropertyMol import PropertyMol
from rdkit import DataStructs

def isolate_indvl(full_df: pd.DataFrame, alk: str):
    """Return the rows of one alkene type without the columns of its empty quadrants.

    Rows are kept where the ``'Alkene Type'`` column equals ``alk``. Columns
    are then filtered by label: any column whose label starts with ``Q<n>``
    for a quadrant that carries a hydrogen in this alkene type is dropped.

    Quadrant layout used below (H marks an unsubstituted position)::

        Q1 | Q2
        -------
        Q4 | Q3

        Mono : Q1 only          Gem  : Q1, Q4
        Cis  : Q1, Q2           Trans: Q1, Q3
        TriQ2: Q1, Q3, Q4       TriQ3: Q1, Q2, Q4
        TriQ4: Q1, Q2, Q3       Tetra: Q1, Q2, Q3, Q4

    Args:
        full_df: Table containing an ``'Alkene Type'`` column.
        alk: One of the eight alkene type labels listed above.

    Returns:
        The selected rows with the filtered columns (all columns for ``Tetra``).

    Raises:
        ValueError: If ``alk`` is not one of the known alkene types.
    """
    # Column-label pattern per alkene type; None means "keep every column".
    # NOTE(review): the character classes contain literal commas, so a label
    # starting with "Q," would be excluded as well.
    keep_patterns = {
        'Mono': r'^(?!Q[2,3,4])',
        'Gem': r'^(?!Q[2,3])',
        'Cis': r'^(?!Q[3,4])',
        'Trans': r'^(?!Q[2,4])',
        'TriQ2': r'^(?!Q[2])',
        'TriQ3': r'^(?!Q[3])',
        'TriQ4': r'^(?!Q[4])',
        'Tetra': None,
    }

    # Row selection happens first, exactly as before the type is validated.
    type_rows = full_df.loc[full_df['Alkene Type'] == alk]

    if alk not in keep_patterns:
        raise ValueError(f"Unexpected Value Used: {alk}")

    pattern = keep_patterns[alk]
    if pattern is None:
        return type_rows
    return type_rows.filter(regex=pattern)

def retrieve_unique_rdmol(db_df:pd.DataFrame, vals: pd.DataFrame):
    """Build RDKit molecules for the database reactants listed in ``vals``.

    Args:
        db_df: Database table with ``'Reactant ID'`` and ``'Reactant SMILES'``
            columns.
        vals: Table whose index holds the Reactant IDs to look up.

    Returns:
        dict mapping each Reactant ID to its RDKit molecule; the ID is also
        stored on the molecule as the ``_Name`` property. If one ID appears
        with several distinct SMILES, the last one wins.

    Raises:
        ValueError: If a database SMILES cannot be parsed by RDKit.
    """
    wanted = db_df['Reactant ID'].isin(vals.index)
    df_cut = db_df.loc[wanted][['Reactant ID', 'Reactant SMILES']].drop_duplicates(keep='first')

    res = dict()
    for (name, smi) in df_cut.values:
        rdmol = Chem.MolFromSmiles(smi)
        # MolFromSmiles signals a parse failure by returning None; fail with
        # the offending row instead of an AttributeError on the next line.
        if rdmol is None:
            raise ValueError(f"Could not parse SMILES {smi!r} for Reactant ID {name!r}")
        rdmol.SetProp("_Name", name)
        res[name] = rdmol

    return res

def find_can_smiles(o_smi:str):
    """Return the RDKit canonical SMILES for ``o_smi``.

    Args:
        o_smi: SMILES string in any valid notation.

    Returns:
        The canonical SMILES string produced by ``Chem.MolToSmiles``.

    Raises:
        ValueError: If RDKit cannot parse ``o_smi``.
    """
    rdmol = Chem.MolFromSmiles(o_smi)
    # MolFromSmiles returns None on a parse failure; report the bad input
    # here rather than letting MolToSmiles fail on None.
    if rdmol is None:
        raise ValueError(f"Could not parse SMILES: {o_smi!r}")
    return Chem.MolToSmiles(rdmol, canonical=True)

def create_rdmol(test_smi:str, name:str, alk_type:str):
    """Create a named, typed ``PropertyMol`` from a SMILES string.

    Args:
        test_smi: SMILES string of the molecule.
        name: Stored on the molecule as the ``_Name`` property.
        alk_type: Stored on the molecule as the ``_Alkene_Type`` property.

    Returns:
        A ``PropertyMol`` wrapping the parsed molecule with both properties set.

    Raises:
        ValueError: If RDKit cannot parse ``test_smi``.
    """
    parsed = Chem.MolFromSmiles(test_smi)
    # MolFromSmiles returns None on a parse failure; stop before wrapping it.
    if parsed is None:
        raise ValueError(f"Could not parse SMILES for {name!r}: {test_smi!r}")

    rdmol = PropertyMol(parsed)
    rdmol.SetProp("_Name", name)
    rdmol.SetProp("_Alkene_Type", alk_type)

    return rdmol


# Quick Sanity Check

In [9]:
#Alkene type labels accepted by this script
all_alkene_types = [
    'Mono',
    'Gem',
    'Cis',
    'Trans',
    'TriQ2',
    'TriQ3',
    'TriQ4',
    'Tetra'
]

#Input is validated with explicit raises rather than assert, because assert
#statements are skipped when Python runs with -O

#Check alkene types are valid
for alk_type in name_type_dict.values():
    if alk_type not in all_alkene_types:
        raise ValueError(f'Alkene type: {alk_type} not valid! Please use any of the following types:\n{all_alkene_types}')

#Checks that every name with a SMILES string also has an alkene type
for name in name_smi_dict:
    if name not in name_type_dict:
        raise ValueError(f'Name: {name} not found in both dictionaries! Double check spelling')

# Code Running for Similarity

In [10]:
#Creates RDKit mol list from Input
can_smi_dict = dict()
X_val_dict = dict()
for name, smi in name_smi_dict.items():
    can_smiles = find_can_smiles(smi)
    can_smi_dict[name] = can_smiles
    X_val_dict[name] = create_rdmol(can_smiles, name, name_type_dict[name])

#Generates Morgan Fingerprint
fp_gen = GetMorganGenerator(radius=3,fpSize=2048)
fps_2048_val = {name: fp_gen.GetFingerprint(alk) for name, alk in X_val_dict.items()}

#All database SMILES, collected once so the exact-match test is a set lookup
db_smiles = set(db_df['Reactant SMILES'])

#Database fingerprints per alkene type, filled on first use so that several
#query molecules of the same type share one HDF5 read and one fingerprint pass
db_fps_by_type = dict()


def _db_fingerprints(alk_type: str) -> dict:
    """Return {Reactant ID: Morgan fingerprint} for the train+test split of ``alk_type``."""
    if alk_type not in db_fps_by_type:
        split_file = f"{dir_path}/{alk_type}_Train_Test_Split.hdf5"
        X_train = pd.read_hdf(split_file, key='X_Train')
        X_test = pd.read_hdf(split_file, key='X_Test')
        #Only the index (Reactant IDs) of the split tables is used for the lookup
        full_X_rdmols = retrieve_unique_rdmol(db_df, pd.concat([X_train, X_test]))
        db_fps_by_type[alk_type] = {
            db_name: fp_gen.GetFingerprint(db_mol) for db_name, db_mol in full_X_rdmols.items()
        }
    return db_fps_by_type[alk_type]


def _observed_ee(db_rows: pd.DataFrame):
    """Return (first database SMILES, per-Reactant-ID mean ee table) for ``db_rows``.

    Identical (ID, SMILES, ee) rows are dropped before averaging. The returned
    table has the columns 'Closest Alkene' and 'Observed ee (%)'.
    """
    obs = db_rows[['Reactant ID', 'Reactant SMILES', 'ee (%)']].drop_duplicates(keep='first')
    #Recovers the database SMILES
    first_smi = obs['Reactant SMILES'].values[0]
    #Averages the ee when there are multiple values
    mean_ee = obs.groupby('Reactant ID', as_index=False)['ee (%)'].mean()
    #Changes the names of the columns
    mean_ee = mean_ee.rename(columns={'Reactant ID': 'Closest Alkene', 'ee (%)': 'Observed ee (%)'})
    return first_smi, mean_ee


#Iterates through Fingerprints
all_sim_dfs = list()
for name, fp_val in fps_2048_val.items():
    alk_type = name_type_dict[name]
    can_smi = can_smi_dict[name]

    if can_smi in db_smiles:
        #Exact SMILES match: reported with the sentinel value 2.0 instead of a real similarity
        db_smi, max_col_obs_ee = _observed_ee(db_df.loc[db_df['Reactant SMILES'] == can_smi])
        max_sim = 2.0
        closest = max_col_obs_ee['Closest Alkene'].values[0]
    else:
        db_fps = _db_fingerprints(alk_type)
        if not db_fps:
            raise ValueError(f'No database reactants found for alkene type: {alk_type}')

        #Tanimoto similarity of the query against every database reactant of this type
        sims = {db_name: DataStructs.TanimotoSimilarity(fp_val, db_fp) for db_name, db_fp in db_fps.items()}

        #Identifies the maximum value and its label (the first one wins on ties)
        closest = max(sims, key=sims.get)
        max_sim = sims[closest]

        db_smi, max_col_obs_ee = _observed_ee(db_df.loc[db_df['Reactant ID'] == closest])

    #Creates a dataframe with similarity values
    sim_df = pd.DataFrame({'Maximum Tanimoto Similarity': max_sim, 'Closest Alkene': closest}, index=[name])
    #Merges on the closest alkene
    final_sim_df = sim_df.merge(max_col_obs_ee, on='Closest Alkene')
    #merge resets the index, so the query name is restored here
    final_sim_df.index = [name]
    final_sim_df['Example Canonical SMILES'] = can_smi
    final_sim_df['Closest DB SMILES'] = db_smi
    all_sim_dfs.append(final_sim_df)

full_df = pd.concat(all_sim_dfs)
print(full_df)
#Report success only after the file has actually been written
full_df.to_csv(out_path)
print(f'Output written {out_path}')

          Maximum Tanimoto Similarity Closest Alkene  Observed ee (%)  \
Example1                     2.000000       react_15             97.0   
Example2                     2.000000       react_13             93.5   
Example3                     0.243902      react_695             91.0   

                Example Canonical SMILES            Closest DB SMILES  
Example1                     C=Cc1ccccc1                  C=Cc1ccccc1  
Example2                  C=C(C)c1ccccc1               C=C(C)c1ccccc1  
Example3  Cc1cccc(/C=C/c2cc(C)cc(C)c2)c1  Cc1ccc(/C=C/c2ccc(C)cc2)cc1  
Output written ./ClosestAlkenes.csv
