In [3]:
import os
import random
import pandas as pd
import numpy as np
import scipy.sparse as sp
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm

from scipy.spatial.distance import squareform
from scipy.stats import fisher_exact

from sklearn.cluster import SpectralClustering
from sklearn.model_selection import KFold, train_test_split, GroupKFold

from rdkit import Chem
from rdkit.Chem import QED, AllChem
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.Chem.Scaffolds import MurckoScaffold
from rdkit.Chem.Draw import MolsToGridImage
from rdkit.DataStructs.cDataStructs import BulkTanimotoSimilarity, ExplicitBitVect

%matplotlib inline
%config InlineBackend.figure_formats = ['svg']

random_seed = 1
np.random.seed(random_seed)
random.seed(random_seed)

In [15]:
def get_fingerprints(mol, rad=2):

    fps = dict()
    fps['ec_bit_fp'] = AllChem.GetMorganFingerprintAsBitVect(mol, rad, nBits=2048)
    fps['rdkit_bit_fp'] = Chem.RDKFingerprint(mol, maxPath=rad * 2)

    return fps

def dataset_to_fingerprints(df_raw):

    smiles = df_raw.Reactant_SMILES.unique()
    smiles = set([Chem.CanonSmiles(s) for s in smiles])

    big_smiles   = []
    small_smiles = []
    for smiles in smiles:
        mol = Chem.MolFromSmiles(smiles)
        num_C = [atom.GetAtomicNum() for atom in mol.GetAtoms()].count(6)
        if num_C > 15:
            big_smiles.append(smiles)
        else:
            small_smiles.append(smiles)

    fps = [get_fingerprints(Chem.MolFromSmiles(mol)) for mol in small_smiles+big_smiles]

    return pd.DataFrame(fps), len(small_smiles), len(small_smiles+big_smiles)

# precompute the pairwise distances between all molecules for the rdkit and ecfp bit vectors

def precompute_kernel(bit_vecs_sp):
    # convert scipy sparse fingerprints back to rdkit bit vectors
    bit_vecs_rdkit = []
    print (bit_vecs_sp)
    for bv in bit_vecs_sp:
        # bv = ExplicitBitVect(bit_vec.shape[1])
        # bv.SetBitsFromList(bv.indices.tolist())
        bit_vecs_rdkit.append(bv)


    # generate pair-wise distances and similarities
    dists = []
    for i in tqdm(range(len(bit_vecs_rdkit))):
        dists.extend(
            BulkTanimotoSimilarity(
                bit_vecs_rdkit[i],
                bit_vecs_rdkit[(i+1):],
                returnDistance=True
            )
        )
    dists = squareform(dists)
    return dists

## Calculate kernels

In [None]:
df_dioxirane, n_train_dioxirane, len_dioxirane  = dataset_to_fingerprints(pd.read_csv(f"../data/descriptors/preprocessed_dioxirane_reactions/df_custom.csv", index_col=0))
pk_dioxirane = {fp: precompute_kernel(df_dioxirane[fp]) for fp in ["ec_bit_fp", "rdkit_bit_fp"]}

df_borylation, n_train_borylation, len_borylation  = dataset_to_fingerprints(pd.read_csv(f"../data/descriptors/preprocessed_borylation_reactions/df_custom.csv", index_col=0))
pk_borylation = {fp: precompute_kernel(df_borylation[fp]) for fp in ["ec_bit_fp", "rdkit_bit_fp"]}


## Calculate covariate shift

In [28]:
from random import sample
def kernel_2samp_test(train_idx, sim_mat, verbose=False):
    """
    Implements the unbiased estimator of the maximum mean discrepancy statistic for a training index splitting a given kenrel similarity matrix.
    """

    train_mean = (sim_mat[np.ix_(train_idx, train_idx)].sum() - train_idx.shape[0]) / (train_idx.shape[0] * (train_idx.shape[0] - 1))
    test_mean = (sim_mat[np.ix_(~train_idx, ~train_idx)].sum() - (~train_idx).shape[0]) / ((~train_idx).shape[0] * ((~train_idx).shape[0] - 1))
    train_test_mean = (sim_mat[np.ix_(train_idx, ~train_idx)].sum()) / (train_idx.shape[0] * ((~train_idx).shape[0]))
    if verbose:
      print(f"train mean - {train_mean:.3f}, test mean - {test_mean:.3f}, train-test mean - {train_test_mean:.3f}")

    mmd_squared = train_mean + test_mean - 2 * train_test_mean
    if not np.all(mmd_squared > 0):
      return 0

    return np.sqrt(mmd_squared)


print ("dioxirane results")
cov_shift_results = {}

for kernel in pk_dioxirane:
    cov_shift_results[kernel] = [kernel_2samp_test(np.array(sample(range(len_dioxirane), n_train_dioxirane)), 1 - pk_dioxirane[kernel])]
    cov_shift_results[kernel].append(kernel_2samp_test(np.array(sample(range(n_train_dioxirane), n_train_dioxirane)), 1 - pk_dioxirane[kernel]))

cov_shift_results = pd.DataFrame(cov_shift_results, index = ['random', 'carbons number'])
print (cov_shift_results)

print ("borylation results")
cov_shift_results = {}

for kernel in pk_borylation:
    cov_shift_results[kernel] = [kernel_2samp_test(np.array(sample(range(len_borylation), n_train_borylation)), 1 - pk_borylation[kernel])]
    cov_shift_results[kernel].append(kernel_2samp_test(np.array(sample(range(n_train_borylation), n_train_borylation)), 1 - pk_borylation[kernel]))

cov_shift_results = pd.DataFrame(cov_shift_results, index = ['random', 'carbons number'])
print (cov_shift_results)


dioxirane results
                ec_bit_fp  rdkit_bit_fp
random           0.000000      0.000000
carbons number   0.072092      0.090653
borylation results
                ec_bit_fp  rdkit_bit_fp
random           0.000000      0.000000
carbons number   0.128525      0.122086
