In [None]:
!conda install -c tmap tmap

In [None]:
import pandas as pd
import tmap
from faerun import Faerun
from mhfp.encoder import MHFPEncoder
from rdkit.Chem import AllChem
import numpy as np
from rdkit import Chem
from tqdm import tqdm
from drfp import DrfpEncoder
import matplotlib.pyplot as plt

**Import dataset from Medina et al.**

In [None]:
# Read the cleaned dataset from disk; the first CSV column is used as the
# row index of the resulting frame.
mixture_df = pd.read_csv(
    './data/database_IAC_ln_clean.csv',
    index_col=0,
)
# Bare last expression so the notebook renders the loaded frame.
mixture_df

**Obtain concatenated solvent and solute representations**

In [None]:
# Walk the solvent and solute SMILES columns in lockstep (row order).
smiles_pairs = list(zip(mixture_df['Solvent_SMILES'], mixture_df['Solute_SMILES']))

# Reaction-style encoding: "solvent>>solute".
rxn_smi = [solvent + '>>' + solute for solvent, solute in smiles_pairs]
# Mixture encoding: "solvent.solute".
mixture_smi = [solvent + '.' + solute for solvent, solute in smiles_pairs]

# Store both encodings as new columns on the frame.
mixture_df['rxn_smi'] = rxn_smi
mixture_df['mixture_smi'] = mixture_smi

**Encode solvent/solute mixtures as fingerprints**

In [None]:
# Folded fingerprint length; larger values have been observed to crash
# occasionally.
bits = 1024

# Encode every "solvent>>solute" string with DRFP and stack the results
# into a single NumPy array.
rxn_fps = np.array(
    DrfpEncoder.encode(
        mixture_df['rxn_smi'].tolist(),
        n_folded_length=bits,
    )
)
# Show the array dimensions as the cell output.
rxn_fps.shape

**Hashing to prepare fingerprint array for tmap plotting**

In [None]:
# Both the MinHash encoder and the LSH forest are constructed with `bits`.
mh_encoder = tmap.Minhash(bits)
lf = tmap.LSHForest(bits)

# Hash each fingerprint row as a weight array using the "I2CWS" method;
# tqdm shows progress over the rows.
mhfps = []
for fp in tqdm(rxn_fps):
    mhfps.append(mh_encoder.from_weight_array(fp.tolist(), method="I2CWS"))

In [None]:
# Add every MinHash in `mhfps` to the LSH forest in one batch call.
lf.batch_add(mhfps)
# NOTE(review): presumably the forest must be indexed after entries are added
# and before it is used to compute the layout — confirm against the tmap docs.
lf.index()

**Plot tmap**

In [None]:
# The layout depends on hashing, so each run produces a slightly different
# plot of the tmap.
x, y, s, t, _ = tmap.layout_from_lsh_forest(lf)

# Per-point payload for the scatter layer: coordinates, the value used for
# colouring ("Literature" column) and the mixture SMILES used as label.
scatter_data = {
    "x": x,
    "y": y,
    "c": list(mixture_df.Literature.values),
    "labels": mixture_df['mixture_smi'],
}
# Edge list of the tree (start and end vertices).
tree_edges = {"from": s, "to": t}

# Now plot the data (clear_color="#FFFFFF" can additionally be passed to
# Faerun; it was left disabled in the original cell).
faerun = Faerun(view="front", coords=False)
faerun.add_scatter(
    "liquid_mixture",
    scatter_data,
    point_scale=5,
    colormap=['plasma'],
    has_legend=True,
    legend_title=['Activity coefficients'],
    categorical=[False],
    shader='smoothCircle',
)

# Add the tree edges, using the "liquid_mixture" scatter layer as the
# point helper.
faerun.add_tree("liquid_mixture_tree", tree_edges, point_helper="liquid_mixture")

# Choose the "smiles" template to display structure on hover
faerun.plot('liquid_mixture', template="smiles", notebook_height=750)

**Plotting histogram of activity coefficients**

In [None]:
def friedman_diaconis_bins(data):
    """Compute the number of histogram bins using the Freedman-Diaconis rule.

    The rule sets the bin width to ``2 * IQR * n ** (-1/3)``. The bin count
    is the data range divided by that width, rounded up so that no bin is
    wider than the rule prescribes.

    Parameters
    ----------
    data : array-like
        Numeric sample values (list, NumPy array, pandas Series, ...).
        Multi-dimensional input is flattened.

    Returns
    -------
    int
        Number of bins, always at least 1. A single bin is returned when
        the rule yields no usable width (zero IQR, constant data, or
        non-finite values).

    Raises
    ------
    ValueError
        If ``data`` contains no values.
    """
    values = np.asarray(data, dtype=float).ravel()
    if values.size == 0:
        raise ValueError("data must contain at least one value")

    q75, q25 = np.percentile(values, [75, 25])
    bin_width = 2 * (q75 - q25) * values.size ** (-1 / 3)
    data_range = values.max() - values.min()

    # A zero/NaN width or a zero/non-finite range would otherwise lead to a
    # division by zero or int(inf)/int(nan); fall back to a single bin.
    if not (bin_width > 0 and data_range > 0 and np.isfinite(data_range)):
        return 1

    # Round up (not down) and never return fewer than one bin.
    return max(1, int(np.ceil(data_range / bin_width)))


# Copy the "Literature" column into a NumPy array and derive the bin count.
data = np.array(mixture_df['Literature'].values)
num_bins = friedman_diaconis_bins(data)

# Draw the histogram on explicit axes; the bar colour is the midpoint of
# the plasma colormap.
fig, ax = plt.subplots()
ax.hist(data, bins=num_bins, color=plt.cm.plasma(0.5))
ax.set_title('Histogram of solvent/solute activity coefficients')
ax.set_xlabel('Activity coefficients')
ax.set_ylabel('Frequency')
plt.show()

# Write the figure to a PDF in the working directory.
fig.savefig('histogram.pdf')