In [5]:
import os
import pickle

import numpy as np
import pandas as pd
import scipy.stats as ss
import tmap as tm
from faerun import Faerun
from matplotlib.colors import ListedColormap
from rdkit import Chem
from rdkit.Chem import AllChem


In [6]:

# Assemble the molecule table used to draw the TMAPs for the 4 cancer sites:
# generated molecules and the reference drug panel, stacked into one frame.

# Both CSV files live below the same directory in the user's home folder.
_paccmann_rl_dir = os.path.join(
    os.path.expanduser('~'), 'Box', 'Molecular_SysBio', 'data', 'paccmann',
    'paccmann_rl'
)

# Reference drug panel: tag its origin and align column names.
drug_df = (
    pd.read_csv(os.path.join(_paccmann_rl_dir, 'panel_drugs.csv'), index_col=0)
    .assign(source='GDSC/CCLE')
    .rename(columns={'IC50_best_site': 'IC50', 'scscore': 'SCScore'})
)

# Generated molecules: they carry no drug name, hence the 'N.A.' placeholder.
gen_df = (
    pd.read_csv(
        os.path.join(
            _paccmann_rl_dir, 'biased_models', 'best_generated_drugs.csv'
        ),
        index_col=0
    )
    .assign(drug='N.A.', source='Generated')
    .rename(columns={'cell_line': 'cancer_site', 'esol': 'ESOL'})
)

# Join the dataframes; join='inner' keeps only the columns both frames share.
mol_df = pd.concat([gen_df, drug_df], join='inner')




In [7]:

# Cancer site to visualise. The combined data frame is later filtered on
# `cancer_site == site`, and the value is also used to name the output paths.
site = 'prostate'

In [8]:

def tm_morgan_vector(smiles, radius=2, n_bits=512):
    """Encode a SMILES string as a tmap vector of Morgan fingerprint bits.

    Args:
        smiles: SMILES string of the molecule to encode.
        radius: Morgan fingerprint radius handed to RDKit. Defaults to 2.
        n_bits: Length of the fingerprint bit vector. Defaults to 512.
            NOTE(review): this notebook builds its LSHForest with 512 as the
            first argument, so presumably the two must agree - confirm before
            changing one of them.

    Returns:
        A `tm.VectorUint` constructed from the RDKit bit vector.

    Raises:
        ValueError: If RDKit cannot parse `smiles` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit reports a parse failure by returning None; fail here with the
        # offending input instead of an opaque error inside the fingerprint call.
        raise ValueError('RDKit could not parse SMILES: {!r}'.format(smiles))
    return tm.VectorUint(
        AllChem.GetMorganFingerprintAsBitVect(
            mol, radius=radius, nBits=n_bits
        )
    )


# Restrict the combined table to the molecules of the selected cancer site.
df = mol_df[mol_df.cancer_site == site]
if df.empty:
    # Fail before any output file is written; later steps (max/min of the
    # property columns) cannot handle an empty selection anyway.
    raise ValueError(
        'No molecules found for cancer site {!r}'.format(site)
    )

# One fingerprint vector per molecule, in row order. A plain comprehension
# replaces Series.apply(..., convert_dtype=False), whose `convert_dtype`
# argument is deprecated in recent pandas releases.
tmap_fps = [tm_morgan_vector(smiles) for smiles in df.SMILES]

root = 'https://pubchem.ncbi.nlm.nih.gov/#query='

# Each label has three '__'-separated fields: SMILES, a PubChem link (or a
# placeholder for molecules without a drug name), and the SMILES again.
labels_new = []
for smiles, drug in zip(df.SMILES, df.drug):
    if drug != 'N.A.':
        link = '<a href="' + root + drug + '">' + drug + '</a>'
    else:
        link = 'No link available'
    labels_new.append(smiles + '__' + link + '__' + smiles)

# Index the fingerprints in an LSH forest; the layout is computed from it.
lf = tm.LSHForest(512, 32)
lf.batch_add(tmap_fps)
lf.index()

# Per-molecule origin ('Generated' vs. 'GDSC/CCLE'), used as the group series.
groups = list(df.source.values)

# Persist the LSH forest and the site-specific table.
# NOTE(review): an earlier comment here said the kernel must run from
# paccmann_chemistry or higher. `save_path` is anchored at the user's home
# directory, so that presumably refers to the relative 'tmap/<site>' path
# used when plotting - confirm.
save_path = os.path.join(
    os.path.expanduser('~'), 'paccmann', 'paccmann_datasets', 'pytoda',
    'visualizations', 'molecules', 'tmap', site
)
os.makedirs(save_path, exist_ok=True)

forest_file = os.path.join(save_path, 'data.dat')
properties_file = os.path.join(save_path, 'properties.dat')

lf.store(forest_file)
with open(properties_file, 'wb+') as properties_handle:
    pickle.dump(df, properties_handle, protocol=pickle.HIGHEST_PROTOCOL)

def _rank_fraction(values):
    """Return the rank of each value divided by the number of values.

    Ranks come from `scipy.stats.rankdata` with its default settings, so the
    smallest value gets the lowest rank and the result lies in (0, 1].

    The values are ranked as they are. Scaling them by their maximum first
    does not change the ranks when the maximum is positive, but it reverses
    them when the maximum is negative and divides by zero when it is 0.
    """
    values = np.asarray(values)
    return ss.rankdata(values) / len(values)


# Rank-normalised property series used to colour the scatter plot.
qed_ranked = _rank_fraction(df.QED)
scscore_ranked = _rank_fraction(df.SCScore)
molwt_ranked = _rank_fraction(df.mol_weight)
esol_ranked = _rank_fraction(df.ESOL)

# Turn the raw group names into legend entries plus the per-point values that
# are later passed to Faerun as the categorical colour series.
labels_groups, groups = Faerun.create_categories(groups)

# Layout settings for the tree embedding.
cfg = tm.LayoutConfiguration()
cfg.k = 20
cfg.sl_extra_scaling_steps = 10
# cfg.sl_repeats = 12
# cfg.mmm_repeats = 2
cfg.node_size = 50

# x/y are the point coordinates; s/t are used as the 'from'/'to' ends of the
# tree edges when the tree is added to the plot.
x, y, s, t, _ = tm.layout_from_lsh_forest(lf, config=cfg)
x, y, s, t = list(x), list(y), list(s), list(t)

# Store the layout next to the other artefacts. The context manager closes
# the file handle, which a bare open() inside pickle.dump() never did.
with open(os.path.join(save_path, 'coords.dat'), 'wb+') as coords_file:
    pickle.dump((x, y, s, t), coords_file, protocol=pickle.HIGHEST_PROTOCOL)

# Hex colours backing the two discrete colour maps defined below.
_custom_colors = [
    '#eb2f06', '#eb2f06', '#fa983a', '#78e08f', '#78e08f', '#4a69bd'
]
_binary_colors = ['#e74c3c', '#2ecc71']

# Six-step map; not referenced by the scatter call in this cell.
custom_cmap = ListedColormap(_custom_colors, name='custom')

# Two-colour map, used for the categorical 'Group' series of the scatter plot.
bin_cmap = ListedColormap(_binary_colors, name='bin_cmap')

# Interactive Faerun canvas: dark background, axes hidden, front view.
f = Faerun(
    clear_color='#222222',
    coords=False,
    view='front',
    impress=
    'made with <a href="http://tmap.gdb.tools" target="_blank">tmap</a><br />'
    'and <a href="https://github.com/reymond-group/faerun-python" target='
    '"_blank">faerun</a>'
)

# Property columns in the same order as the continuous colour series below.
# NOTE(review): round() yields whole numbers, so a property that spans a
# narrow range (QED, presumably within [0, 1]) gets very coarse legend
# bounds - confirm this is intended.
legend_columns = ['QED', 'SCScore', 'ESOL', 'mol_weight']
max_legend_labels = [None] + [
    str(round(max(df[column]))) for column in legend_columns
]
min_legend_labels = [None] + [
    str(round(min(df[column]))) for column in legend_columns
]

f.add_scatter(
    'molecules',
    {
        'x': x,
        'y': y,
        # First series is the categorical group, the rest are rank-normalised.
        'c': [groups, qed_ranked, scscore_ranked, esol_ranked, molwt_ranked],
        'labels': labels_new
    },
    shader='sphere',
    colormap=[bin_cmap, 'viridis'],
    point_scale=20,
    max_point_size=100,
    categorical=[True, False, False, False, False],
    has_legend=True,
    legend_labels=labels_groups,
    selected_labels=['SMILES', 'Dashboard', 'Name'],
    series_title=['Group', 'QED', 'SCScore', 'ESOL', 'MolWt'],
    max_legend_label=max_legend_labels,
    min_legend_label=min_legend_labels,
    title_index=2,
    legend_title='',
)
f.add_tree(site, {'from': s, 'to': t}, point_helper='molecules')

# The HTML is written relative to the kernel's working directory. Create the
# target folder first so the plot does not fail in a fresh working directory.
plot_dir = os.path.join('tmap', site)
os.makedirs(plot_dir, exist_ok=True)
f.plot(file_name=site, path=plot_dir, template='smiles')


tmap/prostate/prostate.html
