###### How to run this notebook (command-line)?

# CHEMCHARTS 0.0: chemcharts learning demo

In [None]:
# TODO: describe the aim of chemcharts

# TODO: describe what it can do

# TODO: describe how to install it

In [None]:
%load_ext autoreload
%autoreload 2

In [8]:
# load dependencies 

import rdkit
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import MACCSkeys
from rdkit.Chem import AllChem

import collections
from collections import defaultdict

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.tri as mtri
from matplotlib import cm
from matplotlib.ticker import MaxNLocator
from matplotlib import pyplot
from mpl_toolkits.mplot3d import Axes3D

import math
import statistics
import csv
import pandas as pd
import umap
import numpy as np
import pylab
import seaborn as sns
import plotly.figure_factory as ff
import plotly.graph_objects as go

from scipy.spatial import Delaunay

from sklearn.cluster import KMeans


import argparse
import sys
import dill

from chemcharts.core.container.chemdata import ChemData
from chemcharts.core.container.fingerprint import *

from chemcharts.core.functions.binning import Binning
from chemcharts.core.functions.dimensional_reduction import DimensionalReduction
from chemcharts.core.functions.clustering import Clustering
from chemcharts.core.functions.filtering import Filtering

from chemcharts.core.plots.hexag_plot import HexagonalPlot
from chemcharts.core.plots.histogram_plot import HistogramPlot
from chemcharts.core.plots.scatter_boxplot_plot import ScatterBoxplotPlot
from chemcharts.core.plots.scatter_interactive import ScatterInteractivePlot
from chemcharts.core.plots.scatter_static_plot import ScatterStaticPlot
from chemcharts.core.plots.trisurf_interactive_plot import TrisurfInteractivePlot
from chemcharts.core.plots.trisurf_static_plot import TrisurfStaticPlot

from chemcharts.core.functions.io_functions import load_smiles

from chemcharts.core.utils.enums import GeneratePlotsEnum
from chemcharts.core.utils.enums import DataFittingEnum
from chemcharts.core.utils.enums import PlottingEnum
_GPE = GeneratePlotsEnum
_DFE = DataFittingEnum
_PE = PlottingEnum


In [12]:
import chemcharts as cc
from chemcharts.core.container.chemdata import ChemData

# LOAD DATA AND GENERATE COLUMNS_DIC

In [13]:
# Load the scaffold-memory CSV and extract the SMILES strings and their scores.
# NOTE: a second, generic re-load that referenced `path`, `smiles_column`,
# `scores_column` and `epochs_column` was removed from this cell: none of those
# names is defined anywhere in this notebook (NameError on a fresh kernel), and
# the variables it produced are not used by any later cell.
loaded_data = pd.read_csv("../data/scaffold_memory.csv")
scaffold_smiles = list(loaded_data["SMILES"])
scaffold_scores = list(loaded_data["total_score"])


# GENERATE INSTANCES OF SMILES AND DATA_SET CLASSES

In [15]:
# Build a ChemData instance from the loaded SMILES and scores and bind it to
# the `scaffold_set` variable.
# NOTE(review): the recorded output of this cell is
# "AttributeError: module 'chemcharts.core' has no attribute 'smiles'", i.e. the
# `cc.core.smiles.Smiles` path used below does not resolve — TODO confirm the
# correct location of the `Smiles` class and update the call.
scaffold_set = ChemData(name = "Scaffold", smiles_obj = cc.core.smiles.Smiles(scaffold_smiles), total_score = scaffold_scores)

# new:
#scaffold_set = cc.core.dataset.Data_Set()



AttributeError: module 'chemcharts.core' has no attribute 'smiles'

In [None]:
#print(scaffold_set.smiles_obj.smiles_list)
#print(type(scaffold_set.total_score[0]))

# ADD FINGERPRINTS DEFINITION AND CALL

In [None]:
# define function to add fingerprints to data_set
def add_fingerprints(data_set_obj):
    """Generate three fingerprint sets for ``data_set_obj`` and attach them.

    A ``FingerprintGenerator`` is built from ``data_set_obj.smiles_obj.smiles_list``;
    the results of its ``generate_fingerprints``, ``generate_fingerprints_morgan``
    and ``generate_fingerprints_maccs`` methods are passed, in that order, to
    ``data_set_obj.add_fingerprint``.

    :param data_set_obj: object exposing ``smiles_obj.smiles_list`` and
        ``add_fingerprint``.
    """
    generator = cc.core.fingerprint.FingerprintGenerator(data_set_obj.smiles_obj.smiles_list)
    generator_methods = (
        "generate_fingerprints",
        "generate_fingerprints_morgan",
        "generate_fingerprints_maccs",
    )
    for method_name in generator_methods:
        # Look the method up only when it is about to be called, so each
        # fingerprint set is generated and attached before the next one starts.
        data_set_obj.add_fingerprint(getattr(generator, method_name)())
    
    
# call function to add fingerprints to instances of data_set class
add_fingerprints(scaffold_set)

In [None]:
# print length of fingerprint_lists

#print(len(scaffold_set.fingerprint_lists))


# DIMENSIONAL REDUCTION DEFINITION

In [None]:
# generating array list of fingerprint_lists
def generating_array_list(fingerprint_obj):
    """Convert every item of ``fingerprint_obj`` into a NumPy array.

    :param fingerprint_obj: iterable of fingerprints, each accepted by ``np.array``.
    :return: list with one ``np.ndarray`` per fingerprint, in input order.
    """
    return [np.array(single_fingerprint) for single_fingerprint in fingerprint_obj]


def dimensional_reduction(fingerprint_obj):
    """Project a collection of fingerprints with UMAP and wrap the result.

    NumPy's global random generator is seeded with 42 before the reducer is
    created and fitted.

    :param fingerprint_obj: iterable of fingerprints; each one is converted to a
        NumPy array via ``generating_array_list`` before fitting.
    :return: ``cc.core.embedding.Embedding`` built from the output of
        ``umap.UMAP().fit_transform``.
    """
    np.random.seed(42)

    vectors = generating_array_list(fingerprint_obj)
    mapper = umap.UMAP()

    return cc.core.embedding.Embedding(mapper.fit_transform(vectors))


def calculate_embedding(data_set_obj):
    """Compute one embedding per fingerprint collection and attach it.

    For every entry of ``data_set_obj.fingerprint_lists`` the embedding returned
    by ``dimensional_reduction`` is passed to ``data_set_obj.add_embedding_list``.

    :param data_set_obj: object exposing ``fingerprint_lists`` and
        ``add_embedding_list``.
    """
    for fingerprint_obj in data_set_obj.fingerprint_lists:
        # dimensional_reduction already converts the fingerprints to NumPy
        # arrays itself, so no separate conversion pass is needed here.
        data_set_obj.add_embedding_list(dimensional_reduction(fingerprint_obj))



In [None]:
# call function to add embedding to instances of data_set class
calculate_embedding(scaffold_set)



# PLOT DEFINITION

In [None]:
class BasePlot:
    """Shared state and helpers for the plot classes defined in this notebook.

    Stores the scores (converted to ``float``) and the embedding list of a data
    set, and offers range filtering and k-means clustering as class methods.
    """

    def __init__(self, scaffold_set):
        """Capture scores and embeddings from ``scaffold_set``.

        :param scaffold_set: object exposing ``total_score`` (iterable of values
            convertible to ``float``) and ``embedding_lists``.
        """
        self.scores = [float(score) for score in scaffold_set.total_score]
        self.embedding_list = scaffold_set.embedding_lists

    @staticmethod
    def _filter_points(np_array, range_x, range_y, scores):
        """Keep the rows whose first two columns lie inside the given ranges.

        :param np_array: 2-D array; column 0 and column 1 are the coordinates.
        :param range_x: ``(low, high)`` bounds for column 0 (both inclusive).
        :param range_y: ``(low, high)`` bounds for column 1 (both inclusive).
        :param scores: one score per row of ``np_array``.
        :return: ``(coordinates, filtered_scores, filtered_df)`` where
            ``coordinates`` holds only the two coordinate columns of the kept
            rows, ``filtered_scores`` is a list of their scores and
            ``filtered_df`` is the filtered frame with the columns
            ``UMAP_1``, ``UMAP_2`` and ``Scores``.
        """
        frame = pd.DataFrame({"UMAP_1": np_array[:, 0], "UMAP_2": np_array[:, 1], "Scores": scores})
        inside = (
            frame["UMAP_1"].between(range_x[0], range_x[1])
            & frame["UMAP_2"].between(range_y[0], range_y[1])
        )
        filtered_df = frame[inside]

        # Only the coordinate columns belong in the embedding array; the scores
        # are returned separately and must not become a third "coordinate".
        coordinates = filtered_df[["UMAP_1", "UMAP_2"]].to_numpy()
        return coordinates, list(filtered_df["Scores"]), filtered_df

    @classmethod
    def filter_range(cls, embedding, range_x, range_y, scores):
        """Restrict an embedding (and its scores) to a rectangular window.

        :param embedding: object whose ``np_array`` attribute is a 2-D array with
            the coordinates in its first two columns.
        :param range_x: ``(low, high)`` bounds on the first coordinate (inclusive).
        :param range_y: ``(low, high)`` bounds on the second coordinate (inclusive).
        :param scores: one score per embedding row.
        :return: ``(filtered_embedding_data, filtered_array, filtered_scores,
            filtered_df)``; ``filtered_array`` contains the two coordinate
            columns of the rows inside the window and
            ``filtered_embedding_data`` wraps that array.
        """
        filtered_array, filtered_scores, filtered_df = cls._filter_points(
            embedding.np_array, range_x, range_y, scores
        )
        filtered_embedding_data = cc.core.embedding.Embedding(filtered_array)

        return filtered_embedding_data, filtered_array, filtered_scores, filtered_df

    @staticmethod
    def _median_score_per_cluster(labels, n_clusters, scores):
        """Return the median score of each cluster, ordered by cluster index.

        :param labels: cluster index of every point.
        :param n_clusters: number of clusters.
        :param scores: score of every point, indexed like ``labels``.
        :raises statistics.StatisticsError: if a cluster has no members.
        """
        grouped = [[] for _ in range(n_clusters)]
        # Single pass over the points instead of one pass per cluster.
        for point_idx, label in enumerate(labels):
            grouped[label].append(scores[point_idx])
        return [statistics.median(group) for group in grouped]

    @classmethod
    def clustering(cls, embedding, scores, k):
        """Cluster points with k-means and summarise each cluster.

        :param embedding: array-like of points handed to ``KMeans.fit``.
        :param scores: one score per point.
        :param k: number of clusters; must not exceed the number of points.
        :return: ``(x_cl, y_cl, z_cl)`` — first and second coordinate of every
            cluster centre and the median score of the cluster's members.
        :raises AssertionError: if there are fewer points than clusters.
        """
        assert len(embedding) >= k, "k-means needs at least as many points as clusters"

        kmeans = KMeans(n_clusters=k, random_state=0).fit(embedding)
        centers = kmeans.cluster_centers_

        z_cl = cls._median_score_per_cluster(kmeans.labels_, len(centers), scores)

        return centers[:, 0], centers[:, 1], z_cl

    
    
class HexagonalPlot(BasePlot):
    """Hexbin joint plot of the first two embedding coordinates.

    NOTE(review): a class of the same name is imported from
    ``chemcharts.core.plots.hexag_plot`` in the dependency cell; this
    notebook-local definition replaces that name once this cell has run.
    """

    def __init__(self, scaffold_set):
        """Store scores and embeddings of ``scaffold_set`` (see ``BasePlot``)."""
        super().__init__(scaffold_set)

    def plot(self):
        """Draw one seaborn hexbin joint plot per stored embedding.

        All points are plotted as they are, without range filtering or
        clustering; ``BasePlot.filter_range`` and ``BasePlot.clustering`` are
        available if the points should be restricted or aggregated first.
        """
        for embedding in self.embedding_list:
            points = embedding.np_array
            sns.jointplot(x=points[:, 0], y=points[:, 1], kind="hex", color="#4CB391")
       
    
class ScatterBoxplotPlot(BasePlot):
    """Scatter plot of k-means cluster centres with marginal box plots.

    NOTE(review): a class of the same name is imported from
    ``chemcharts.core.plots.scatter_boxplot_plot`` in the dependency cell; this
    notebook-local definition replaces that name once this cell has run.
    """

    def __init__(self, scaffold_set):
        """Store scores and embeddings of ``scaffold_set`` (see ``BasePlot``)."""
        super().__init__(scaffold_set)

    def plot(self):
        """Cluster every embedding into 100 centres, plot and save them.

        Each embedding is drawn as a seaborn ``JointGrid`` (scatter in the
        centre, box plots on the margins) and written to
        ``Scatter_Boxplot_ChemChar_Plot.png``.
        """
        n_clusters = 100
        sns.set_context("talk", font_scale=0.8)

        for embedding in self.embedding_list:
            x_cl, y_cl, z_cl = self.clustering(embedding.np_array, self.scores, n_clusters)
            scatter_df = pd.DataFrame({"UMAP_1": x_cl, "UMAP_2": y_cl, "z": z_cl})

            # JointGrid creates its own figure, so the size has to be given to
            # it directly; a separate plt.figure(figsize=...) call would only
            # leave an empty extra figure behind.
            grid = sns.JointGrid(data=scatter_df,
                                 x="UMAP_1",
                                 y="UMAP_2",
                                 height=17)

            grid.plot_joint(sns.scatterplot)
            grid.plot_marginals(sns.boxplot)

            # NOTE(review): every embedding is saved under the same file name,
            # so only the last one remains on disk — confirm whether one file
            # per embedding is wanted.
            plt.savefig("Scatter_Boxplot_ChemChar_Plot.png",
                        format='png', dpi=150)
    
    
    
       
class TrisurfPlot(BasePlot):
    """Interactive triangulated surface of median scores over cluster centres."""

    def __init__(self, scaffold_set):
        """Store scores and embeddings of ``scaffold_set`` (see ``BasePlot``)."""
        super().__init__(scaffold_set)

    def plot(self):
        """Show one plotly trisurf figure per stored embedding.

        For every embedding the points are restricted to x in [-10, 10] and
        y in [-10, 6], clustered into 20 k-means centres, triangulated with
        Delaunay and rendered with the median cluster score as height.
        """
        n_clusters = 20

        for embedding in self.embedding_list:
            _, filtered_array, filtered_scores, _ = self.filter_range(
                embedding, (-10, 10), (-10, 6), self.scores
            )

            # Cluster on the two coordinate columns only, so that any further
            # column in the filtered array (e.g. the scores) cannot influence
            # the cluster assignment.
            x_cl, y_cl, z_cl = self.clustering(filtered_array[:, :2], filtered_scores, n_clusters)

            simplices = Delaunay(np.column_stack((x_cl, y_cl))).simplices

            fig = ff.create_trisurf(x=x_cl, y=y_cl, z=z_cl,
                                    colormap="Portland",
                                    simplices=simplices,
                                    title="Trisurf ChemCharts Plot")

            # Show inside the loop: every embedding gets displayed, and an
            # empty embedding list no longer hits an undefined `fig`.
            fig.show()
    
    
class ScatterPlot(BasePlot):
    """Static 3-D scatter plot of embedding coordinates against scores."""

    def __init__(self, scaffold_set):
        """Store scores and embeddings of ``scaffold_set`` (see ``BasePlot``)."""
        super().__init__(scaffold_set)

    def plot(self):
        """Draw all embeddings into one 3-D scatter plot, save it and show it.

        The first two embedding columns are used as x and y, the scores as z.
        The figure is written to ``Scatter_ChemChar_Plot.png``.
        """
        fig = plt.figure(figsize=(15, 15))
        ax = fig.add_subplot(projection='3d')

        for embedding in self.embedding_list:
            points = embedding.np_array
            ax.scatter(points[:, 0], points[:, 1], zs=self.scores, s=1)

        # Titles and labels do not depend on the embedding; set them once.
        ax.set_title("Scatter ChemCharts Plot")
        ax.set_xlabel('UMAP 1')
        ax.set_ylabel('UMAP 2')
        ax.set_zlabel('Scores')

        # Save once, after every embedding has been drawn.
        fig.savefig("Scatter_ChemChar_Plot.png")

        # The bare attribute access `fig.show` never displayed anything;
        # plt.show() renders the figure in scripts and notebooks alike.
        plt.show()
   

In [6]:
# Instantiate one plot object per plot type from the same data set.
# NOTE(review): `HexagonalPlot` and `ScatterBoxplotPlot` are also imported from
# `chemcharts.core.plots` in the dependency cell; the notebook-local classes of
# the same name replace those imports once their defining cell has run, so which
# implementation is used here depends on which cells were executed.
hexagonal_test_plot = HexagonalPlot(scaffold_set)
scatter_2d_boxplot_test_plot = ScatterBoxplotPlot(scaffold_set)

trisurf_test_plot = TrisurfPlot(scaffold_set)
scatter_3d_test_plot = ScatterPlot(scaffold_set)


NameError: name 'HexagonalPlot' is not defined

# PLOT GENERATION  

In [4]:
hexagonal_test_plot.plot()

NameError: name 'hexagonal_test_plot' is not defined

In [5]:
scatter_2d_boxplot_test_plot.plot()

NameError: name 'scatter_2d_boxplot_test_plot' is not defined

In [None]:
trisurf_test_plot.plot()

In [None]:
scatter_3d_test_plot.plot()