In [None]:
from pathlib import Path
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import matplotlib.patches as mpatches
from rdkit import Chem
from rdkit.Chem import Descriptors, Draw, PandasTools, AllChem
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect

import os
import seaborn as sns
sns.set(style ='darkgrid')


#from tqdm.auto import tqdm
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem.FilterCatalog import FilterCatalog, FilterCatalogParams


from rdkit import Chem, DataStructs
from rdkit.Chem import (
    PandasTools,
    Draw,
    Descriptors,
    MACCSkeys,
    rdFingerprintGenerator,)
from sklearn.metrics import precision_recall_curve, roc_curve, auc
from sklearn.metrics import average_precision_score

class Similarity:
    """Validate fingerprint similarity searching against labelled data.

    Every query (one active compound, or a consensus "ensemble" fingerprint
    built from all actives) is compared to each molecule in ``data`` by
    Tanimoto similarity of Morgan bit fingerprints (radius 2, 2048 bits).
    The similarity is then used as the score of a precision-recall or ROC
    curve drawn against the labels in ``data[active]``.

    Input:
    - active_data: dataframe
        dataframe with active compounds for similarity searching
    - data: dataframe
        dataframe for validation
    - mol_col: string
        name of mol column
    - active: string
        name of the label column in ``data``; it is passed as ``y_true`` to
        the scikit-learn curve functions, so it is expected to be binary
    - ID: string
        name of the column in ``active_data`` used to label each query in
        the legend; if that column is absent the literal column ``'ID'``
        is used instead

    Attributes set by the methods:
    - fingerprints: list of bit vectors for ``data`` (set by ``mol2fp``)
    - df_ensemble: dataframe of consensus fingerprints (set by ``ensemble_fp``)
    """

    # Morgan fingerprint settings shared by every method of the class.
    _FP_RADIUS = 2
    _FP_NBITS = 2048

    def __init__(self, active_data, data, mol_col, active, ID):
        self.active_data = active_data
        self.data = data
        self.mol_col = mol_col
        self.active = active
        self.ID = ID

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    def _morgan_fp(self, mol):
        """Return the Morgan bit-vector fingerprint of ``mol``."""
        return AllChem.GetMorganFingerprintAsBitVect(
            mol, radius=self._FP_RADIUS, nBits=self._FP_NBITS
        )

    @staticmethod
    def _column_or_default(frame, name, default):
        """Return ``name`` if it is a column of ``frame``, else ``default``."""
        try:
            present = name in frame.columns
        except TypeError:  # unhashable value: cannot be a column label
            present = False
        return name if present else default

    def _active_label(self, position):
        """Return the legend label of the active compound at ``position``."""
        # 'ID' is the column name this class historically hard-coded; keep it
        # as the fallback so existing callers are unaffected.
        id_col = self._column_or_default(self.active_data, self.ID, 'ID')
        return self.active_data.iloc[position][id_col]

    def _curve_plotter(self, curve):
        """Map ``curve`` ('ap' or 'roc') to the matching plotting method.

        Raises:
            ValueError: if ``curve`` is not one of the supported names.
        """
        plotters = {'ap': self.plot_ap, 'roc': self.plot_roc}
        try:
            return plotters[curve]
        except KeyError:
            raise ValueError(
                f"curve must be one of {sorted(plotters)}, got {curve!r}"
            ) from None

    @staticmethod
    def _consensus_bitstring(counts, threshold):
        """Return a '0'/'1' string with '1' where ``counts > threshold``."""
        mask = np.asarray(counts) > threshold
        return "".join(np.where(mask, "1", "0").tolist())

    # ------------------------------------------------------------------
    # fingerprints
    # ------------------------------------------------------------------
    def mol2ecfp(self, mol):
        """Return the Morgan fingerprint of ``mol`` as an int8 numpy array."""
        fp = self._morgan_fp(mol)
        # ConvertToNumpyArray resizes the array to the fingerprint length.
        ar = np.zeros((1,), dtype=np.int8)
        DataStructs.ConvertToNumpyArray(fp, ar)
        return ar

    def mol2fp(self):
        """Compute fingerprints for ``data[mol_col]`` into ``self.fingerprints``."""
        self.fingerprints = [self._morgan_fp(mol) for mol in self.data[self.mol_col]]

    # ------------------------------------------------------------------
    # plotting
    # ------------------------------------------------------------------
    def plot_roc(self, y_test, y_score, model):
        """Add a ROC curve, labelled with its AUC, to the current figure.

        Parameters:
        y_test - true labels, passed as ``y_true`` to ``roc_curve``.
        y_score - scores to rank by (here: Tanimoto similarities).
        model - name shown in the legend entry.
        """
        fpr, tpr, _ = roc_curve(y_test, y_score)
        roc_auc = round(auc(fpr, tpr), 3)
        lw = 2
        # Plain f-string: mixing f-string and %-formatting broke on names
        # containing '%'.
        plt.plot(fpr, tpr, lw=lw, label=f'{model} (AUC = {roc_auc:.3f})')
        # Diagonal = expected curve of a random ranking.
        plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate', fontsize=18)
        plt.ylabel('True Positive Rate', fontsize=18)
        plt.title('Receiver operating characteristic', fontsize=36, weight='semibold')
        plt.legend(loc="lower right")

    def plot_ap(self, y_test, y_score, model):
        """Add a precision-recall curve, labelled with its AP, to the current figure.

        Parameters:
        y_test - true labels, passed as ``y_true`` to ``precision_recall_curve``.
        y_score - scores to rank by (here: Tanimoto similarities).
        model - name shown in the legend entry.
        """
        precision, recall, _ = precision_recall_curve(y_test, y_score)
        ap = average_precision_score(y_test, y_score)
        lw = 2
        plt.plot(recall, precision, lw=lw, label=f'{model} (AP = {ap:.3f})')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        # Recall is on the x axis and precision on the y axis (see plt.plot).
        plt.xlabel('Recall', fontsize=18)
        plt.ylabel('Precision', fontsize=18)
        plt.title('Precision Recall curve', fontsize=36, weight='semibold')
        plt.legend(loc="lower right")

    # ------------------------------------------------------------------
    # validation with single actives
    # ------------------------------------------------------------------
    def similarity_validate(self, curve='ap'):
        """Plot one curve per active compound used as the similarity query.

        Side effects: sets ``self.fingerprints``, opens a new matplotlib
        figure, and overwrites ``data['tanimoto_morgan']`` on every query
        (it ends up holding the similarities to the last active).

        Parameters:
        curve - 'ap' (precision-recall, default) or 'roc'.

        Raises:
            ValueError: if ``curve`` is not 'ap' or 'roc'.
        """
        plot = self._curve_plotter(curve)
        self.mol2fp()
        plt.figure(figsize=(12, 8))
        for position, mol in enumerate(self.active_data[self.mol_col]):
            query_fp = self._morgan_fp(mol)
            self.data["tanimoto_morgan"] = DataStructs.BulkTanimotoSimilarity(
                query_fp, self.fingerprints
            )
            plot(
                self.data[self.active],
                self.data["tanimoto_morgan"],
                model=self._active_label(position),
            )

    # ------------------------------------------------------------------
    # validation with consensus (ensemble) fingerprints
    # ------------------------------------------------------------------
    def ensemble_fp(self):
        """Build consensus fingerprints of the actives into ``self.df_ensemble``.

        For each threshold ``i`` in ``0 .. n_actives - 1`` the fingerprint
        ``Ensemble_i`` has a bit set when MORE than ``i`` actives have that
        bit set. ``Ensemble_0`` is therefore the union of all actives and the
        last one is the intersection.

        Side effects: adds an ``'fp'`` column (numpy arrays) to
        ``active_data``. ``df_ensemble`` is indexed by ``Ensemble_i`` and has
        the columns ``'Ensemble method'`` (bit vector) and ``'ID'`` (name).

        Raises:
            ValueError: if ``active_data`` has no rows.
        """
        if len(self.active_data) == 0:
            raise ValueError("active_data is empty: no fingerprints to combine")

        # 'ROMol' is the column name that used to be hard-coded here; it stays
        # as the fallback when mol_col is not a column of active_data.
        mol_col = self._column_or_default(self.active_data, self.mol_col, 'ROMol')
        self.active_data['fp'] = self.active_data[mol_col].apply(self.mol2ecfp)
        X = np.stack(self.active_data['fp'])

        # Number of actives having each bit set; identical for every threshold.
        counts = X.sum(axis=0)

        rows = []
        for threshold in range(X.shape[0]):
            # Bits at or below the threshold must become 0. Leaving the raw
            # count in place put digits other than 0/1 into the bit string.
            bit_string = self._consensus_bitstring(counts, threshold)

            # Convert the 0/1 string into an ExplicitBitVect, which can be
            # passed straight to the Tanimoto functions.
            bit_vect = DataStructs.cDataStructs.CreateFromBitString(bit_string)
            row = pd.DataFrame([bit_vect])
            row.index = [f'Ensemble_{threshold}']
            row.columns = ['Ensemble method']
            rows.append(row)

        self.df_ensemble = pd.concat(rows, axis=0)
        self.df_ensemble['ID'] = self.df_ensemble.index

    def similarity_ensemble(self, curve='ap'):
        """Plot one curve per consensus fingerprint used as the similarity query.

        Calls ``ensemble_fp`` first when ``self.df_ensemble`` does not exist
        yet. Side effects match ``similarity_validate``: sets
        ``self.fingerprints``, opens a new figure and overwrites
        ``data['tanimoto_morgan']`` on every query.

        Parameters:
        curve - 'ap' (precision-recall, default) or 'roc'.

        Raises:
            ValueError: if ``curve`` is not 'ap' or 'roc'.
        """
        plot = self._curve_plotter(curve)
        if not hasattr(self, 'df_ensemble'):
            self.ensemble_fp()
        self.mol2fp()
        plt.figure(figsize=(12, 8))
        queries = zip(self.df_ensemble['ID'], self.df_ensemble['Ensemble method'])
        for label, query_fp in queries:
            self.data["tanimoto_morgan"] = DataStructs.BulkTanimotoSimilarity(
                query_fp, self.fingerprints
            )
            plot(self.data[self.active], self.data["tanimoto_morgan"], model=label)