In [None]:
import os
import natsort
import pandas as pd
import numpy as np
import mols2grid
from rdkit.Chem import MolFromSmiles
from rdkit.Chem import PandasTools
from rdkit.Chem import SDMolSupplier
from rdkit.Chem import AllChem
from rdkit.DataStructs import TanimotoSimilarity
from matplotlib import pyplot as plt

In [None]:
def get_cluster_dicts(df):
    """Group the row labels of ``df`` by cluster and find each cluster's center.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a 'Cluster' column and a 'Center' column. The index
        labels are used as the member names.

    Returns
    -------
    tuple(dict, dict)
        ``cluster_dict`` maps each cluster label to the list of its distinct
        member names, in order of first appearance in ``df``.
        ``cluster_centers`` maps a cluster label to the name of the row whose
        'Center' value equals 'Yes'; when several rows of one cluster are
        marked, the last one wins. Clusters with no marked row are absent.
    """
    cluster_dict = {}
    cluster_centers = {}
    # Per-cluster set of names already recorded, so duplicate index labels are
    # dropped in constant time instead of rescanning the member list per row.
    seen_by_cluster = {}

    for name, row in df.iterrows():
        cluster = row['Cluster']
        if cluster not in cluster_dict:
            cluster_dict[cluster] = []
            seen_by_cluster[cluster] = set()

        if name not in seen_by_cluster[cluster]:
            seen_by_cluster[cluster].add(name)
            cluster_dict[cluster].append(name)

        if row['Center'] == 'Yes':
            cluster_centers[cluster] = name

    return cluster_dict, cluster_centers

def get_clusters_top_hits(cluster_dict):
    """Return the first member of every cluster, without repeats.

    ``cluster_dict`` maps a cluster label to a non-empty list of member names.
    The result keeps the clusters' iteration order; a name that is the first
    member of more than one cluster is reported once.
    """
    # NOTE(review): "top hit" here simply means the first listed member --
    # presumably the input rows are already rank-ordered; confirm upstream.
    leaders = []
    for members in cluster_dict.values():
        first_member = members[0]
        if first_member in leaders:
            continue
        leaders.append(first_member)

    return leaders

def get_clusters_top5_hits(cluster_dict):
    """Return up to the first five members of every cluster, without repeats.

    Parameters
    ----------
    cluster_dict : dict
        Maps a cluster label to the list of its member names.

    Returns
    -------
    list
        Member names in cluster iteration order, then list order within each
        cluster. Clusters with fewer than five members contribute all of them;
        a name already collected from an earlier cluster is not added again.
    """
    clusters_top5hits = []
    for members in cluster_dict.values():
        # Slicing handles short (and empty) clusters without a separate branch.
        for zinc in members[:5]:
            if zinc not in clusters_top5hits:
                clusters_top5hits.append(zinc)
    return clusters_top5hits

def get_avg_similarity(df):
    """Print and return the mean similarity of cluster members to their center.

    Rows with a repeated index label are reduced to their first occurrence.
    For every cluster that has a center (as reported by ``get_cluster_dicts``),
    each member's radius-2 Morgan fingerprint is compared with the center's
    fingerprint. The Tanimoto similarity ``t`` is turned into a distance
    ``1 - t`` and scored as ``1 / (1 + distance)``. The center is itself a
    member of its cluster, so it is scored against itself as well.

    ``df`` must provide a 'SMILES' column plus the columns that
    ``get_cluster_dicts`` reads. Returns the ``np.mean`` of all scores.
    """
    unique_df = df[~df.index.duplicated(keep='first')]
    fpgen = AllChem.GetMorganGenerator(radius=2)
    cluster_dict, cluster_centers = get_cluster_dicts(unique_df)
    smiles = unique_df['SMILES']

    def fingerprint_of(name):
        # Fingerprint of the molecule parsed from this row's SMILES string.
        return fpgen.GetFingerprint(MolFromSmiles(smiles[name]))

    similarities = []
    for cluster, center_name in cluster_centers.items():
        ref_fp = fingerprint_of(center_name)
        for zinc in cluster_dict[cluster]:
            distance = 1 - TanimotoSimilarity(ref_fp, fingerprint_of(zinc))
            similarities.append(1 / (1 + distance))

    avg_similarity = np.mean(similarities)
    print(f"Average Similarity to Cluster Center = {avg_similarity}")

    return avg_similarity

In [None]:
# Accumulators read by the plotting and table cells further down.
numbers_of_clusters = []
avg_similarities = []

print("Beginning Calculation of Cluster Distances over Numbers of Clusters...")
print("----------------------------------------------------------------------")
print()
for fn in natsort.natsorted(os.listdir()):

    if not fn.endswith(".csv"):
        continue

    # The cluster count is taken from the second "_"-separated token of the
    # file name, which must read "<digits>clustered". Other CSVs in the
    # directory (e.g. OPTIMAL_CLUSTERS.csv) are skipped instead of being
    # scored with a meaningless count or failing on a missing token.
    fn_info = fn.split("_")
    clustered = fn_info[1] if len(fn_info) > 1 else ""
    # Explicit suffix removal: str.rstrip("clustered") strips a *set of
    # characters*, not the literal suffix.
    count_text = clustered[:-len("clustered")] if clustered.endswith("clustered") else ""
    if not count_text.isdecimal():
        print(f"Skipping {fn}: no '<N>clustered' token in its name")
        continue

    # Stored as int so the cluster counts plot on a numeric axis.
    num = int(count_text)
    numbers_of_clusters.append(num)

    print("-----------------------------")
    print(f"# of Clusters: {num}")
    print("-----------------------------")

    df = pd.read_csv(fn, index_col="Name")

    avg_similarity = get_avg_similarity(df)
    avg_similarities.append(avg_similarity)

print("Completion of Average Similarity Calculations!!!")

In [None]:
# Plot the mean member-to-center score for each cluster count.
fig, ax = plt.subplots()
ax.plot(numbers_of_clusters, avg_similarities, linewidth=3)
ax.set_title("Intra-Cluster Similarity with Cluster Count")
ax.set_xlabel("Cluster Count")
# The plotted values are the similarity scores returned by get_avg_similarity
# (1 / (1 + distance)), not distances, so the axis is labelled accordingly.
ax.set_ylabel("Average Similarity to Cluster Center");

In [None]:
# Tabulate the score obtained for every cluster count, indexed by the count.
# NOTE(review): the values come from avg_similarities, which holds similarity
# scores, while the column is labelled "Avg. Distance" -- confirm the intended
# label before relying on this table.
counts = pd.DataFrame(
    {
        "Cluster Count": numbers_of_clusters,
        "Avg. Distance": avg_similarities,
    }
).set_index("Cluster Count")

# Lift the row limit only for this print so the whole table is shown.
with pd.option_context('display.max_rows', None):
    print(counts)

In [None]:
# Load the clustering stored in OPTIMAL_CLUSTERS.csv (presumably the cluster
# count chosen from the scan above -- confirm) with the "Name" column as index.
opt_df = pd.read_csv("OPTIMAL_CLUSTERS.csv", index_col="Name")
# cluster_centers is returned alongside cluster_dict but is not used in this cell.
cluster_dict, cluster_centers = get_cluster_dicts(opt_df)
# One representative per cluster: the first member listed for each cluster.
clusters_tophits = get_clusters_top_hits(cluster_dict)
# Bare last expression: the notebook displays how many representatives were kept.
len(clusters_tophits)

In [None]:
# Read the screening results (hydrogens kept) and key the table by the 'ID' column.
in_sdf = PandasTools.LoadSDF("VS_results.sdf", removeHs=False).set_index('ID')
# Where an ID occurs more than once, keep only its first record.
in_sdf = in_sdf[~in_sdf.index.duplicated(keep='first')]

# Keep the rows whose ID is one of the selected cluster representatives and
# write them out, passing every column name as a property to export.
is_top_hit = in_sdf.index.isin(clusters_tophits)
slice_df = in_sdf.loc[is_top_hit]
PandasTools.WriteSDF(
    slice_df,
    "clustered_VS_results_bestranked.sdf",
    properties=slice_df.columns,
)
print("Best of Clustered Molecules written to SDF!!!")

In [None]:
# Show the SDF of cluster representatives written above in a mols2grid molecule grid.
mols2grid.display("clustered_VS_results_bestranked.sdf")