In [None]:
import json

import pandas as pd
from sklearn import metrics
from sklearn.cluster import KMeans

In [None]:
def clustering_compute_mutual_score(
    name_json_file: str, target: str, random_state: "int | None" = 0
):
    """Cluster fingerprints with k-means and score the clusters against known labels.

    The fingerprints are read from ``name_json_file`` (a JSON object keyed by
    the row identifier, turned into one DataFrame row per key), joined on
    their index with the ``target`` column of ``../data/iphos_multiclass.csv``
    (indexed by its ``m1`` column), and clustered with as many clusters as
    there are distinct values in ``target``.

    Args:
        name_json_file: Path to the JSON file holding the fingerprints.
        target: Name of the label column in the classes CSV to compare with.
        random_state: Seed forwarded to ``KMeans`` so repeated runs give the
            same score. Pass ``None`` for an unseeded, non-deterministic fit.

    Returns:
        The normalized mutual information between the ``target`` labels and
        the k-means cluster assignments.
    """
    smiles_to_remove = [
        "CCCCCCCCCCCCCC(OC(COC(CCCCCCCCCCCCC)=O)COCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOC)=O",
        "CC(C)CCC[C@@H](C)[C@H]1CC[C@@]2([H])[C@]3([H])CC=C4C[C@@H](O)CC[C@]4(C)[C@@]3([H])CC[C@]12C",
        "CCCCCCCCCCCCCCCCCCN(C)CCCCCCCCCCCCCCCCCC",
    ]

    # Context manager so the file handle is closed as soon as parsing is done.
    with open(name_json_file) as fp_file:
        fp_dict = json.load(fp_file)
    classes = pd.read_csv("../data/iphos_multiclass.csv").set_index("m1")

    df = pd.DataFrame.from_dict(fp_dict, orient="index")
    # errors="ignore" removes whichever excluded entries are present. A plain
    # drop() raises KeyError as soon as ONE label is missing, which previously
    # (behind a bare except) meant that nothing at all was removed.
    df = df.drop(index=smiles_to_remove, errors="ignore")

    df_classes = df.join(classes[[target]])

    # NOTE(review): rows without a match in the classes file get NaN in
    # `target`, and NaN is counted as its own value here — confirm every
    # fingerprint key is present in the CSV.
    n_clusters = len(df_classes[target].unique())

    model = KMeans(n_clusters=n_clusters, init="k-means++", random_state=random_state)

    # Fit on the fingerprint columns only (everything that came from the JSON),
    # leaving out the joined label column.
    feature_columns = list(df.columns)
    clusters = model.fit(df_classes[feature_columns])

    df_classes["cluster"] = clusters.labels_

    return metrics.normalized_mutual_info_score(df_classes[target], df_classes["cluster"])

In [None]:
# Print the score of every fingerprint file against both label columns,
# one value per line, file by file ("y1" first, then "family").
fingerprint_files = (
    "mol2fp_cfp.json",
    "mol2fp_expert.json",
    "mol2fp_grover.json",
    "mol2fp_grover_large.json",
    "mol2fp_gcn.json",
    "mol2fp_MegaMB_base_iphos.json",
    "mol2fp_MegaMB_finetuned_iphos.json",
)

for fingerprint_file in fingerprint_files:
    for label_column in ("y1", "family"):
        print(clustering_compute_mutual_score(fingerprint_file, label_column))