## Load Embedded Data


In [14]:
import pandas
import pickle

MODEL ="dunzhang/stella_en_400M_v5" #dunzhang/stella_en_1.5B_v5
DIFFICULTY = "hard"
SAMPLE = 500

save_dir = f"embeddings/{MODEL}/{DIFFICULTY}/{SAMPLE}"

hotpot_qa_df = pandas.read_pickle(f'{save_dir}/df.pkl')

with open(f'{save_dir}/contexts.pkl', 'rb') as f:contexts = pickle.load(f)
with open(f'{save_dir}/similarity_matrix.pkl', 'rb') as f:similarity_matrix = pickle.load(f)

similarity_matrix = similarity_matrix.numpy()


# Agglomerative Clustering

In [15]:
from clustering.algorithms import Agglomerative
from clustering.analysis import ClusterAnalysis
from clustering.visualization.counts import plot_cluster_counts, plot_avg_cluster_variance,plot_graphs,plot_mean_avg_cluster_similarity

clusters  = Agglomerative(similarity_matrix,contexts,0.50)

cluster_counts_plot = plot_cluster_counts(clusters,
                                          "Agglomerative Clustering",
                                          "Cluster No.",
                                          "No. of Items in Cluster")

cluster_analysis = ClusterAnalysis(clusters,similarity_matrix)

avg_cluster_variance = cluster_analysis.get_avg_variance_for_each_cluster()
mean_avg_cluster_sim = cluster_analysis.get_mean_avg_cluster_similarity_for_each_cluster()

avg_cluster_variance_plots= plot_avg_cluster_variance(avg_variance_dict = avg_cluster_variance,
                                                      graph_name        = "Average Variance in Each Cluster",
                                                      x_label           = "Cluster No.",
                                                      y_label           = "Avg variance in Cluster")
mean_avg_cluster_sim_plots= plot_mean_avg_cluster_similarity(mean_avg_cluster_sim_dict = mean_avg_cluster_sim,
                                                             graph_name                = "Mean Average Similarity in Each Cluster",
                                                             x_label                   = "Cluster No.",
                                                             y_label                   = "Mean Avg similarity in Cluster")

plot_graphs([cluster_counts_plot,avg_cluster_variance_plots,mean_avg_cluster_sim_plots])

In [16]:
import statistics

def get_avg_cluster_variance(cluster):
    variances = []
    for i, node in enumerate(cluster):
        similarities = []
        print(f"Cosine similarity between node {node} and nodes: ")
        for j, other_node in enumerate(cluster):
            if i != j: 
                cosine_sim = similarity_matrix[i][j]
                similarities.append(cosine_sim)
                print(f"                                             {other_node} = {cosine_sim}")
        variance = statistics.variance(similarities)
        variances.append(variance)
        print(f"                             average similarity = {statistics.mean(similarities)}")
        print(f"                                       variance = {variance}")
        print()

    return statistics.mean(variances)

def get_avg_cluster_similarity(cluster:list[int]):
    avg_similarities = []
    for i, node in enumerate(cluster):
        similarities = []
        for j, other_node in enumerate(cluster):
            if i != j: 
                cosine_sim = similarity_matrix[i][j]
                similarities.append(cosine_sim)
        avg_similarities.append(statistics.mean(similarities))
    
    return statistics.mean(avg_similarities)

num = 20
cluster = clusters[num]
print(cluster,"\n")
x = get_avg_cluster_variance(cluster)
print(f"average cluster variance        = {x}")
print(f"mean average cluster similarity = {get_avg_cluster_similarity(cluster)}")

[4805, 4806, 4810, 4813] 

Cosine similarity between node 4805 and nodes: 
                                             4806 = 0.6498722434043884
                                             4810 = 0.7966527342796326
                                             4813 = 0.5945354700088501
                             average similarity = 0.6803534626960754
                                       variance = 0.010909676551818848

Cosine similarity between node 4806 and nodes: 
                                             4805 = 0.6498722434043884
                                             4810 = 0.5755731463432312
                                             4813 = 0.5900585055351257
                             average similarity = 0.6051679849624634
                                       variance = 0.0015513107646256685

Cosine similarity between node 4810 and nodes: 
                                             4805 = 0.7966527342796326
                                             4806

In [19]:
from typing import Dict
from clustering.graph.create_from_clusters import ClusterGraph
import networkx as nx

cluster_graph = ClusterGraph(clusters=clusters,similarity_matrix=similarity_matrix)

individual_cluster_graphs:Dict[int, nx.Graph] = cluster_graph.get_individual_cluster_graphs()
connected_graph:nx.Graph = cluster_graph.get_connected_graph(threshold = 0.50)

In [20]:
import plotly
from clustering.graph.visualization.graph_plots import get_individual_cluster_graph_plots, get_connected_graph_plot

graph_plots:Dict[int,plotly.graph_objs.Figure] = get_individual_cluster_graph_plots(individual_cluster_graphs,contexts)
cluster_plot:plotly.graph_objs.Figure = get_connected_graph_plot(connected_graph,clusters)

In [21]:
import pickle

with open(f'clustering/visualization/graph_plots.pkl', 'wb') as file:
    pickle.dump(graph_plots, file)

with open(f'clustering/visualization/cluster_plot.pkl', 'wb') as file:
    pickle.dump(cluster_plot, file)

with open(f'clustering/graph/connected_graph.pkl', 'wb') as file:
    pickle.dump(connected_graph, file)
    
with open(f'clustering/graph/cluster_graph.pkl', 'wb') as file:
    pickle.dump(individual_cluster_graphs, file)
# python cluster_plots.py --cluster_plot_dir="clusters/cluster_plot" --graph_plots_dir="clusters/graph_plots"

In [None]:
GROQ_API_KEY = "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"