<a href="https://colab.research.google.com/github/SZAftabi/UseRQE/blob/main/Step1_TagClustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<center> <font size='6'> 💟 <b> UseRQE </b> 💟 </font> </center>
<br> <center>Recognizing Question Entailment with User Background-knowledge Modeling
<br> <font color='red' size='4'> <b> Step (1) </b> Hierarchical tag clustering </font> </center>

# 🌞 **mount the drive**

In [None]:
# Mount Google Drive inside the Colab VM at /content/drive (force_remount=True re-mounts
# even if a mount already exists) so the pickles read/written below are reachable.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
# Root folder on the mounted Drive; most input/output paths in this notebook are built from it.
Drive_path = "/content/drive/MyDrive/"

# 🌞 **1. requirements**

In [None]:
!git clone -q https://github.com/rapidsai/rapidsai-csp-utils.git                   # Fast t-SNE
!python rapidsai-csp-utils/colab/pip-install.py
!pip install -q gensim
!pip install -q node2vec
!pip install -q networkx
!pip install -q scikit-learn-extra
!pip install -q transformers

In [None]:
import os
import time
import cudf
import cuml
import copy
import random
import pickle
import gensim
import numpy as np
import pandas as pd
import networkx as nx
import plotly.io as pio
import plotly.express as px
import matplotlib.pyplot as plt
import gensim.downloader as api
import plotly.graph_objects as go
import matplotlib.pyplot as plt


cuml.__version__
from textblob import Word
from node2vec import Node2Vec
from cuml.manifold import TSNE
from collections import Counter
from collections import OrderedDict
from scipy.spatial.distance import cdist
from sklearn_extra.cluster import KMedoids
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from itertools import repeat
from sentence_transformers import SentenceTransformer


from scipy.cluster.hierarchy import (
    dendrogram,
    linkage,
    fcluster
    )
from sklearn.metrics import (
    silhouette_score,
    davies_bouldin_score,
    calinski_harabasz_score
    )
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    precision_recall_fscore_support
    )
from transformers import (
    BertModel,
    AutoModel,
    BertTokenizer,
    AutoTokenizer
    )

import torch
torch.cuda.is_available()

import nltk
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import wordnet

In [None]:
!pip install -q git+https://github.com/VenkateshwaranB/stellargraph.git
import stellargraph as sg
from stellargraph.data import EdgeSplitter
from stellargraph.mapper import GraphSAGELinkGenerator
from stellargraph.mapper import GraphSAGENodeGenerator
from stellargraph.layer import GraphSAGE, link_classification
from stellargraph.data import UniformRandomWalk
from stellargraph.data import BiasedRandomWalk
from stellargraph.mapper import Node2VecLinkGenerator, Node2VecNodeGenerator
from stellargraph.layer import Node2Vec, link_classification
from stellargraph import globalvar
from stellargraph import StellarGraph

import tensorflow as tf
from tensorflow import keras

# 🌞 **2. load the data**

In [None]:
# Read the tagged posts and turn each comma-separated tag string into a list of stripped tags.
data_path = Drive_path + "TG_Data_2048_All.pkl"
MyData = pd.read_pickle(data_path)
tags_series = MyData['tags']
tags_lists = tags_series.str.split(',').apply(
    lambda raw_tags: [raw_tag.strip() for raw_tag in raw_tags]
)
all_posts = tags_lists.tolist()                                                 # one list of tags per post
print("Number of posts : ", len(all_posts))

**Select a subset of dataset**

To work on a subset of the dataset, set `num_samples` to an integer (only the first `num_samples` posts are kept); otherwise leave it as `'all'`.


In [None]:
num_samples = 'all'                                                             # 'all' keeps every post; an integer keeps the first `num_samples`
# num_samples = 1000
if num_samples == 'all':
    posts = all_posts.copy()                                                    # shallow copy: the outer list is new, the inner tag lists are shared
else:
    posts = all_posts[:num_samples]

**extract unique tags from the posts**

In [None]:
# De-duplicate while keeping first-appearance order (dict keys preserve insertion order).
tags = list(dict.fromkeys(tag for post in posts for tag in post))
print("Number of unique tags: ", len(tags))

**statistics about dataset**

In [None]:
# Step 1: count how often each tag occurs across all posts
tag_frequency = Counter()
for post in posts:
    tag_frequency.update(post)
tag_frequency_dict = dict(tag_frequency)                                        # Step 2: plain-dict view of the same counts (used later to pick cluster names)

tag_counts = list(map(len, posts))                                              # Step 3: number of tags carried by each post
posts_per_tag_count = Counter(tag_counts)                                       # Step 4: {number of tags: number of posts with that many tags}
num_tags = list(posts_per_tag_count)
num_posts = list(posts_per_tag_count.values())

# 🌞 **3. co-occurrence matrix**

In [None]:
# Independent snapshots of the post list and the unique-tag list. The co-occurrence matrix
# below is indexed by position in `tags_0`.
posts_0 = copy.deepcopy(posts)
tags_0 = copy.deepcopy(tags)

In [None]:
# Symmetric tag co-occurrence matrix: entry [i, j] is the number of posts in which
# tags_0[i] and tags_0[j] appear together; the diagonal stays 0.
#
# NOTE: the previous implementation visited every *ordered* pair (a, b) and (b, a) and
# incremented both [a, b] and [b, a] each time, so every count was exactly twice the true
# value. Matrices pickled by that version are 2x the ones produced here. A uniform factor
# leaves the edge set unchanged, and node2vec normalises edge weights into transition
# probabilities, so the embeddings are not expected to depend on it.
from itertools import combinations

tag_to_index = {tag: idx for idx, tag in enumerate(tags_0)}                     # O(1) lookup instead of list.index (tags_0 holds unique tags)
co_occurrence_matrix = np.zeros((len(tags_0), len(tags_0)))                     # Create an empty co-occurrence matrix

for post in posts_0:
    post_indices = sorted({tag_to_index[tag] for tag in post})                  # set(): a tag repeated inside one post counts once
    for index1, index2 in combinations(post_indices, 2):                        # each unordered pair exactly once
        co_occurrence_matrix[index1, index2] += 1
        co_occurrence_matrix[index2, index1] += 1

In [None]:
# Show the top-left 10x10 corner and the full shape as a quick sanity check.
display(co_occurrence_matrix[0:10, 0:10])
print(co_occurrence_matrix.shape)

# Persist the matrix on Drive as a pickled DataFrame so it can be reloaded without recomputing.
co_occurrence_df = pd.DataFrame(co_occurrence_matrix)
co_occurrence_matrix_file = Drive_path + "co_occurrence_matrix.pkl"
co_occurrence_df.to_pickle(co_occurrence_matrix_file)

print(f"{co_occurrence_matrix_file} saved successfully.")

**you can load a saved co-occurrence matrix:**

In [None]:
# Reload the pickled co-occurrence matrix and convert it back to a NumPy array.
# NOTE(review): rows/columns carry no tag labels, so this is only valid together with the
# same `tags_0` ordering that was used when the matrix was built — confirm before mixing runs.
co_occurrence_matrix_file = Drive_path + "co_occurrence_matrix.pkl"
co_occurrence_df = pd.read_pickle(co_occurrence_matrix_file)
co_occurrence_matrix = co_occurrence_df.to_numpy()

# 🌞 **4. graph construction**
(NetworkX)

In [None]:
# Build an undirected, weighted tag graph: one edge per pair of tags that co-occur at least
# once, weighted by the co-occurrence count.
#
# Node names are read from `tags_0`, the list the matrix is indexed by. The previous version
# read them from `tags`, which later cells rebind to other lists, so re-running this cell
# after those cells produced a wrong graph.
assert co_occurrence_matrix.shape == (len(tags_0), len(tags_0)), (
    "co_occurrence_matrix does not match tags_0; was it loaded from a different run?"
)

G = nx.Graph()

# Isolated tags (no co-occurrence with any other tag) are intentionally left out of the graph.
# To include them, add every tag as a node first:  G.add_nodes_from(tags_0)

for i in range(len(tags_0)):
    # Only the upper triangle (j > i) is scanned; NumPy finds the non-zero columns of the row
    # so the Python loop touches actual edges only. Edge insertion order is unchanged.
    for offset in np.flatnonzero(co_occurrence_matrix[i, i + 1:] > 0):
        j = i + 1 + int(offset)
        G.add_edge(tags_0[i], tags_0[j], weight=co_occurrence_matrix[i][j])

# 🌞 **5. node embedding**
(Node2vec)

**Hyper-parameters**

1. `Walk Length:` This parameter controls how long the random walk should be for each node. Longer walks can capture more global structure, but might also introduce noise. Start with a moderate value (e.g., 30) and adjust based on results. *(Range: 10 to 80)*

2. `Number of Walks:` The number of random walks to perform from each node. A higher number captures more information but increases computation. Start with a moderate value (e.g., 10) and adjust as needed. *(Range: 5 to 20)*

3. `Embedding Dimensions (dimensions):` The size of the embedding vectors. Typically, values between 64 and 128 work well for many tasks. You can experiment with different dimensions to see what works best. *(Range: 64 to 128)*

4. `Window Size (window):` The window size during the Skip-gram training. This parameter depends on the size of your graph and the level of granularity you want. Start with a reasonable value (e.g., 10) and adjust based on your results. *(Range: 5 to 20)*

5. `min_count:` Ignores all nodes with a total frequency lower than this value. Nodes with a frequency less than min_count are not considered during training. A common starting value is around 5 or 10. You can increase it if you want to filter out rare nodes, or decrease it if you want to include more nodes in the vocabulary.

6. `num_workers:` The number of CPU cores to use for training. Setting this to a higher value can speed up training on multi-core machines.

7. `sg:` This parameter specifies the training algorithm. Use sg=1 for *Skip-gram* and sg=0 for *CBOW (Continuous Bag of Words)*.

8. `p`: The likelihood of backtracking the walk and immediately revisiting a node in the walk is controlled by the return parameter p. Setting a *high value* to parameter p ensures *lower chances of revisiting* a node and *avoids 2-hop redundancy* in sampling. This strategy also encourages moderate graph exploration. On the other hand, if the value of the p parameter is *low*, the chances of backtracking in the walk are *higher*, keeping the random walk *closer to the starting node*.

9. `q`: The inOut parameter q allows the traversal calculation to *differentiate between inward and outward nodes*. Setting a *high value* to parameter q (q > 1) biases the random walk to move *towards nodes close to the node in the previous step*.

In [None]:
# Hyper-parameters consumed by the Node2Vec walk generator and by its Word2Vec `fit` call below.
# NOTE(review): the markdown above suggests walk length in 10-80 and number of walks in 5-20;
# num_walks = 60 is outside that range — confirm the two values are not swapped.
walk_length = 10                                                                # passed to Node2Vec(walk_length=...)
num_walks = 60                                                                  # passed to Node2Vec(num_walks=...)
dimensions = 128                                                                # passed to Node2Vec(dimensions=...)
window = 10                                                                     # passed to fit(window=...)
min_count = 1                                                                   # passed to fit(min_count=...)
num_workers = 4                                                                 # passed to Node2Vec(workers=...)
sg = 1                                                                          # passed to fit(sg=...); 1 = Skip-gram per the notes above
epochs = 20                                                                     # passed to fit(epochs=...)
alpha = 1e-3                                                                    # passed to fit(alpha=...)
p = 1                                                                           # return parameter of the biased walk
q = 0.5                                                                         # in-out parameter of the biased walk
seed = 42                                                                       # passed to Node2Vec(seed=...)

Create a Node2Vec instance, and then, train the Word2Vec model to obtain node embeddings

In [None]:
# Generate biased random walks on the tag graph and train Word2Vec on them.
#
# The class is referenced through its module on purpose: the bare name `Node2Vec` imported
# from `node2vec` is rebound by the later `from stellargraph.layer import Node2Vec`, so on a
# fresh "Run All" `Node2Vec(G, dimensions=...)` would call the stellargraph layer and fail.
import node2vec

node2vec_instance = node2vec.Node2Vec(
    G,
    dimensions=dimensions,
    walk_length=walk_length,
    num_walks=num_walks,
    workers=num_workers,
    p=p,
    q=q,
    seed=seed,
    weight_key='weight',                                                        # edge attribute set in the graph-construction cell
)
# Keyword arguments of fit() are forwarded to gensim's Word2Vec.
model = node2vec_instance.fit(
    window=window,
    min_count=min_count,
    sg=sg,
    compute_loss=True,
    epochs=epochs,
    alpha=alpha,
    batch_words=4,                                                              # NOTE(review): very small batch size — confirm this is intended
)

# One vector per graph node, in G.nodes() order; `tag_names[k]` labels row k of `Node2vec_embeddings`.
tag_embeddings = {node: model.wv[node] for node in G.nodes()}
tag_names = list(tag_embeddings.keys())
Node2vec_embeddings = np.array(list(tag_embeddings.values()))

save node embeddings and tag names

In [None]:
# Persist the trained model, the embedding matrix and the matching tag names on Drive.
print(len(tag_names))

# Create the output folder up front so a missing directory cannot fail the save
# after the (expensive) training above.
os.makedirs(f"{Drive_path}UseRQE/TG/", exist_ok=True)

model.wv.save_word2vec_format(f"{Drive_path}UseRQE/TG/n2v_old_tag_embeddings_(withoutiso)")
model.save(f"{Drive_path}UseRQE/TG/Node2vec_model")


Node2vec_embeddings_pckl = pd.DataFrame(Node2vec_embeddings)                    # row k = embedding of tag_names[k]
Node2vec_file_name = f"{Drive_path}UseRQE/TG/n2v_old_embeddings_(withoutiso).pkl"
Node2vec_embeddings_pckl.to_pickle(Node2vec_file_name)


tag_names_pckl = pd.DataFrame(tag_names)
tag_names_file_name = f"{Drive_path}UseRQE/TG/n2v_old_tag_names_(withoutiso).pkl"
tag_names_pckl.to_pickle(tag_names_file_name)

print(f"{Node2vec_file_name} saved successfully.")
print(f"{tag_names_file_name} saved successfully.")

**check if embeddings are well-generated**

In [None]:
# Sanity check of the embeddings: can a linear classifier tell real edges from non-edges
# using only the concatenated embeddings of the two endpoints?
#
# Negatives are drawn uniformly at random (seeded) from distinct, non-adjacent node pairs and
# balanced against the positives. The previous version materialised every ordered node pair
# (including (i, i) and both orientations), then kept the first 296748 of them, i.e. only
# pairs involving the first few nodes.
rng = np.random.default_rng(seed)
nodes = list(G.nodes())
edges = list(G.edges())                                                         # Step1: Generate positive and negative samples for training

n_node_pairs = len(nodes) * (len(nodes) - 1) // 2
n_negatives = max(0, min(len(edges), n_node_pairs - len(edges)))                # never ask for more non-edges than exist

non_edges, seen_pairs = [], set()
while len(non_edges) < n_negatives:                                             # rejection sampling; cheap while the graph is sparse
    a, b = (int(k) for k in rng.integers(0, len(nodes), size=2))
    pair = (a, b) if a < b else (b, a)
    if a == b or pair in seen_pairs or G.has_edge(nodes[a], nodes[b]):
        continue
    seen_pairs.add(pair)
    non_edges.append((nodes[a], nodes[b]))

positive_samples = [
    (tag_embeddings[i], tag_embeddings[j], 1)
    for i, j in edges]
negative_samples = [
    (tag_embeddings[i], tag_embeddings[j], 0)
    for i, j in non_edges]


all_samples2 = positive_samples + negative_samples                              # Step2: Combine positive and negative samples


X = np.array([(np.concatenate((i, j))) for i, j, _ in all_samples2])            # Step3: Split the data into training and testing sets
y = np.array([label for _, _, label in all_samples2])
X_train, X_test, y_train, y_test = train_test_split(                            # train_test_split shuffles (seeded) before splitting
    X, y, test_size=0.2, random_state=42)


classifier = LogisticRegression(max_iter=1000)                                  # Step4: Train a logistic regression classifier for edge prediction
classifier.fit(X_train, y_train)

predictions = classifier.predict(X_test)                                        # Step5: Make predictions on the test set
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy for edge prediction: {accuracy:.2f}")
print(precision_recall_fscore_support(y_test, predictions))

In [None]:
# Reload the pickled embedding matrix and convert it back to a NumPy array.
# NOTE(review): only the vectors are reloaded here; later cells label the rows with
# list(G.nodes()), so the graph must come from the same run — confirm when loading from disk.
Node2vec_file_name = f"{Drive_path}UseRQE/TG/n2v_old_embeddings_(withoutiso).pkl"
Node2vec_embeddings_pckl = pd.read_pickle(Node2vec_file_name)
Node2vec_embeddings = Node2vec_embeddings_pckl.to_numpy()

In [None]:
# `node_embeddings` is the name the clustering cells below read; display its shape as a check.
node_embeddings = Node2vec_embeddings
node_embeddings.shape

# 🌞 **6. hierarchical clustering**
(sklearn.Agglomerative)

**Hyper-parameters**

1. `n_clusters`: The number of clusters to form. If None, it will be inferred based on other parameters, such as the dist_threshold. Indeed, the algorithm will continue merging clusters until a certain linkage distance threshold is reached.

2. `metric`: The distance metric used to compute the linkage between clusters. This determines how the distance between clusters is measured. Various distance metrics are available, including *euclidean*, *manhattan*, and *cosine*. 'cosine' is a common choice for text and high-dimensional data; note that the 'ward' linkage only supports 'euclidean'.

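3. `dist_threshold`: The linkage distance threshold at or above which clusters will not be merged. Clusters whose linkage distance is greater than or equal to this threshold are kept as separate clusters. This is a non-negative float expressed in the units of the chosen metric and linkage; it is not limited to the range 0–1 (this notebook uses 0 to build the full tree and then cuts the tree at 32, 16 and 8).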

4. `linkage`: The linkage criterion used to compute the distance between newly formed clusters during each merge. It determines how cluster distances are calculated. Various linkage methods are available, including *ward*, *complete*, *average*, *single*, etc. 'average' is a common choice and works well in many cases. 'ward' is another popular option for Euclidean distance-based clustering.

5. `full_tree`: If *True*, the algorithm computes the full hierarchy of clusters, allowing you to extract clusters at various distances. If *False*, it stops when n_clusters clusters are reached. Use True when you want to analyze the complete hierarchy of clusters. Use False when you have a specific number of clusters in mind and want the algorithm to stop once that number is reached.

**Functions**

In [None]:
def hierarchical_clustering(embeddings, n_clusters, metric, dist_threshold, linkage, full_tree):
    """Fit an AgglomerativeClustering model on `embeddings`.

    Parameters
    ----------
    embeddings : array-like
        Samples to cluster, passed unchanged to ``AgglomerativeClustering.fit``.
    n_clusters : int or None
        Number of clusters. Used only when ``dist_threshold`` is None; when a threshold
        is given the threshold wins (as before, when this argument was silently ignored).
    metric, linkage, full_tree
        Forwarded as ``metric``, ``linkage`` and ``compute_full_tree``.
    dist_threshold : float or None
        Forwarded as ``distance_threshold``.

    Returns
    -------
    (labels, model) : the fitted model's ``labels_`` and the fitted estimator itself.

    Raises
    ------
    Whatever scikit-learn raises for invalid settings, e.g. when both ``n_clusters`` and
    ``dist_threshold`` are None.
    """
    agglomerative_cluster = AgglomerativeClustering(
        n_clusters=None if dist_threshold is not None else n_clusters,
        metric=metric,
        distance_threshold=dist_threshold,
        linkage=linkage,
        compute_full_tree=full_tree,
        compute_distances=True,                                                 # keep `distances_` available for dendrograms in the n_clusters case too
    )
    # Fit once: fit_predict() followed by fit() ran the whole clustering twice.
    model = agglomerative_cluster.fit(embeddings)
    return model.labels_, model

In [None]:
def perform_tsne(embeddings, perplexity):
    """Reduce `embeddings` to 2 dimensions with the notebook's TSNE (imported from cuml.manifold).

    Only `perplexity` is configurable; every other setting is fixed below, including
    random_state=42. Returns the result of ``fit_transform(embeddings)``.
    """
    tsne_settings = dict(
        n_components=2,
        random_state=42,
        perplexity=perplexity,
        n_iter=10000,
        learning_rate=300.0,
        n_iter_without_progress=3000,
        early_exaggeration=50,
        method='barnes_hut',
    )
    reducer = TSNE(**tsne_settings)
    return reducer.fit_transform(embeddings)

In [None]:
def extract_cluster_statistics(embeddings_2d, cluster_labels, tags, tag_frequency_dict, verbose=True):
    """Summarise a flat clustering of 2-D points and name each cluster.

    Parameters
    ----------
    embeddings_2d : array indexable with a boolean mask; row k is the 2-D position of tags[k].
    cluster_labels : sequence of cluster ids, one per row of `embeddings_2d`.
    tags : sequence of tag names aligned with the rows of `embeddings_2d`.
    tag_frequency_dict : {tag: frequency}; tags missing from it count as frequency 0.
    verbose : when True (default, the historical behaviour) print the tags of every cluster.

    Returns
    -------
    (cluster_stats, silhouette, calinski_harabasz, davies_bouldin, data, n_clusters)
        cluster_stats : DataFrame indexed by cluster id holding the tag nearest to the 2-D
            centroid, the 1st/2nd/3rd most frequent tags, mean/variance of the distances to
            the centroid-nearest tag, and the cluster size.
        data : one row per tag with columns X, Y, Cluster (most frequent tag of its cluster),
            Tag and Indexes (row position in the inputs).
        The three metrics are NaN when the number of clusters is outside [2, n_samples - 1],
        the range scikit-learn accepts.

    Clusters with fewer than three tags no longer crash: the 2nd/3rd most frequent tag falls
    back to the least frequent tag that is available.
    """
    cluster_labels = np.asarray(cluster_labels)
    unique_labels = np.unique(cluster_labels)

    def frequency_of(tag):
        return tag_frequency_dict.get(tag, 0)

    frames = []                                                                 # per-cluster frames, concatenated once at the end
    cluster_means, cluster_variances, cluster_sizes = [], [], []
    representative_samples = {}                                                 # tag nearest to the 2-D centroid
    representative_samples_freq = {}                                            # most frequent tag
    representative_samples_freq_2nd = {}
    representative_samples_freq_3nd = {}

    for cluster_label in unique_labels:
        indexes = [i for i, lbl in enumerate(cluster_labels) if lbl == cluster_label]
        samples_tags = [tags[i] for i in indexes]
        samples_2d = embeddings_2d[cluster_labels == cluster_label]

        cluster_center = np.mean(samples_2d, axis=0)
        nearest_sample_idx = np.argmin(cdist(samples_2d, [cluster_center]))
        representative_samples[cluster_label] = samples_tags[nearest_sample_idx]
        if verbose:
            print(samples_tags)

        # Stable descending sort == repeatedly taking max() and removing it:
        # ties keep their original order, exactly as before.
        ranked = sorted(samples_tags, key=frequency_of, reverse=True)
        most_frequent = ranked[0]
        second_frequent = ranked[1] if len(ranked) > 1 else most_frequent
        third_frequent = ranked[2] if len(ranked) > 2 else second_frequent
        representative_samples_freq[cluster_label] = most_frequent
        representative_samples_freq_2nd[cluster_label] = second_frequent
        representative_samples_freq_3nd[cluster_label] = third_frequent

        distances_to_representative = cdist(samples_2d, [samples_2d[nearest_sample_idx]])
        cluster_means.append(np.mean(distances_to_representative))
        cluster_variances.append(np.var(distances_to_representative))
        cluster_sizes.append(len(samples_tags))

        frames.append(pd.DataFrame(
            {
                'X': samples_2d[:, 0],
                'Y': samples_2d[:, 1],
                'Cluster': most_frequent,
                'Tag': samples_tags,
                'Indexes': indexes
            }
        ))

    if frames:
        data = pd.concat(frames)                                                # per-cluster row indices are kept, as before
    else:
        data = pd.DataFrame(columns=['X', 'Y', 'Cluster', 'Tag', 'Indexes'])

    cluster_stats = pd.DataFrame({
        'Cluster': unique_labels,
        'Representative sample': [representative_samples[c] for c in unique_labels],
        'representative_samples_freq': [representative_samples_freq[c] for c in unique_labels],
        'representative_samples_freq_2nd': [representative_samples_freq_2nd[c] for c in unique_labels],
        'representative_samples_freq_3nd': [representative_samples_freq_3nd[c] for c in unique_labels],
        'Mean_Distance': cluster_means,
        'Variance_Distance': cluster_variances,
        'Cluster_Size': cluster_sizes,
    }).set_index('Cluster', drop=True)

    if 2 <= len(unique_labels) <= len(cluster_labels) - 1:
        silhouette_metric = silhouette_score(embeddings_2d, cluster_labels)
        calinski_harabasz_metric = calinski_harabasz_score(embeddings_2d, cluster_labels)
        davies_bouldin_metric = davies_bouldin_score(embeddings_2d, cluster_labels)
    else:
        # The metrics are undefined for a single cluster or all-singleton clusters.
        silhouette_metric = calinski_harabasz_metric = davies_bouldin_metric = float('nan')

    return cluster_stats, silhouette_metric, calinski_harabasz_metric, davies_bouldin_metric, data, len(unique_labels)

In [None]:
import plotly.express as px
import plotly.graph_objects as go
from matplotlib import cm
from matplotlib.colors import to_hex

def generate_distinct_colors(n_clusters):                                       # Generate a distinct color palette
    """Return `n_clusters` hex colours sampled from Matplotlib's 'tab20' colormap.

    'tab20' has 20 base colours, so colours repeat when n_clusters > 20.
    `cm.get_cmap` is deprecated since Matplotlib 3.7 and removed in 3.9; the colormap
    registry is used when available and the legacy call is kept as a fallback.
    """
    import matplotlib

    registry = getattr(matplotlib, "colormaps", None)
    if registry is not None and hasattr(registry["tab20"], "resampled"):
        cmap = registry["tab20"].resampled(n_clusters)
    else:
        cmap = cm.get_cmap('tab20', n_clusters)                                 # older Matplotlib
    return [to_hex(cmap(i)) for i in range(cmap.N)]

In [None]:
def visualize_clusters(data, representative_samples, custom_color_scale):       # Create an interactive scatter plot
    """Scatter-plot every tag coloured by cluster and highlight one representative per cluster.

    Parameters
    ----------
    data : DataFrame with columns 'X', 'Y', 'Cluster' and 'Tag'.
    representative_samples : {cluster id: tag}; iterated in dict order, and the k-th entry is
        outlined with the k-th colour of `custom_color_scale` (wrapping around).
    custom_color_scale : sequence of colours, also used as the discrete colour sequence.

    Returns the plotly figure. Representatives whose tag is not present in `data` are skipped.
    """
    fig = px.scatter(
        data,
        x='X',
        y='Y',
        color='Cluster',
        hover_data=['Tag'],
        labels={'X': 'Dimension 1', 'Y': 'Dimension 2'},
        color_discrete_sequence=custom_color_scale,
    )
    # Shrink the tag markers. No selector: only the px traces exist at this point, and the
    # old selector (mode='markers+text') matched none of them, so the size was never applied.
    fig.update_traces(marker=dict(size=2))
    fig.update_layout(showlegend=False)

    # One white, colour-outlined marker per cluster representative.
    for counter, (cluster_label, sample_tag) in enumerate(representative_samples.items()):
        representative_sample = data[data['Tag'] == sample_tag]
        if representative_sample.empty:
            continue                                                            # tag not plotted; nothing to highlight
        cluster_color = custom_color_scale[counter % len(custom_color_scale)]

        fig.add_trace(go.Scatter(
            x=[representative_sample['X'].values[0]],
            y=[representative_sample['Y'].values[0]],
            mode="markers",
            marker=dict(
                size=5,
                color='white',
                line=dict(width=2, color=cluster_color)
            ),
            showlegend=False,
            hovertext=[str(sample_tag)],                                        # hoverinfo="text" shows nothing without a text
            hoverinfo="text"
        ))

    fig.update_layout(
        width=1000,
        height=800,
        plot_bgcolor='rgba(255,255,255,255)'
    )
    fig.update_xaxes({'gridcolor': 'lightgray', 'zerolinecolor': 'lightgray'})
    fig.update_yaxes({'gridcolor': 'lightgray', 'zerolinecolor': 'lightgray'})

    return fig

**Dendrogram**

In [None]:
# Settings passed to hierarchical_clustering() in the dendrogram cell below.
n_clusters=None
metric= 'euclidean'
linkage_= 'ward'                                                                # trailing underscore: `linkage` is already the name imported from scipy
full_tree = True

In [None]:
def plot_dendrogram(model, **kwargs):
    """Build a SciPy-style linkage matrix from a fitted model and draw its dendrogram.

    `model` must expose `children_`, `distances_` and `labels_`. Each row of the returned
    float matrix is [child_a, child_b, merge_distance, number_of_samples_under_the_merge].
    Extra keyword arguments are forwarded to `dendrogram`.
    """
    merges = model.children_
    n_leaves = len(model.labels_)
    subtree_sizes = np.zeros(merges.shape[0])

    for merge_idx, children in enumerate(merges):
        # Ids below n_leaves are original samples (size 1); larger ids refer to the
        # cluster created by merge number (id - n_leaves).
        subtree_sizes[merge_idx] = sum(
            1 if child < n_leaves else subtree_sizes[child - n_leaves]
            for child in children
        )

    linkage_matrix = np.column_stack(
        (merges, model.distances_, subtree_sizes)
    ).astype(float)

    dendrogram(linkage_matrix, **kwargs)                                        # draw (or just compute, with no_plot=True) the dendrogram

    return linkage_matrix

In [None]:
# Fit the full merge tree once; the labels returned for threshold 0 are discarded and only the
# fitted model (children_/distances_) is kept to build the linkage matrix.
_, model_Agg = hierarchical_clustering(                                         # Perform hierarchical clustering
        embeddings = node_embeddings,
        n_clusters = n_clusters,
        metric = metric,
        dist_threshold = 0,                                                     # setting distance_threshold=0 ensures we compute the full tree.
        linkage = linkage_,
        full_tree = full_tree
    )

# Draw the dendrogram (leaf labels hidden), save it to Drive and keep `linkage_matrix`
# for cutting the tree at several thresholds in the cells below.
plt.figure(figsize=(10, 8))
plt.title("Hierarchical Clustering Dendrogram")
linkage_matrix = plot_dendrogram(model_Agg)
plt.xticks([])
plt.xlabel('')
plt.savefig(f"{Drive_path}UseRQE/TG/n2v_old_Dendogram.png", dpi=300)
plt.show()

Now define the thresholds based on the dendrogram above.

In [None]:
# Make sure the embeddings are a NumPy array before t-SNE, then display them.
node_embeddings=np.array(node_embeddings)
node_embeddings

In [None]:
# Cut the dendrogram at three distance thresholds (coarse -> fine), name every cluster by a
# frequent tag, report quality metrics and save three colourings of each level.
pd.options.display.max_rows = 200
thresholds = [32, 16, 8]                                                        # Define a list of threshold values for the three levels

all_cluster_labels, representative_samples, Whole_data = [], {}, []             # cluster labels per level,
                                                                                # {(level, cluster id): name}, per-level plot data

node_tags = list(G.nodes())
assert len(node_tags) == len(node_embeddings), (
    "graph nodes and embedding rows are not aligned; were they loaded from different runs?"
)

embeddings_2d = perform_tsne(                                                   # Perform t-SNE
        embeddings=node_embeddings,
        perplexity=60
    )


for level, threshold in enumerate(thresholds):
    print(f"Threshold: {threshold}")

    cluster_labels = fcluster(                                                  # Cut the dendrogram into clusters at the current threshold
        linkage_matrix,
        t=threshold,
        criterion='distance'
        )
    all_cluster_labels.append(cluster_labels)

    (cluster_stats, silhouette_metric,
     calinski_harabasz_metric, davies_bouldin_metric,
     data, num_classes) = extract_cluster_statistics(                           # Extract cluster statistics
        embeddings_2d=embeddings_2d,
        cluster_labels=cluster_labels,
        tags=node_tags,
        tag_frequency_dict=tag_frequency_dict
    )

    Whole_data.append(data)

    # Names already taken one and two levels above; computed once per level instead of
    # re-scanning `representative_samples` for every cluster.
    names_one_level_up = {
        name for (key_level, _), name in representative_samples.items()
        if key_level == level - 1
    }
    names_two_levels_up = {
        name for (key_level, _), name in representative_samples.items()
        if key_level == level - 2
    }
    for c_id, rep_name in cluster_stats['representative_samples_freq'].items():       # Store representative sample names with level information
        if level == 1 and rep_name in names_one_level_up:
            # The parent already uses the most frequent tag: fall back to the 2nd.
            chosen_name = cluster_stats['representative_samples_freq_2nd'][c_id]
        elif level == 2 and (rep_name in names_one_level_up or rep_name in names_two_levels_up):
            # An ancestor already uses it: fall back to the 3rd most frequent tag.
            chosen_name = cluster_stats['representative_samples_freq_3nd'][c_id]
        else:
            chosen_name = rep_name
        representative_samples[(level, c_id)] = chosen_name


    print("Cluster Statistics:")
    display(cluster_stats)
    print("Silhouette Metric:", silhouette_metric)
    print("Calinski Harabasz Metric:", calinski_harabasz_metric)
    print("Davies Bouldin Metric:", davies_bouldin_metric)


    # Visualize clusters with three colour palettes; the file suffix identifies the palette.
    most_frequent_names = cluster_stats['representative_samples_freq'].to_dict()
    palettes = {
        'a': px.colors.sample_colorscale("Turbo", [n/num_classes for n in range(num_classes)]),
        'b': px.colors.sequential.Turbo,
        'c': generate_distinct_colors(num_classes),
    }
    for suffix, custom_color_scale in palettes.items():
        fig = visualize_clusters(
            data=data,
            representative_samples=most_frequent_names,
            custom_color_scale=custom_color_scale
        )
        file_path = f"{Drive_path}UseRQE/TG/n2v_old_clustering_level{level}-{suffix}.html"
        pio.write_html(fig, file_path)
        fig.show()

In [None]:
# One row per tag: the names of the level-1/2/3 clusters it belongs to, followed by the tag.
# A lookup that finds no name yields None (dict.get).
hierarchical_data = [
    [
        representative_samples.get((level, all_cluster_labels[level][tag_idx]))
        for level in (0, 1, 2)
    ] + [tag]
    for tag_idx, tag in enumerate(list(G.nodes()))
]

print("Hierarchical DataFrame:")
hierarchical_df = pd.DataFrame(
    hierarchical_data,
    columns=["Level 1", "Level 2", "Level 3", "Tag"]
    )
display(hierarchical_df)

In [None]:
# Persist the tag -> (Level 1, Level 2, Level 3) table on Drive.
hierarchical_df_file_name = f"{Drive_path}UseRQE/TG/n2v_old_hierarchical_df.pkl"
hierarchical_df.to_pickle(hierarchical_df_file_name)

In [None]:
# Reload the saved hierarchy table (lets section 7 run without redoing the clustering) and display it.
hierarchical_df_file_name = f"{Drive_path}UseRQE/TG/n2v_old_hierarchical_df.pkl"
hierarchical_df = pd.read_pickle(hierarchical_df_file_name)
hierarchical_df

# 🌞 **7. refine the TG-Data**
applying hierarchical clustering on the TG data

In [None]:
# Load the question-pair data and fill in bodies that are empty strings for specific users.
data_path_LLama = f"/content/drive/MyDrive/RQE_Data_With_Both_uesrid.pkl"
MyData_LLama = pd.read_pickle(data_path_LLama)

# (column to fill, value of 'userid_Q2' selecting the row, replacement text), applied in order.
# NOTE(review): the 'body_Q1' fixes are also keyed on 'userid_Q2' — confirm this is intended
# and not meant to be a Q1 user-id column.
empty_body_fixes = [
    ('body_Q2', '65001', 'So when I launch Minecraft, before it finishes loading, it crashes. I do not understand what is going on. Could someone help me? Here is my crash report:'),
    ('body_Q2', '36896', 'How do I type the infinity symbol in MacTex'),
    ('body_Q2', '3031', 'Run time error for GP objects'),
    ('body_Q1', '65001', 'Misplaced allignment tab character line 53'),
    ('body_Q1', '16188', 'How to Export this animation as a gif file for powerpoint presentation'),
    ('body_Q1', '24829', 'why does rotation style work on actual coordinates and not variables in tikz 3d plot'),
    ('body_Q2', '50615', 'How set a table in margin'),
    ('body_Q2', '23835', 'Latex equation positioning problem'),
    ('body_Q2', '14524', 'Chapter comment with regulation'),
    ('body_Q2', '50823', 'minipage goes beyond right margin'),
]
for body_column, user_id, replacement_text in empty_body_fixes:
    needs_fix = (MyData_LLama[body_column] == '') & (MyData_LLama['userid_Q2'] == user_id)
    MyData_LLama.loc[needs_fix, body_column] = replacement_text

# Keep the first question's text and tags under generic column names.
TG_Data = MyData_LLama[['body_Q1', 'tags_Q1']].rename(
    columns={'body_Q1': 'text', 'tags_Q1': 'tags'}
)

# Replace the tags with those stored in a second pickle.
# NOTE(review): the assignment aligns on index labels; it presumably relies on MyData_LLama
# having a 0..n-1 index that matches TG_Data2 after reset_index — verify.
TG_Data2 = pd.read_pickle(f"{Drive_path}TG_Data (1).pkl").reset_index(drop=True)
TG_Data['tags'] = TG_Data2['tags']
display(TG_Data)

In [None]:
# Replace every original tag by the names of its three hierarchy levels and rank the
# resulting tags per post by how often they occur.
#
# Loop variables are deliberately NOT called `tags`: that name holds the notebook-level list
# of unique tags, and rebinding it here broke re-runs of earlier cells.

# Step1: map each original tag to "Level 1, Level 2, Level 3"
tag_mapping = {}
for _, row in hierarchical_df.iterrows():
    level_names = [
        name
        for name in (row['Level 1'], row['Level 2'], row['Level 3'])
        if isinstance(name, str)                                                # skip levels without a name (None/NaN) instead of crashing in join
    ]
    tag_mapping[row['Tag']] = ', '.join(level_names)

# Step2: expand the tags of every post
result_tags_list = []
oldtags_list = []
tags_with_duplicates_list = []

for tags_str in TG_Data['tags']:
    post_tags = tags_str.split(', ')
    # Tags without a (non-empty) mapping are kept unchanged.
    processed_tags = [tag_mapping.get(tag) or tag for tag in post_tags]
    expanded_tags = ', '.join(processed_tags)
    result_tags_list.append(expanded_tags)
    oldtags_list.append(', '.join(post_tags))
    tags_with_duplicates_list.append(expanded_tags)

# Step3: per post, count how often each expanded tag occurs (keys are in first-appearance order)
tag_frequencies_list = [
    dict(Counter(expanded_tags.split(', ')))
    for expanded_tags in result_tags_list
]

# Step4: unique tags per post, most frequent first. The sort is stable, so ties keep their
# first-appearance order — the same ordering the previous (frequency, -index) key produced.
sorted_newtags_list = [
    ', '.join(sorted(frequencies, key=frequencies.get, reverse=True))
    for frequencies in tag_frequencies_list
]

# Step5: assemble the result in the desired column order
result_dataframe = pd.DataFrame({
    'text': TG_Data['text'],
    'oldtags': oldtags_list,
    'newtags': sorted_newtags_list,
    'tags_with_duplicates': tags_with_duplicates_list,
    'tag_frequencies': tag_frequencies_list,
})

TG_Data_After_HieClustering_file_name = f"{Drive_path}UseRQE/TG/n2v_old_TG_Data_After_HieClustering.pkl"
result_dataframe.to_pickle(TG_Data_After_HieClustering_file_name)
display(result_dataframe)


if you want to load from a pre-saved TG-Data after hierarchical clustering

In [None]:
# Reload the refined TG data written by the previous cell and display it.
TG_Data_After_HieClustering = pd.read_pickle(f"{Drive_path}UseRQE/TG/n2v_old_TG_Data_After_HieClustering.pkl")
display(TG_Data_After_HieClustering)