In [8]:
import pandas as pd
import numpy as np
import hdbscan
import umap
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("ticket_kb_articles.csv")
df = df[:25000]


In [15]:
# Count occurrences of each KB Article ID
counts = df['KB Article ID'].value_counts()

# IDs split by count
ids_gt_30 = counts[counts > 30].index
ids_le_30 = counts[counts <= 30].index

# Filter original dataframe
df_gt_30 = df[df['KB Article ID'].isin(ids_gt_30)]
df_le_30 = df[df['KB Article ID'].isin(ids_le_30)]


In [11]:
text_columns= ['Title','Description']

In [17]:
df_gt_30.groupby('Knowledge Base Article').count()

Unnamed: 0_level_0,Ticket ID,Title,Description,KB Article ID,Knowledge Base Article Links
Knowledge Base Article,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
400 Bad Request Error,162,162,162,162,162
AFS Retirement of Alumni/Retiree AFS Directories and Websites,46,46,46,46,46
Access a Course in My LINC,36,36,36,36,36
"Access to My LINC for Sponsored Affiliates, Incoming Students and New Hires",312,312,312,312,312
Accessing U-M Library Resources,45,45,45,45,45
...,...,...,...,...,...
"Zoom: ""Your Zoom Meeting is at Risk"" or ""Zoom Meeting Security Enhancement Required""",79,79,79,79,79
Zoom: Update Your Name,82,82,82,82,82
eRPM: Request to Reset FOA,45,45,45,45,45
eSignature: SignNow New Account Request,61,61,61,61,61


In [34]:
# Load BERT model (Sentence-BERT for semantic similarity)
model = SentenceTransformer("all-MiniLM-L6-v2")

# Generate embeddings for each column separately and store them
column_embeddings = {}
for col in text_columns:
    print(f"Embedding column: {col}")
    column_embeddings[col] = model.encode(df[col].astype(str).tolist(), show_progress_bar=True)
final_embeddings = np.hstack([column_embeddings[col] for col in text_columns])


Embedding column: Title


Batches: 0it [00:00, ?it/s]


Embedding column: Servicename


Batches: 0it [00:00, ?it/s]


Embedding column: Description


Batches: 0it [00:00, ?it/s]


Embedding column: Feed


Batches: 0it [00:00, ?it/s]


In [31]:
umap_model = umap.UMAP(n_neighbors=15, n_components=50, metric="cosine").fit_transform(final_embeddings)
scaler = StandardScaler()
umap_embeddings = scaler.fit_transform(umap_model)




ValueError: Expected 2D array, got 1D array instead:
array=[].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
# HDBSCAN clustering
clusterer = hdbscan.HDBSCAN(min_cluster_size=5, min_samples=3, metric="euclidean", cluster_selection_method="eom")
cluster_labels = clusterer.fit_predict(umap_embeddings)

# Assign cluster labels to DataFrame
df["Cluster"] = cluster_labels




In [32]:
umap_2d = umap.UMAP(n_neighbors=15, n_components=2, metric="cosine").fit_transform(final_embeddings)

plt.figure(figsize=(10, 6))
plt.scatter(umap_2d[:, 0], umap_2d[:, 1], c=df["Cluster"], cmap="Spectral", s=10)
plt.colorbar(label="Cluster")
plt.title("HDBSCAN Clusters Visualization")
plt.xlabel("UMAP Dimension 1")
plt.ylabel("UMAP Dimension 2")
plt.show()




ValueError: Expected 2D array, got 1D array instead:
array=[].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [33]:

num_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
print(f"Number of clusters found: {num_clusters}")

print(df["Cluster"].value_counts())
for cluster in df["Cluster"].unique():
    if cluster != -1:  
        print(f"\nCluster {cluster} Examples:")
        print(df[df["Cluster"] == cluster][text_columns].sample(3))  


Number of clusters found: 212


KeyError: 'Cluster'

In [None]:
df.to_csv("output_AAUM-Login Support_with_clusters.csv", index=False)  # Save DataFrame with cluster labels to CSV file

#Run the script below to do the clusering 


In [None]:
import os
import pandas as pd
import numpy as np
import hdbscan
import umap.umap_ as umap
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler

# ====== CONFIG ======
input_file = "ticket_kb_articles.csv"
output_folder = "clustered_output"  # Make sure this folder exists or use os.makedirs
os.makedirs(output_folder, exist_ok=True)

# Load data
df = pd.read_csv(input_file)
df = df[:25000]  # Optional: limit data for testing

# Count occurrences of each KB Article ID
counts = df['KB Article ID'].value_counts()
ids_gt_30 = counts[counts > 30].index
ids_le_30 = counts[counts <= 30].index

# Split the dataframe
df_gt_30 = df[df['KB Article ID'].isin(ids_gt_30)]
df_le_30 = df[df['KB Article ID'].isin(ids_le_30)]

# Text columns for embedding
text_columns = ["Title", "Description"]

# Load Sentence-BERT model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Store all clustered DataFrames
clustered_dfs = []

# Cluster each KB Article group individually
for article_id in ids_gt_30:
    group_df = df_gt_30[df_gt_30['KB Article ID'] == article_id].copy()

    try:
        # Generate embeddings for each column
        column_embeddings = {
            col: model.encode(group_df[col].astype(str).tolist(), show_progress_bar=False)
            for col in text_columns
        }

        # Concatenate embeddings
        final_embeddings = np.hstack([column_embeddings[col] for col in text_columns])

        # UMAP dimensionality reduction
        umap_model = umap.UMAP(n_neighbors=15, n_components=30, metric="cosine").fit_transform(final_embeddings)
        umap_embeddings = StandardScaler().fit_transform(umap_model)

        # Clustering with HDBSCAN
        clusterer = hdbscan.HDBSCAN(min_cluster_size=5, min_samples=3, metric="euclidean", cluster_selection_method="eom")
        cluster_labels = clusterer.fit_predict(umap_embeddings)

        # Assign cluster labels
        group_df["Cluster"] = cluster_labels

        # Save to CSV (optional)
        output_path = os.path.join(output_folder, f"clustered_kb_{article_id}.csv")
        group_df.to_csv(output_path, index=False)

        # Collect for final merged result
        clustered_dfs.append(group_df)

        print(f"✓ Clustered KB Article ID {article_id} — Clusters: {len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)}")

    except Exception as e:
        print(f"⚠️ Error clustering KB Article ID {article_id}: {e}")
        
        # Assign Cluster = -1 to all rows in the group
        group_df["Cluster"] = -1

        # Still save the "unclustered" group for consistency
        output_path = os.path.join(output_folder, f"clustered_kb_{article_id}_error.csv")
        group_df.to_csv(output_path, index=False)

        clustered_dfs.append(group_df)


# Combine all clustered results
df_clustered_all = pd.concat(clustered_dfs, ignore_index=True)

# For df_le_30, optionally assign Cluster = -1 to mark as "unclustered"
df_le_30["Cluster"] = -1

# Final combined DataFrame
df_final = pd.concat([df_clustered_all, df_le_30], ignore_index=True)

# Save final merged CSV
final_output_path = os.path.join(output_folder, "final_clustered_dataset.csv")
df_final.to_csv(final_output_path, index=False)

print("\n✅ All clustering complete. Final dataset saved to:", final_output_path)



✓ Clustered KB Article ID 3189 — Clusters: 63




✓ Clustered KB Article ID 969 — Clusters: 5




✓ Clustered KB Article ID 10361 — Clusters: 3




✓ Clustered KB Article ID 7244 — Clusters: 4




✓ Clustered KB Article ID 7798 — Clusters: 16




✓ Clustered KB Article ID 10638 — Clusters: 12




✓ Clustered KB Article ID 1259 — Clusters: 14




✓ Clustered KB Article ID 700 — Clusters: 9




✓ Clustered KB Article ID 3131 — Clusters: 4




✓ Clustered KB Article ID 975 — Clusters: 13




✓ Clustered KB Article ID 10867 — Clusters: 9




✓ Clustered KB Article ID 713 — Clusters: 2




✓ Clustered KB Article ID 733 — Clusters: 2




✓ Clustered KB Article ID 4547 — Clusters: 10




✓ Clustered KB Article ID 248 — Clusters: 12




✓ Clustered KB Article ID 695 — Clusters: 2




✓ Clustered KB Article ID 319 — Clusters: 8




✓ Clustered KB Article ID 4599 — Clusters: 4




✓ Clustered KB Article ID 804 — Clusters: 8




✓ Clustered KB Article ID 456 — Clusters: 8




✓ Clustered KB Article ID 533 — Clusters: 7




✓ Clustered KB Article ID 1204 — Clusters: 5




✓ Clustered KB Article ID 701 — Clusters: 7




✓ Clustered KB Article ID 8761 — Clusters: 3




✓ Clustered KB Article ID 9233 — Clusters: 8




✓ Clustered KB Article ID 129 — Clusters: 7




✓ Clustered KB Article ID 6594 — Clusters: 2




✓ Clustered KB Article ID 10502 — Clusters: 7




✓ Clustered KB Article ID 538 — Clusters: 7




✓ Clustered KB Article ID 974 — Clusters: 8




✓ Clustered KB Article ID 382 — Clusters: 2




✓ Clustered KB Article ID 435 — Clusters: 5




✓ Clustered KB Article ID 65 — Clusters: 2




✓ Clustered KB Article ID 493 — Clusters: 2




✓ Clustered KB Article ID 12449 — Clusters: 2




✓ Clustered KB Article ID 82 — Clusters: 7




✓ Clustered KB Article ID 104 — Clusters: 7




✓ Clustered KB Article ID 3523 — Clusters: 2




✓ Clustered KB Article ID 1212 — Clusters: 2




✓ Clustered KB Article ID 836 — Clusters: 5




✓ Clustered KB Article ID 6627 — Clusters: 4




✓ Clustered KB Article ID 8706 — Clusters: 4




✓ Clustered KB Article ID 433 — Clusters: 4




✓ Clustered KB Article ID 118 — Clusters: 2




✓ Clustered KB Article ID 746 — Clusters: 5




✓ Clustered KB Article ID 558 — Clusters: 4




✓ Clustered KB Article ID 254 — Clusters: 6




✓ Clustered KB Article ID 4971 — Clusters: 2




✓ Clustered KB Article ID 5926 — Clusters: 2




✓ Clustered KB Article ID 3751 — Clusters: 7




✓ Clustered KB Article ID 55 — Clusters: 4




✓ Clustered KB Article ID 856 — Clusters: 3




✓ Clustered KB Article ID 6282 — Clusters: 4




✓ Clustered KB Article ID 6232 — Clusters: 2




✓ Clustered KB Article ID 750 — Clusters: 2




✓ Clustered KB Article ID 3720 — Clusters: 3




✓ Clustered KB Article ID 1856 — Clusters: 5




✓ Clustered KB Article ID 1183 — Clusters: 5




✓ Clustered KB Article ID 193 — Clusters: 6




✓ Clustered KB Article ID 822 — Clusters: 3




✓ Clustered KB Article ID 9249 — Clusters: 3




✓ Clustered KB Article ID 747 — Clusters: 2




✓ Clustered KB Article ID 6371 — Clusters: 3




✓ Clustered KB Article ID 152 — Clusters: 2




✓ Clustered KB Article ID 7335 — Clusters: 5




✓ Clustered KB Article ID 7054 — Clusters: 2




✓ Clustered KB Article ID 11073 — Clusters: 5




✓ Clustered KB Article ID 203 — Clusters: 3




✓ Clustered KB Article ID 10841 — Clusters: 2




✓ Clustered KB Article ID 6481 — Clusters: 2




✓ Clustered KB Article ID 247 — Clusters: 3




✓ Clustered KB Article ID 802 — Clusters: 2




✓ Clustered KB Article ID 324 — Clusters: 3




✓ Clustered KB Article ID 10058 — Clusters: 2




✓ Clustered KB Article ID 60 — Clusters: 3




✓ Clustered KB Article ID 6915 — Clusters: 3




✓ Clustered KB Article ID 1250 — Clusters: 2




✓ Clustered KB Article ID 4694 — Clusters: 3




✓ Clustered KB Article ID 299 — Clusters: 3




✓ Clustered KB Article ID 5885 — Clusters: 2




✓ Clustered KB Article ID 7167 — Clusters: 4




✓ Clustered KB Article ID 8820 — Clusters: 2




✓ Clustered KB Article ID 8393 — Clusters: 5




✓ Clustered KB Article ID 528 — Clusters: 3




✓ Clustered KB Article ID 10915 — Clusters: 6




✓ Clustered KB Article ID 857 — Clusters: 2




✓ Clustered KB Article ID 232 — Clusters: 3




✓ Clustered KB Article ID 9648 — Clusters: 4




✓ Clustered KB Article ID 9535 — Clusters: 2




✓ Clustered KB Article ID 10865 — Clusters: 4




✓ Clustered KB Article ID 428 — Clusters: 2




✓ Clustered KB Article ID 1005 — Clusters: 2




✓ Clustered KB Article ID 159 — Clusters: 3




✓ Clustered KB Article ID 471 — Clusters: 3




✓ Clustered KB Article ID 8986 — Clusters: 3




✓ Clustered KB Article ID 4785 — Clusters: 3




✓ Clustered KB Article ID 11065 — Clusters: 2




✓ Clustered KB Article ID 6835 — Clusters: 3




✓ Clustered KB Article ID 430 — Clusters: 2




✓ Clustered KB Article ID 336 — Clusters: 2




✓ Clustered KB Article ID 1229 — Clusters: 3




✓ Clustered KB Article ID 4138 — Clusters: 2




✓ Clustered KB Article ID 1178 — Clusters: 3




✓ Clustered KB Article ID 9279 — Clusters: 3




✓ Clustered KB Article ID 4702 — Clusters: 2




✓ Clustered KB Article ID 1219 — Clusters: 2




✓ Clustered KB Article ID 8670 — Clusters: 2




✓ Clustered KB Article ID 1226 — Clusters: 2




✓ Clustered KB Article ID 12242 — Clusters: 3




✓ Clustered KB Article ID 190 — Clusters: 2




✓ Clustered KB Article ID 103 — Clusters: 2




✓ Clustered KB Article ID 71 — Clusters: 3




✓ Clustered KB Article ID 1209 — Clusters: 2




✓ Clustered KB Article ID 4190 — Clusters: 2




✓ Clustered KB Article ID 10898 — Clusters: 3




✓ Clustered KB Article ID 9191 — Clusters: 3




✓ Clustered KB Article ID 2846 — Clusters: 2




✓ Clustered KB Article ID 5815 — Clusters: 2




✓ Clustered KB Article ID 1157 — Clusters: 2




✓ Clustered KB Article ID 2881 — Clusters: 3




✓ Clustered KB Article ID 176 — Clusters: 2




✓ Clustered KB Article ID 11490 — Clusters: 2




✓ Clustered KB Article ID 311 — Clusters: 3




✓ Clustered KB Article ID 6410 — Clusters: 2




✓ Clustered KB Article ID 3686 — Clusters: 2




✓ Clustered KB Article ID 461 — Clusters: 3




✓ Clustered KB Article ID 4578 — Clusters: 3




✓ Clustered KB Article ID 3957 — Clusters: 3




✓ Clustered KB Article ID 10647 — Clusters: 2




✓ Clustered KB Article ID 4136 — Clusters: 2




✓ Clustered KB Article ID 7041 — Clusters: 2




✓ Clustered KB Article ID 2866 — Clusters: 2




✓ Clustered KB Article ID 9017 — Clusters: 2




✓ Clustered KB Article ID 133 — Clusters: 2




✓ Clustered KB Article ID 58 — Clusters: 3




✓ Clustered KB Article ID 8992 — Clusters: 2




✓ Clustered KB Article ID 8411 — Clusters: 2




✓ Clustered KB Article ID 1416 — Clusters: 2




✓ Clustered KB Article ID 6732 — Clusters: 2




✓ Clustered KB Article ID 1274 — Clusters: 2




✓ Clustered KB Article ID 124 — Clusters: 2




✓ Clustered KB Article ID 5890 — Clusters: 2




✓ Clustered KB Article ID 3405 — Clusters: 2




✓ Clustered KB Article ID 982 — Clusters: 2




✓ Clustered KB Article ID 8100 — Clusters: 2




✓ Clustered KB Article ID 862 — Clusters: 2




✓ Clustered KB Article ID 6485 — Clusters: 2




✓ Clustered KB Article ID 4808 — Clusters: 2




✓ Clustered KB Article ID 1247 — Clusters: 2




✓ Clustered KB Article ID 138 — Clusters: 2




✓ Clustered KB Article ID 1258 — Clusters: 2




✓ Clustered KB Article ID 3566 — Clusters: 2


  eigenvalues, eigenvectors = scipy.sparse.linalg.eigsh(


⚠️ Error clustering KB Article ID 6445: Cannot use scipy.linalg.eigh for sparse A with k >= N. Use scipy.linalg.eigh(A.toarray()) or reduce k.


  eigenvalues, eigenvectors = scipy.sparse.linalg.eigsh(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_le_30["Cluster"] = -1


⚠️ Error clustering KB Article ID 6078: Cannot use scipy.linalg.eigh for sparse A with k >= N. Use scipy.linalg.eigh(A.toarray()) or reduce k.

✅ All clustering complete. Final dataset saved to: clustered_output\final_clustered_dataset.csv


In [10]:
df.shape # Display the first few rows of the final DataFrame

(25000, 6)

In [11]:
# Assuming `df_final` is your final DataFrame with a "Cluster" column
df_final = pd.read_csv('final_clustered_dataset.csv')
# 1. Keep all rows where Cluster == -1
df_unclustered = df_final[df_final["Cluster"] == -1]

# 2. For clustered rows (Cluster != -1), sample 2 per cluster
df_clustered = df_final[df_final["Cluster"] != -1]
df_sampled = df_clustered.groupby("Cluster").sample(n=2, random_state=42, replace=False)

# 3. Combine both
df_subset = pd.concat([df_unclustered, df_sampled], ignore_index=True)

# Optional: shuffle the final DataFrame
df_subset = df_subset.sample(frac=1, random_state=42).reset_index(drop=True)

# Save or use as needed
df_subset.to_csv("subset_sampled_clusters.csv", index=False)


In [12]:
df_subset.shape

(11766, 7)