In [None]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os
import joblib
import duckdb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans, HDBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.metrics.pairwise import cosine_similarity
import umap
import nltk
import re
from sentence_transformers import SentenceTransformer

# Import from local modules
import sys
sys.path.append('../src')
import clustering_analysis
import processing_and_visualization
import importlib
importlib.reload(clustering_analysis)
importlib.reload(processing_and_visualization)

from cleaning import minimal_clean
from clustering_analysis import cluster_with_umap_hdbscan, recluster_noise, print_cluster_examples, summarize_clusters, metacluster_preview
from processing_and_visualization import quick_save_file, save_topic_files, save_question_clusters

# Download necessary NLTK resources
import nltk
for nltk_resource in ('wordnet', 'stopwords'):
    nltk.download(nltk_resource)

# Paths for the English sample questions and saved results (read from the .env file)
load_dotenv()
unlabeled_path = os.getenv("DATA_UNLABELED")
unlabeled_embeddings_path = os.getenv("DATA_UNLABELED_EMBEDDINGS")
data_dir = os.getenv("DATA_DIR")

# Fail fast with a readable message: os.getenv returns None for a missing variable,
# which would otherwise only surface later as an opaque error when a path is used.
_missing_env_vars = [
    var_name
    for var_name, var_value in {
        "DATA_UNLABELED": unlabeled_path,
        "DATA_UNLABELED_EMBEDDINGS": unlabeled_embeddings_path,
        "DATA_DIR": data_dir,
    }.items()
    if not var_value
]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env_vars)
        + ". Define them in your .env file."
    )

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ruggb\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ruggb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Unlabeled Questions
Separate notebook dedicated to clustering questions without a pre-existing 'question_topic' label

In [197]:
# Read the unlabeled parquet file and keep only question-level columns
response_columns = [
    'response_user_type', 'response_user_status',
    'response_user_country_code', 'response_user_gender',
    'response_user_dob', 'response_user_created_at', 'response_id',
    'response_user_id', 'response_language', 'response_content',
    'response_topic', 'response_sent',
]

questions = pd.read_parquet(unlabeled_path).drop(columns=response_columns)

questions.shape

(643148, 12)

In [102]:
# Display the (rows, columns) shape of the clustered dataframe.
# NOTE(review): on a fresh kernel `unlabeled_df` is not defined at this point; the cells that
# create or load it further down are commented out. Confirm where it should come from.
unlabeled_df.shape

(643148, 17)

In [198]:
# Light-touch cleaning only: minimal_clean normalizes spacing and strips the very common "Q" boilerplate
raw_question_text = questions['question_content']
questions['Q_basic_clean'] = raw_question_text.apply(minimal_clean)

In [None]:
# Load cached sentence embeddings if the file exists; otherwise compute them and cache to disk.

if os.path.exists(unlabeled_embeddings_path):
    print("Embedding file exists — loading from disk.")
    # NOTE(review): allow_pickle=True is presumably needed because question_ids may be stored
    # as an object array; unpickling can execute code, so only load files you created yourself.
    # The context manager closes the underlying .npz file handle once the arrays are read.
    with np.load(unlabeled_embeddings_path, allow_pickle=True) as data:
        embeddings = data["embeddings"]
        question_ids = data["question_ids"]

else:
    print("Embedding file not found — generating embeddings.")
    model = SentenceTransformer('all-MiniLM-L6-v2')

    texts = questions['Q_basic_clean'].tolist()
    question_ids = questions['question_id'].to_numpy()

    embeddings = model.encode(
        texts,
        batch_size=64,
        show_progress_bar=True
    )
    # Save embeddings to disk along with question IDs in case of mixup
    np.savez_compressed(unlabeled_embeddings_path, embeddings=embeddings, question_ids=question_ids)
    print("Saved embeddings to disk.")

# The embeddings are attached positionally below, so make sure they really line up with
# the rows of `questions` (this is what the saved question IDs are for).
if len(embeddings) != len(questions):
    raise ValueError(
        f"Embedding count ({len(embeddings)}) does not match number of questions ({len(questions)})."
    )
if not np.array_equal(
    np.asarray(question_ids).astype(str),
    questions['question_id'].astype(str).to_numpy(dtype=str),
):
    raise ValueError("Cached question_ids do not match the order of questions['question_id'].")

questions['embedding'] = list(embeddings)

Embedding file exists — loading from disk.


In [None]:
## Perform UMAP + HDBSCAN clustering
# Commented out to avoid re-running unnecessarily or accidentally overwriting

# unlabeled_df, unlabeled_umap, unlabeled_clusterer = cluster_with_umap_hdbscan(
#     df=questions,
#     sample_size=100_000,
#     umap_params={
#         "n_neighbors": 30,
#         "n_components": 5,
#         "metric": "cosine",
#         "random_state": 42
#     },
#     hdbscan_params={
#         "min_cluster_size": 1500,
#         "min_samples": 15,
#         "metric": "euclidean"
#     }
# )

Finished clustering in 4285.2 seconds
Noise ratio: 43.21%
Clusters found: 81
Approx. silhouette score (excluding noise): 0.625


In [None]:
## Saves result of large clustering to disk if not already present
# files_to_save = {
#     "unlabeled_hdbscan_model.pkl": unlabeled_clusterer,
#     "unlabeled_umap_embedding.pkl": unlabeled_umap,
#     "unlabeled_clustered_df.parquet": unlabeled_df
# }

# for filename, obj in files_to_save.items():
#     path = os.path.join(data_dir, filename)
#     if not os.path.exists(path):
#         if filename.endswith(".parquet"):
#             obj.to_parquet(path)
#         else:
#             joblib.dump(obj, path)
#         print(f"Saved {filename}")
#     else:
#         print(f"{filename} already exists. Skipping save.")



unlabeled_hdbscan_model.pkl already exists. Skipping save.
unlabeled_umap_embedding.pkl already exists. Skipping save.
unlabeled_clustered_df.parquet already exists. Skipping save.


In [None]:
## Second HDBSCAN pass over the noise cluster to extract more clusters
# Parameters adjusted for the smaller subset of data
hdbscan_params_noise = dict(
    min_cluster_size=400,
    min_samples=10,
    metric="euclidean",
)

first_pass_labels = unlabeled_df["cluster"].to_numpy()

noise_labels_shifted, noise_clusterer = recluster_noise(
    umap_embeddings=unlabeled_umap,
    labels=first_pass_labels,
    hdbscan_params=hdbscan_params_noise,
)

Reclustered noise points: 277924
New noise ratio: 56.82%
Clusters found in noise: 136


In [None]:
## Quick quality check of the clusters recovered from the noise cluster

# Rows that were labelled as noise (-1) before reclustering
noise_mask = unlabeled_df['cluster'].eq(-1)

# Pair the text of those rows with the new labels (one label per original noise row)
noise_df = (
    unlabeled_df.loc[noise_mask, ['Q_basic_clean']]
    .assign(new_cluster=noise_labels_shifted)
)

print_cluster_examples(
    noise_df,
    text_column='Q_basic_clean',
    cluster_column='new_cluster',
    examples_per_cluster=5,
)



--- Cluster 187 (size=10516) ---
- what are tha advantages of dairy farming and tha considerations set during tha setup of a dairy farm?
- What Leads Farmers Failure To Calculate Profits And Losses At The End Of The Season.
- If Am Poor I Want To Bcome A Farmer What Can I Do
- What are the factors to consider when setting up farm building?
- Many people dislike farming Why is it so?

--- Cluster 110 (size=5219) ---
- WHAT IS SOIL TEXTURE?
- HOW CAN YOU CONTROL SOILEROSION
- WHEN SHEET EROSION OCCUR ON YOUR FARM WHAT WILL YOU DO?
- ,which soil doesnt require too much water
- ,do soil affect water retention

--- Cluster 166 (size=2877) ---
- Acaricides Used In Controlling Oxalis
- where can I buy Victoria sate herbicide in mbarara
- How iz it harmful weed'g us'g chemicals.
- Why are some farmers use weedmaster to kill weeds?
- WHEN I SPRAY WEEDS WITH CHEMICALS DO THEY TAKE A LONG TIME WITH OUT COMING BACK?

--- Cluster 182 (size=1860) ---
- ,whech taip of beenci is the best grow in kita

In [None]:
# Write the reclustered labels back over the rows currently marked as noise (-1) in the main dataframe.
# NOTE(review): not idempotent. After this runs, fewer rows are -1 than there are entries in
# noise_labels_shifted, so a second run will not line up. Re-run from the clustering load first.
rows_marked_noise = unlabeled_df["cluster"].eq(-1)
unlabeled_df.loc[rows_marked_noise, "cluster"] = noise_labels_shifted


In [23]:
# Per-cluster summary (keywords, meta label, random sample questions), printed as a preview
summary_options = dict(
    text_col='Q_basic_clean',
    cluster_col='cluster',
    top_n_words=5,
    meta_col='meta_label',
    sample_questions=5,
    random_samples=True,
    preview=True,
)
unlabeled_summary = summarize_clusters(unlabeled_df, **summary_options)


=== Cluster -1 (size=157906) ===
Keywords: use, wefarm, does, best, wat
Meta: -1.0
Sample questions:
  - can i get an electrician post in your organisation?
  - ,Are they capable to survive or
  - Where can i get mint seeds pliz members help
  - State the function of acotyledon.
  - SOO FOR WHAT

=== Cluster 0 (size=3449) ===
Keywords: mulching, mulch, importance, mulches, garden
Meta: 4.0
Sample questions:
  - Why Do People Mulch Their Garden?
  - Good Simon, Which Materials Do you use for mulching inorder keep the moist in the garden?
  - what is mulching?
  - am ask a qn to d'stance of mulching from bananaplant to other afarmers for more lnf?
  - What Is Mulching ?

=== Cluster 1 (size=4401) ===
Keywords: ticks, tick, control, best, farm
Meta: 2.0
Sample questions:
  - Please Told Me Medicine Of Poison Of Tick
  - what are the harmful effects of the tick
  - Which is the best caracide for killing ticks?
  - Which are the 2 example of two host tick
  - mention three types of tick fo

In [57]:
# Map clusters to metaclusters based on ChatGPT feedback.
# Input the prompt found in /notes/LLM_prompts.txt followed by the output of summarize_clusters(), keeping within character limit (I submitted chunks of 40-50 clusters at a time)
# Check for NaNs -- there were about 3 clusters missing that I had to manually add back in from the initial assignments saved in notes/LLM_clusterassignments_unlabeled.txt

# Meta-cluster id -> member cluster ids
meta_to_clusters = {
    4: [0, 12, 15, 16, 52, 53, 54, 57, 58, 59, 60, 83, 84, 87, 88, 91, 92, 98, 101, 108, 113, 118, 124, 126, 130, 137, 147, 151, 173, 175, 177, 182, 183, 188],  # Planting & Growth
    2: [1, 7, 9, 13, 34, 35, 45, 46, 47, 48, 85, 99, 107, 131, 133, 140, 144, 150, 165, 166, 168, 169],  # Pests & Disease
    3: [2, 3, 4, 5, 6, 10, 11, 14, 24, 26, 27, 29, 32, 55, 56, 81, 90, 95, 96, 102, 103, 117, 134, 143, 146, 156, 162, 171, 172, 178, 179, 191, 203],  # Animal Husbandry
    1: [8, 19, 20, 21, 22, 23, 33, 39, 40, 41, 43, 44, 49, 50, 65, 69, 75, 82, 94, 106, 110, 111, 114, 115, 116, 119, 120, 121, 123, 141, 142, 149, 153, 154, 155, 159, 164, 181, 210],  # Soil, Fertilizer, & Animal Feed
    5: [74, 122, 132, 148, 184, 185, 204, 205, 206, 207, 208, 209, 176, 174],  # Markets
    6: [17, 18, 71, 72, 186, 167, 163],  # Finance & Loans
    7: [25, 28, 31, 42, 76, 86, 93, 100, 112, 129],  # Farming Equipment & Materials
    8: [30, 51, 97, 109, 128, 138, 180, 192],  # Weather & Environment
    9: [37, 61, 62, 67, 77, 80, 89, 139, 145, 187, 196, 198, 199],  # Wefarm Platform
    10: [36, 38, 63, 64, 68, 70, 73, 78, 79, 125, 135, 136, 152, 157, 158, 160, 161, 189, 190, 193, 194, 195, 197, 200, 201, 202, 211, 212, 213, 214, 215, 216],  # Personal Communication
    -1: [66, 104, 105, 127, 170],  # Uncategorized
}

# Invert to cluster id -> meta-cluster id. A cluster listed under two meta-clusters is an
# editing mistake that a plain dict merge would hide (the last entry silently wins), so raise.
cluster_to_meta = {}
for meta_id, member_clusters in meta_to_clusters.items():
    for cluster_id in member_clusters:
        if cluster_id in cluster_to_meta:
            raise ValueError(
                f"Cluster {cluster_id} is assigned to both meta-cluster "
                f"{cluster_to_meta[cluster_id]} and {meta_id}."
            )
        cluster_to_meta[cluster_id] = meta_id

unlabeled_df['meta_label'] = unlabeled_df['cluster'].map(cluster_to_meta)

# Surface clusters that received no meta label (they become NaN). The noise label -1 is
# deliberately absent from the mapping and is excluded from this warning.
unmapped_clusters = sorted(set(unlabeled_df['cluster'].unique()) - set(cluster_to_meta) - {-1})
if unmapped_clusters:
    print(f"WARNING: {len(unmapped_clusters)} cluster(s) have no meta label: {unmapped_clusters}")


In [58]:
# Human-readable title for each meta-cluster id
meta_titles = {
    -1: "Uncategorized",
    1: "Soil & Fertilizer",
    2: "Pests & Disease",
    3: "Animal Husbandry",
    4: "Planting & Growth",
    5: "Markets",
    6: "Finance & Loans",
    7: "Farming Equipment & Materials",
    8: "Weather & Environment",
    9: "Wefarm Platform",
    10: "Personal Communication",
}

# Rows in the noise cluster (-1) always get meta_label -1
in_noise_cluster = unlabeled_df['cluster'] == -1
unlabeled_df.loc[in_noise_cluster, 'meta_label'] = -1

# Translate meta_label ids to titles; anything without a title falls back to "Uncategorized"
unlabeled_df['meta_label_title'] = (
    unlabeled_df['meta_label']
    .map(meta_titles)
    .fillna("Uncategorized")
)



In [75]:
# Number of questions per meta-cluster title, largest first
unlabeled_df['meta_label_title'].value_counts()

meta_label_title
Uncategorized                    162234
Animal Husbandry                  92409
Soil & Fertilizer                 89456
Personal Communication            72921
Planting & Growth                 61526
Pests & Disease                   50452
Wefarm Platform                   41906
Markets                           24591
Farming Equipment & Materials     21266
Weather & Environment             14615
Finance & Loans                   11772
Name: count, dtype: int64

In [76]:
# Sanity check: count rows per cluster where meta_label is still NaN.
# An empty result means every cluster received a meta label.
unlabeled_df.loc[unlabeled_df['meta_label'].isna(), 'cluster'].value_counts()

Series([], Name: count, dtype: int64)

In [80]:
# Persist the clustered dataframe.
# NOTE(review): the recorded output says the file already exists and the save was skipped, so
# columns added since the first save may not be on disk. Confirm quick_save_file's behaviour.
save_kwargs = dict(
    data_dir=data_dir,
    filename="unlabeled_clustered_df.parquet",
    obj=unlabeled_df,
)
quick_save_file(**save_kwargs)

unlabeled_clustered_df.parquet already exists. Skipping save.


In [1]:
# Re-run summarize_clusters to get updated meta labels
# metacluster_preview(unlabeled_summary, 
#                     metacluster_num=1,
#                     meta_titles=meta_titles)

### Figure Generation

In [None]:
## Loads previously saved model and dataframe ready for visualization
# unlabeled_clusterer = joblib.load(os.path.join(data_dir, "unlabeled_hdbscan_model.pkl"))
# unlabeled_df = pd.read_parquet(os.path.join(data_dir, "unlabeled_clustered_df.parquet"))

In [None]:
# 2D UMAP projection of the question embeddings, used by the figures below
umap_2d_params = dict(
    n_components=2,
    n_neighbors=50,
    min_dist=0.1,
    metric="cosine",
    random_state=42,
)
umap_2d = umap.UMAP(**umap_2d_params)

embedding_rows = list(unlabeled_df['embedding'])
embedding_2d = umap_2d.fit_transform(embedding_rows)

quick_save_file(data_dir, "unlabeled_umap_2d_embedding.npy", embedding_2d)


In [87]:
# Slim plotting frame: ids and labels plus the two UMAP coordinates
minimal_df = unlabeled_df[['question_id', 'meta_label', 'cluster']].assign(
    umap_x=embedding_2d[:, 0],
    umap_y=embedding_2d[:, 1],
)

In [None]:
# Save question clusters with 2D embeddings locally for merging with full dataset later
# save_question_clusters(df = unlabeled_df,
#                        embedding_2d = embedding_2d,
#                        topic = "unlabeled")

Saved ../data/question_clusters_unlabeled.parquet (643148 rows)


In [96]:
# Inspect the question_id column (values, length and dtype)
minimal_df['question_id']

0          4107267
1          4107349
2          4107354
3          4107424
4          4107802
            ...   
643143    59086651
643144    59086652
643145    59086816
643146    59086934
643147    59087077
Name: question_id, Length: 643148, dtype: object

In [2]:
import plotly.express as px

# Integer meta-cluster ids. `meta_label` is a float column (it has held NaN), so cast once here.
# Any remaining NaN is treated as -1 ("Uncategorized") instead of crashing astype(int).
meta_ids = minimal_df['meta_label'].fillna(-1).astype(int)

# --- Build a combined column "<id> — <title>" ---
minimal_df['meta_label_title'] = meta_ids.map(meta_titles)
minimal_df['meta_label_full'] = meta_ids.astype(str) + " — " + minimal_df['meta_label_title']

# Ordered categories based on numeric sort. Use plain ints so the labels read "1 — ..." and
# match the column values; formatting the float ids would give "1.0 — ...", and every row
# would then fall outside the category list and become NaN.
ordered_labels = sorted(int(meta_id) for meta_id in meta_ids.unique())
ordered_full = [f"{i} — {meta_titles[i]}" for i in ordered_labels]
minimal_df['meta_label_full'] = pd.Categorical(
    minimal_df['meta_label_full'], categories=ordered_full, ordered=True
)

# --- Color map (must map to full labels!) ---
meta_colors = {
    -1: "black",
    1: "#1f77b4",
    2: "#ff7f0e",
    3: "#2ca02c",
    4: "#d62728",
    5: "#9467bd",
    6: "#8c564b",
    7: "#e377c2",
    8: "#7f7f7f",
    9: "#bcbd22",
    10: "#17becf",
}
color_map = {f"{i} — {meta_titles[i]}": col for i, col in meta_colors.items()}

# --- Plot ---
fig = px.scatter(
    minimal_df,
    x="umap_x",
    y="umap_y",
    color="meta_label_full",
    color_discrete_map=color_map,
    category_orders={"meta_label_full": ordered_full},
    hover_data=["question_id", "cluster", "meta_label"],
    opacity=0.8
)

fig.update_layout(
    legend_title_text="Meta-Cluster",
    legend=dict(itemclick="toggleothers", itemdoubleclick="toggle"),
    # One scale anchor is enough for a 1:1 aspect ratio. Anchoring x to y and y to x forms a
    # loop, and Plotly ignores the second constraint.
    xaxis=dict(scaleanchor="y", scaleratio=1),
    width=800,
    height=800,
)

fig.update_traces(
    selected=dict(marker=dict(opacity=1, size=8)),
    unselected=dict(marker=dict(opacity=0.12))
)

fig.show()


NameError: name 'minimal_df' is not defined

In [56]:
# Collect the rows that have no meta_label and report which clusters they belong to
missing_meta = minimal_df['meta_label'].isna()
nan_rows = minimal_df.loc[missing_meta]

print(nan_rows['cluster'].value_counts())
print("Rows with NaN meta_label:")
print(nan_rows)
print(f"\nTotal NaNs: {len(nan_rows)}")



cluster
-1      157906
 145       421
Name: count, dtype: int64
Rows with NaN meta_label:
       question_id  meta_label  cluster     umap_x    umap_y
12         4109374         NaN       -1  13.776208  5.050594
16         4110007         NaN       -1  12.304329  7.068155
18         4110239         NaN       -1  11.391359  7.172901
19         4110651         NaN       -1  10.782326  6.167444
21         4110944         NaN       -1   8.109047  9.294688
...            ...         ...      ...        ...       ...
643132    59085548         NaN       -1  12.981956  7.467135
643135    59085656         NaN       -1  12.977041  7.469586
643139    59086264         NaN       -1   5.015298  4.617218
643141    59086616         NaN       -1  14.219928  8.536251
643147    59087077         NaN       -1  16.646841  6.435987

[158327 rows x 5 columns]

Total NaNs: 158327


In [90]:
import pandas as pd
import plotly.express as px

# --- One row per (cluster, meta_label): mean 2D UMAP position and member count ---
cluster_groups = minimal_df.groupby(["cluster", "meta_label"], as_index=False)
centroids = cluster_groups.agg(
    umap_x=("umap_x", "mean"),
    umap_y=("umap_y", "mean"),
    size=("cluster", "count"),
)

# --- Store meta_label as an ordered categorical, sorted numerically ---
ordered_labels = sorted(centroids["meta_label"].unique())
centroids["meta_label"] = pd.Categorical(
    centroids["meta_label"], categories=ordered_labels, ordered=True
)

# --- Color per meta-cluster id ---
color_map = {
    -1: "black",
    1: "#1f77b4",
    2: "#ff7f0e",
    3: "#2ca02c",
    4: "#d62728",
    5: "#9467bd",
    6: "#8c564b",
    7: "#e377c2",
    8: "#7f7f7f",
    9: "#bcbd22",
    10: "#17becf",
}

# --- Multiply every cluster size by the same factor ---
# NOTE(review): px.scatter appears to normalize marker sizes against size_max, so a uniform
# multiplier likely has no visible effect. Adjust size_max instead if bigger markers are wanted.
scale_factor = 2.0
centroids["scaled_size"] = centroids["size"] * scale_factor

# --- Plot ---
hover_columns = {
    "cluster": True,
    "meta_label": True,
    "size": True,
    "umap_x": ':.2f',
    "umap_y": ':.2f',
}
fig = px.scatter(
    centroids,
    x="umap_x",
    y="umap_y",
    color="meta_label",
    color_discrete_map=color_map,
    size="scaled_size",
    size_max=120,
    category_orders={"meta_label": ordered_labels},
    hover_data=hover_columns,
)

equal_axis = dict(scaleratio=1)
fig.update_layout(
    title="Centroid Map of Clusters in 2D UMAP Space",
    width=800,
    height=800,
    legend_title="Meta-Cluster",
    xaxis=dict(scaleanchor="y", **equal_axis),
    yaxis=dict(scaleanchor="x", **equal_axis),
    legend=dict(
        itemclick="toggleothers",
        itemdoubleclick="toggle",
    ),
)

fig.update_traces(
    selected=dict(marker=dict(opacity=1)),
    unselected=dict(marker=dict(opacity=0.15)),
)

fig.show()


In [91]:
# Show the meta-cluster id -> title mapping for reference
print(meta_titles)

{-1: 'Uncategorized', 1: 'Soil & Fertilizer', 2: 'Pests & Disease', 3: 'Animal Husbandry', 4: 'Planting & Growth', 5: 'Markets', 6: 'Finance & Loans', 7: 'Farming Equipment & Materials', 8: 'Weather & Environment', 9: 'Wefarm Platform', 10: 'Personal Communication'}


### Automated meta-clustering attempt
(not successful)

In [112]:
from umap import UMAP
from sklearn.cluster import AgglomerativeClustering
import numpy as np


def _mean_embedding(vectors):
    """Stack a group's embedding vectors and return their element-wise mean."""
    return np.mean(np.vstack(vectors), axis=0)


# One mean embedding per cluster, noise (-1) excluded
non_noise_embeddings = unlabeled_df[unlabeled_df['cluster'] != -1]['embedding']
cluster_centroids = non_noise_embeddings.groupby(unlabeled_df['cluster']).apply(_mean_embedding)
centroids = np.vstack(list(cluster_centroids))

# Project the centroids to 2D before meta-clustering
umap_model = UMAP(n_components=2, metric='cosine', random_state=42)
centroids_2d = umap_model.fit_transform(centroids)

# Agglomerative clustering on the 2D centroids; number of groups is set by the distance threshold
meta_clusterer = AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=0.55,
    linkage='average',
)
meta_labels_no_noise = meta_clusterer.fit_predict(centroids_2d)

# Cluster id -> meta-cluster id, with noise mapped to itself
meta_labels = {
    cluster_id: meta_id
    for cluster_id, meta_id in zip(cluster_centroids.index, meta_labels_no_noise)
}
meta_labels[-1] = -1
unlabeled_df['meta_cluster'] = unlabeled_df['cluster'].map(meta_labels)




In [113]:
# Report the number of distinct meta-clusters once (it does not change per group),
# then list which clusters, and how many rows of each, fall in every meta-cluster.
print("Number of Meta-clusters:", unlabeled_df['meta_cluster'].nunique(dropna=False))
for lbl, group in unlabeled_df.groupby("meta_cluster"):
    print(f"\n--- Meta-cluster {lbl} ---")
    print(group["cluster"].value_counts())

Number of Meta-clusters: 29

--- Meta-cluster -1 ---
cluster
-1    22134
Name: count, dtype: int64
Number of Meta-clusters: 29

--- Meta-cluster 0 ---
cluster
9     2667
13    1530
10     866
8      668
41     449
42     169
Name: count, dtype: int64
Number of Meta-clusters: 29

--- Meta-cluster 1 ---
cluster
3     904
1     668
64    350
65    299
47    254
49    170
48    150
Name: count, dtype: int64
Number of Meta-clusters: 29

--- Meta-cluster 2 ---
cluster
39    243
50    225
77    224
45    207
57    193
Name: count, dtype: int64
Number of Meta-clusters: 29

--- Meta-cluster 3 ---
cluster
30    877
68    356
63    344
62    338
Name: count, dtype: int64
Number of Meta-clusters: 29

--- Meta-cluster 4 ---
cluster
36    1995
34    1339
92     409
94     360
90     218
84     193
Name: count, dtype: int64
Number of Meta-clusters: 29

--- Meta-cluster 5 ---
cluster
6     1687
18    1156
51     218
52     183
Name: count, dtype: int64
Number of Meta-clusters: 29

--- Meta-cluster 6 -

In [114]:
# Print up to 10 example questions per meta-cluster.
# A fixed seed makes the preview reproducible, and the sample size is capped at the group
# size so meta-clusters with fewer than 10 rows do not raise.
for lbl, group in unlabeled_df.groupby("meta_cluster"):
    print(f"\n--- Meta-cluster {lbl} ---")
    n_examples = min(10, len(group))
    print(group["Q_basic_clean"].sample(n_examples, random_state=42).tolist())


--- Meta-cluster -1 ---
['what is tilling of the land', 'Wat are pleateaus ?', 'what is omnivorous', 'Aminah asks: How can i plan matooke? Reply Q25 followed by your response.', 'which Pesticides is the best for controling weevils', 'what is the meaning of the word brids', 'I dot knowe where do we get bt even me i dot where do you get', 'Is God real?', 'What can ido? Am apoutry farmer.', 'What Is The Causal Organism Of Anthrax']

--- Meta-cluster 0 ---
['what is Soil leaching?', 'What is the BEST method of irrigation?', 'How Can I Now The Characteristics Of A Good Soil', 'What Is Soil Leaching?', 'How can soil gain its fertility?', 'discribe for me the proces of cleaning water in the farm', 'How often should I give it water', 'Where will I get a money maker water pumb machine and at how much, in kericho?', 'how the soil loose fertility in the soil', 'give 2 ways how soil is formed']

--- Meta-cluster 1 ---
['which medicine are suitable 2 attack armyworm?', "E if some begins a rat farm