In [None]:
!pip install umap hdbscan bertopic

In [None]:
import os
import sys
import pickle

# Choose the project root for this session.
# The session is treated as Google Colab when the `google.colab` package is
# already present in `sys.modules`.
IN_COLAB = 'google.colab' in sys.modules

if not IN_COLAB:
    # Local run: replace this placeholder with your own project folder.
    base_dir = "path/to/your/local/project/folder"
else:
    # Colab run: mount Google Drive and work from the shared project folder.
    from google.colab import drive
    drive.mount('/content/drive')
    base_dir = "/content/drive/MyDrive/Smiles Discourse Analysis"

In [None]:
import pickle
# Folder that holds the pickled sentences and embeddings (created when missing).
pickle_dir = os.path.join(base_dir, "pickles")
os.makedirs(pickle_dir, exist_ok=True)


def _read_pickle(filename):
    """Return the object stored in the pickle file `filename` inside `pickle_dir`.

    Pickle files can execute arbitrary code when loaded, so only open files
    that you created yourself.
    """
    with open(os.path.join(pickle_dir, filename), 'rb') as handle:
        return pickle.load(handle)


# Sentences and their pre-computed embeddings for both corpora.
self_help_sentences = _read_pickle('self_help.pkl')
thrift_sentences = _read_pickle('thrift.pkl')
sh_embeddings = _read_pickle('self_help_embeddings.pkl')
th_embeddings = _read_pickle('thrift_embeddings.pkl')

In [None]:
import umap
from umap.umap_ import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
# Baseline component settings, taken from the BERTopic documentation as a
# starting point for fine-tuning the topic model.

# Bag-of-words settings used to describe each topic: English stop words are
# dropped, terms must occur in at least two documents, and both single words
# and two-word phrases are counted.
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))

# Dimensionality reduction applied to the embeddings before clustering.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

# Clustering of the reduced embeddings.
# Note: `hdbscan_model` is defined again in the fitting cell with a smaller
# `min_cluster_size`, so that later definition is the one the model uses.
hdbscan_model = HDBSCAN(
    min_cluster_size=150,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)


In [None]:
from sklearn.preprocessing import normalize
import numpy as np

# If you are unsatisfied with your results, change min_cluster_size, min_samples,
# n_neighbors or nr_topics.
# In basic terms you are looking for as small a count in topic -1 as possible, with
# coherent topics that are reasonably consistent in size.
# However, your goal will determine how you use BERTopic, and it is worth taking a
# closer look at the docs for more detail.

# `min_samples` belongs to HDBSCAN. BERTopic's constructor has no such argument
# (passing `min_sample` to it raises a TypeError), so it is set here instead.
hdbscan_model = HDBSCAN(
    min_cluster_size=25,
    min_samples=15,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# This begins with self-help: L2-normalise every embedding row.
embeddings = normalize(np.array(sh_embeddings), norm='l2', axis=1)

topic_model = BERTopic(
    embedding_model=None,               # embeddings are pre-computed and passed to fit_transform
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    # NOTE(review): BERTopic appears to use `min_topic_size` only when it builds its
    # own HDBSCAN model; with a custom `hdbscan_model` tune `min_cluster_size` instead.
    min_topic_size=25,
    nr_topics=45,                       # reduce the fitted topics to this number
    top_n_words=10,                     # how many words shown per topic
    verbose=True,
)
topics, probs = topic_model.fit_transform(self_help_sentences, embeddings)

In [None]:
import os
# Full per-topic summary table from the fitted model.
topic_info = topic_model.get_topic_info()

# Drop topic -1, which is usually noise (documents left without a cluster).
is_real_topic = topic_info["Topic"].ne(-1)
topic_info_filtered = topic_info.loc[is_real_topic]

# Keep the 20 most frequent remaining topics as an independent frame.
top_df = topic_info_filtered.nlargest(20, "Count").copy()

print(topic_info)

In [None]:
# Quick horizontal bar chart of how many sentences fall into each top topic.
# It is meant as a diagnostic for spotting problems with the topics.
# To use this figure in your own work, replace the generated names with custom topic names.
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_df["Name"], top_df["Count"], color="darkgreen")
ax.set_xlabel("Sentence Count")
ax.set_title("Top BERTopic Clusters (Cleaned & Interpretable)")
ax.invert_yaxis()  # first row of top_df (the largest topic) at the top
fig.tight_layout()
plt.show()


In [None]:
# Per-topic keyword bar charts for the 20 largest topics.
# Use them to check each topic for internal consistency and to spot terms
# that overlap between topics.
topic_barchart = topic_model.visualize_barchart(top_n_topics=20)
topic_barchart

In [None]:
# Export the top-topic table as a CSV file.
# `save_path` is defined here because its only other definition is in the final
# plotting cell, which has not run yet when the notebook is executed top to bottom.
save_path = r'/content/drive/MyDrive/final exports'  # choose a directory, such as this
os.makedirs(save_path, exist_ok=True)

csv_filename = "top_topics.csv"  # choose a filename
top_df.to_csv(os.path.join(save_path, csv_filename))

In [None]:
import itertools
import pandas as pd
# This cell has no output. It prepares the variables for the next cell, which then
# outputs the visual UMAP representation of the topic model.
# This is taken from the BERTopic documentation and amended only slightly.

topic_info_filtered = topic_info[topic_info["Topic"] != -1]

# 2-D projection used only for plotting. It has its own name so that the 5-D
# `umap_model` handed to BERTopic is not overwritten if the fitting cell is re-run.
umap_2d_model = UMAP(
    n_neighbors=10,
    n_components=2,
    min_dist=0.2,  # Try 0.2–0.3 for clearer spread
    metric='cosine',
    random_state=42
)
reduced_embeddings_2d = umap_2d_model.fit_transform(embeddings)

# Top 20 topics by frequency (outlier topic -1 already excluded).
top_df = topic_info_filtered.nlargest(20, "Count").copy()

# Define colors for the visualization to iterate over
colors = itertools.cycle(['#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080', '#ffffff', '#000000'])

# Assign colours in the row order of `top_df`. Iterating a set of strings would
# give a different topic-to-colour mapping in every Python session.
top_topic_ids = [str(topic) for topic in top_df["Topic"]]
top_topics = set(top_topic_ids)
color_key = {topic: next(colors) for topic in top_topic_ids}

doc_topics = [str(t) for t in topic_model.topics_]

# Full UMAP plot dataframe: one row per document that was passed to fit_transform.
df = pd.DataFrame({
    "x": reduced_embeddings_2d[:, 0],
    "y": reduced_embeddings_2d[:, 1],
    "Topic": doc_topics,
    "Length": [len(doc) for doc in self_help_sentences]
})

# Keep only documents in the top topics that lie inside the (-10, 10) window on
# both axes. `.copy()` makes the result an independent frame, so the column
# assignment below does not trigger SettingWithCopyWarning.
df = df[df["Topic"].isin(top_topics)]
df = df[(df.y > -10) & (df.y < 10) & (df.x > -10) & (df.x < 10)].copy()
df["Topic"] = df["Topic"].astype("category")

# Mean position (and mean document length) per topic, ordered by numeric topic id.
# Every category is present in `df` because the dtype is set after filtering, so
# `observed=True` leaves the result unchanged.
mean_df = df.groupby("Topic", observed=True).mean(numeric_only=True).reset_index()
mean_df["Topic"] = mean_df["Topic"].astype(int)
mean_df = mean_df.sort_values("Topic")

In [None]:
# This gives a rough look at the topic model to see whether there are any immediate errors.
# If everything looks acceptable, it is time to label the topics and then create the legend.

import matplotlib.pyplot as plt
import seaborn as sns

# Extent of the plotted points on each axis.
x_range = df["x"].max() - df["x"].min()
y_range = df["y"].max() - df["y"].min()

# Centre the view on the midpoint of the data extent. Centring on the mean
# would shift the window for a skewed layout and push points outside the limits.
x_center = (df["x"].max() + df["x"].min()) / 2
y_center = (df["y"].max() + df["y"].min()) / 2

padding = 0.1  # 10% padding

x_pad = x_range * padding
y_pad = y_range * padding

#plt.figure(figsize=(10, 8))

sns.scatterplot(
    data=df,
    x="x",
    y="y",
    hue="Topic",
    palette=color_key,
    s=10,
    alpha=0.6,
    legend=False
)

# Recenter around midpoint: the window spans the full data extent plus padding.
plt.xlim(x_center - x_range / 2 - x_pad, x_center + x_range / 2 + x_pad)
plt.ylim(y_center - y_range / 2 - y_pad, y_center + y_range / 2 + y_pad)

plt.title("UMAP Projection of Top 20 Topics", fontsize=16)
plt.axis("off")
plt.tight_layout()

#plt.savefig("umap_centered.png", dpi=300, bbox_inches="tight")
plt.show()


In [None]:
# Example set of custom topic labels; you will need to develop your own by hand.
# One option is to export top_df to an LLM and ask it to draft labels from that data.
# The labels are listed in topic-id order: the first entry labels topic 0, the
# next topic 1, and so on.
custom_topic_labels = dict(enumerate([
    "Unite Community",
    "Climate and fuel issues",
    "NHS, health and housing",
    "Labour Party relations",
    "Food banks",
    "Local branch reports",
    "Better buses campaign",
    "Universal Credit and disability",
    "Palestine and solidarity",
    "Schools and education",
    "Branch roles and meetings",
    "Reports and dates",
    "Branch motions",
    "Stalls and outreach",
    "Local events",
    "Meetings and forthcoming events",
    "Young members and social media",
    "Campaigning",
    "Education industrial issues",
    "Far right resistance",
]))


In [None]:
# This graph should be the final version, with bolder markers and a legend
# built from the custom topic labels.
import matplotlib.patches as mpatches  # provides the colour swatches for the legend

plt.figure(figsize=(18, 12))

# Optional trimming of extreme points. It is off by default, so every row of
# `df` is plotted; set `trim_outliers = True` to keep only points strictly
# inside the 0.5th to 99.5th percentile range on both axes.
trim_outliers = False
if trim_outliers:
    x_low, x_high = np.percentile(df["x"], [0.5, 99.5])
    y_low, y_high = np.percentile(df["y"], [0.5, 99.5])
    df_filtered = df[(df["x"] > x_low) & (df["x"] < x_high) & (df["y"] > y_low) & (df["y"] < y_high)]
else:
    df_filtered = df.copy()

sns.scatterplot(
    data=df_filtered,
    x="x",
    y="y",
    hue="Topic",
    palette=color_key,
    s=60,           # bolder markers than the preview plot
    alpha=0.7,
    legend=False
)

# Make sure to change the title when changing dataset
plt.title("Top 20 Topics Self-Help", fontsize=27)
plt.axis("off")
plt.tight_layout()

# Custom legend: one swatch per labelled topic, in topic-id order. Labels whose
# topic has no colour assigned (it is not among the plotted top topics) are
# skipped instead of raising a KeyError.
legend_handles = [
    mpatches.Patch(color=color_key[str(topic_id)], label=custom_topic_labels[topic_id])
    for topic_id in sorted(custom_topic_labels)
    if str(topic_id) in color_key
]

plt.legend(
    handles=legend_handles,
    loc="upper left",
    bbox_to_anchor=(0.97, 1),
    borderaxespad=0.,
    title="Topics",
    fontsize=25,
    title_fontsize=27,
    ncol=1,
    frameon=True
)

# Export
save_path = r'/content/drive/MyDrive/final exports' # choose a directory, such as this
#plt.savefig(os.path.join(save_path,"umap.png"), dpi=300, bbox_inches="tight")
plt.show()
