In [1]:
pip install sentence-transformers umap-learn hdbscan matplotlib


Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting umap-learn
  Downloading umap-learn-0.5.5.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.9/90.9 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting hdbscan
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.m

In [2]:
pip install bertopic


Collecting bertopic
  Downloading bertopic-0.16.0-py2.py3-none-any.whl (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━[0m [32m112.6/154.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.1/154.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: bertopic
Successfully installed bertopic-0.16.0


In [None]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import umap
import hdbscan
from bertopic import BERTopic
import matplotlib.pyplot as plt
from tqdm import tqdm

# --- Configuration: everything a reader is likely to tune ------------------
DATA_PATH = '/content/cleaned_utterances.csv'  # Replace with your file path
TEXT_COLUMN = 'Utterances'  # Replace with your column name
MIN_TOPIC_SIZE = 50  # Smallest cluster HDBSCAN is allowed to report as a topic
TOP_N_WORDS = 7  # Number of words shown in each topic's bar chart
SEED = 42  # Fixes UMAP's stochastic layout so topic assignments are repeatable

# Load SBERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Load your dataset
df = pd.read_csv(DATA_PATH)
# Drop missing cells and coerce to str: both the encoder and BERTopic expect a
# list containing only strings, and a NaN read from the CSV would break them.
utterances = df[TEXT_COLUMN].dropna().astype(str).tolist()

# Generate embeddings once; they are handed to BERTopic below so the corpus is
# not encoded a second time inside fit_transform.
embeddings = model.encode(utterances, show_progress_bar=True)

# Dimensionality reduction with UMAP.
# The model is only configured here; BERTopic fits it during fit_transform, so
# fitting it separately beforehand would just repeat the same work.
# NOTE: a fixed random_state makes runs reproducible (UMAP then runs single-threaded).
umap_model = umap.UMAP(n_neighbors=15, n_components=5, metric='cosine',
                       random_state=SEED)

# Clustering with HDBSCAN.
# When a custom hdbscan_model is supplied, BERTopic does not apply its own
# min_topic_size, so the minimum topic size has to be enforced here through
# min_cluster_size.
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=MIN_TOPIC_SIZE,
                                metric='euclidean',
                                cluster_selection_method='eom')

# Create a topic model with a minimum topic size of MIN_TOPIC_SIZE.
# min_topic_size is still passed so the value recorded on the model matches
# what the clusterer actually uses.
topic_model = BERTopic(embedding_model=model,
                       umap_model=umap_model,
                       hdbscan_model=hdbscan_model,
                       min_topic_size=MIN_TOPIC_SIZE)
topics, _ = topic_model.fit_transform(utterances, embeddings)

# Reduced embeddings produced by the UMAP fit that BERTopic just performed
# (same estimator instance), kept under the original variable name.
umap_embeddings = umap_model.embedding_

# Visualize top words in each cluster with a progress bar.
# sorted() gives a stable, ascending plot order instead of arbitrary set order.
for topic in tqdm(sorted(set(topics)), desc="Plotting Topics"):
    if topic == -1:  # Ignore the outlier cluster
        continue
    topic_words = topic_model.get_topic(topic)
    if not topic_words:  # Nothing to plot; skip before creating a figure
        continue
    top_words = topic_words[:TOP_N_WORDS]
    words = [word for word, _score in top_words]
    scores = [score for _word, score in top_words]
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.barh(words, scores)
    ax.invert_yaxis()  # Highest-scoring word at the top
    ax.set_title(f"Cluster {topic}")
    ax.set_xlabel("c-TF-IDF score")
    plt.show()
    plt.close(fig)  # Release the figure so many topics do not accumulate in memory


.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/6382 [00:00<?, ?it/s]