In [None]:
!pip install bertopic
!pip install sentence_transformers
!pip install umap-learn
!pip install hdbscan
!pip install openai

Collecting bertopic
  Downloading bertopic-0.15.0-py2.py3-none-any.whl (143 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/143.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.2/88.2 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━

In [None]:
import getpass
import os

from bertopic import BERTopic   # Note: change the origin code from "from umap import UMAP" to "import umap.umap_ as UMAP"
from sentence_transformers import SentenceTransformer
from umap import UMAP
import pandas as pd
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
#from nltk.corpus import stopwords
#import nltk
import openai
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech

In [None]:
# Read the posts CSV into a DataFrame; later cells use its "body" and "title" columns.
data = pd.read_csv("/content/reddit_posts_processed.csv", encoding="utf-8")

# Row count, shown as the cell output as a quick sanity check on the load.
num_rows = len(data)
num_rows

5051

In [None]:
# Build the additional topic representations that are handed to BERTopic.
# Each entry of `representation_model` becomes its own named aspect of every topic.

# KeyBERT
keybert_model = KeyBERTInspired()

# Part-of-Speech
# Uses the spaCy pipeline "en_core_web_sm"
# (assumes it is already installed in this runtime — TODO confirm).
pos_model = PartOfSpeech("en_core_web_sm")

# MMR
mmr_model = MaximalMarginalRelevance(diversity=0.3)

# GPT-3.5
# Never commit an API key in the notebook source: read it from the environment and,
# if it is not set, ask for it interactively without echoing it into the output.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")

# [DOCUMENTS] and [KEYWORDS] are placeholders that BERTopic's OpenAI representation
# fills in per topic before sending the prompt.
prompt = """
I have a topic that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
topic: <topic label>
"""
openai_model = OpenAI(model="gpt-3.5-turbo",
              exponential_backoff=True,
              chat=True,
              prompt=prompt)

# All representation models
representation_model = {
    "KeyBERT": keybert_model,
    "OpenAI": openai_model,
    "MMR": mmr_model,
    "POS": pos_model
}

In [None]:
# we add this to remove stopwords
vectorizer_model = CountVectorizer(ngram_range=(1, 2),
                    stop_words="english")

model = BERTopic(
    vectorizer_model=vectorizer_model,
    language='english',
    calculate_probabilities=True,
    representation_model=representation_model,
    verbose=True
    )

topics, probs = model.fit_transform(data["body"])

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/158 [00:00<?, ?it/s]

2023-09-09 11:32:13,288 - BERTopic - Transformed documents to Embeddings
2023-09-09 11:32:46,883 - BERTopic - Reduced dimensionality
2023-09-09 11:32:48,163 - BERTopic - Clustered reduced embeddings


In [None]:
# Per-topic summary table from the fitted model; display only its first ten rows.
freq = model.get_topic_info()
freq.iloc[:10]

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,POS,Representative_Docs
0,-1,2476,-1_just_said_like_im,"[just, said, like, im, told, dont, time, didnt...","[relationship, house, friends, sister, leave, ...","[just, said, like, im, told, dont, time, didnt...","[time, friends, friend, family, work, mom, day...",[TL:DR: I own a house that I'm not in often. I...
1,0,166,0_wedding_dress_married_family,"[wedding, dress, married, family, fiance, just...","[wedding party, wedding, bride, weddings, brid...","[wedding, dress, married, family, fiance, just...","[wedding, dress, married, family, fiance, part...",[(New account because I don't want my FMIL to ...
2,1,134,1_dog_dogs_house_puppy,"[dog, dogs, house, puppy, home, im, poop, just...","[dog, dogs, animals, puppy, leash, kennel, nei...","[dog, dogs, house, puppy, home, im, poop, just...","[dog, dogs, house, puppy, home, poop, time, ya...","[In most public dog parks, if a dog is too agg..."
3,2,123,2_rent_room_roommate_apartment,"[rent, room, roommate, apartment, living, hous...","[roommate, apartment, rent, pay rent, landlord...","[rent, room, roommate, apartment, living, hous...","[rent, room, roommate, apartment, living, hous...","[Hey so to fill you in, my little brother got ..."
4,3,121,3_clean_cleaning_apartment_toilet,"[clean, cleaning, apartment, toilet, dishes, b...","[cleaning, laundry, washing, clean, dishes, di...","[clean, cleaning, apartment, toilet, dishes, b...","[clean, cleaning, apartment, toilet, dishes, b...","[Good afternoon everyone! \n\nSo, today, the f..."
5,4,120,4_money_pay_work_job,"[money, pay, work, job, hes, im, time, just, d...","[income, spending, spend, savings, household, ...","[money, pay, work, job, hes, im, time, just, d...","[money, pay, work, job, time, help, husband, h...",[I (F28) is in a relationship for over 7 years...
6,5,111,5_dad_mom_father_years,"[dad, mom, father, years, relationship, told, ...","[dad, father, grandfather, grandmother, aunt, ...","[dad, mom, father, years, relationship, told, ...","[dad, mom, father, years, relationship, brothe...",[Throwaway account to protect privacy. First t...
7,6,99,6_food_eat_like_cook,"[food, eat, like, cook, cooking, just, eating,...","[meal, meals, eating, dishes, dinner, food, ea...","[food, eat, like, cook, cooking, just, eating,...","[food, cooking, dinner, kitchen, meal, work, m...",[My wife and I cannot experience new foods tog...
8,7,88,7_cat_cats_litter_kitten,"[cat, cats, litter, kitten, animals, house, ho...","[care cats, cat, care cat, cats, pets, kitten,...","[cat, cats, litter, kitten, animals, house, ho...","[cat, cats, litter, kitten, animals, house, ho...",[My (37F) best friend (36F) always claimed to ...
9,8,80,8_friends_friend_like_relationship,"[friends, friend, like, relationship, girlfrie...","[bestfriend, girlfriend, friendship, gf, relat...","[friends, friend, like, relationship, girlfrie...","[friends, friend, relationship, girlfriend, ex...",[Hello reddit. Long time lurker and first time...


In [None]:
# Plot the hierarchical clustering of the fitted topics.
# NOTE(review): no `set_topic_labels` call is visible in this notebook — confirm that
# custom labels are actually defined, otherwise `custom_labels=True` may have no effect.
hierarchy_fig = model.visualize_hierarchy(custom_labels=True)
hierarchy_fig

In [None]:
# Plot the fitted topics; the figure is the cell's output.
topics_fig = model.visualize_topics(custom_labels=True)
topics_fig

In [None]:
# embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# pre-calculate embeddings
embeddings = embedding_model.encode(data["body"], show_progress_bar=True)

# reduce dimensionality of embeddings
reduced_embeddings = UMAP(n_neighbors=10,
                n_components=2,
                min_dist=0.0,
                metric='cosine').fit_transform(embeddings)

Batches:   0%|          | 0/158 [00:00<?, ?it/s]

In [None]:
# Plot the documents using the pre-computed 2-D embeddings.
# The post titles (not the bodies) are passed as the document texts.
documents_fig = model.visualize_documents(
    data["title"],
    reduced_embeddings=reduced_embeddings,
    custom_labels=True,
)
documents_fig

In [None]:
# Export the per-topic summary table to an Excel workbook, without the index column.
df = model.get_topic_info()

file_name = "topic_info.xlsx"
df.to_excel(file_name, index=False)