In [1]:
import sqlite3
from contextlib import closing

import matplotlib.pyplot as plt
import pandas as pd
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import (
    ENGLISH_STOP_WORDS,
    CountVectorizer,
    TfidfVectorizer,
)
from umap import UMAP

# --- Load data ---------------------------------------------------------------
# Read the title/year columns from the SQLite database. `closing` releases the
# connection even if the query raises; sqlite3's own `with conn:` form only
# manages the transaction and does not close the connection.
with closing(sqlite3.connect('../EconomicsFinal.db')) as conn:
    df = pd.read_sql_query("SELECT title, year FROM EconomicsFinal", conn)

# --- Model components --------------------------------------------------------
# Fixed seed for the stochastic UMAP step. Without it, topic ids differ
# between runs, so any hard-coded topic id elsewhere in the notebook would
# point at a different topic after every re-fit.
SEED = 42

embedding_model = SentenceTransformer('sentence-transformers/stsb-roberta-base')

umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=SEED,
)

hdbscan_model = HDBSCAN(
    min_cluster_size=15,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Custom stop words, merged with scikit-learn's built-in English list.
# Passing ["english"] inside a list would only filter the literal token
# "english"; the built-in list is selected either by the string 'english' or,
# as here, by using ENGLISH_STOP_WORDS directly.
my_stop_words = ["on", "of", "the", "in", "for"]
stop_words = sorted(ENGLISH_STOP_WORDS.union(my_stop_words))

# NOTE(review): BERTopic documents `vectorizer_model` as a CountVectorizer.
# TfidfVectorizer is accepted, but its output is already TF-IDF weighted
# before c-TF-IDF is applied -- confirm this double weighting is intended.
tfidf_model = TfidfVectorizer(stop_words=stop_words, ngram_range=(1, 3))

ctfidf_model = ClassTfidfTransformer()

representation_model = KeyBERTInspired()

topic_model = BERTopic(
  embedding_model=embedding_model,          # Step 1 - Extract embeddings
  umap_model=umap_model,                    # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,              # Step 3 - Cluster reduced embeddings
  vectorizer_model=tfidf_model,             # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,                # Step 5 - Extract topic words
  representation_model=representation_model # Step 6 - (Optional) Fine-tune topic representations
)

# Fit on the paper titles; the fitted model is the cell's displayed value.
topic_model.fit(df['title'])


<bertopic._bertopic.BERTopic at 0x14cd5ba66a0>

In [3]:
# Bar chart for the first 10 topics, each panel sized 600x350.
barchart_options = {"top_n_topics": 10, "width": 600, "height": 350}
topic_model.visualize_barchart(**barchart_options)

In [4]:
import sqlite3
import pandas as pd
import re
from bertopic import BERTopic

# Topic ids (as numbered by the fitted model) to include in the plot.
topics = [2, 3, 7, 8]

# Track how the topics evolve across the `year` column, grouped into 12 bins.
titles, years = df['title'], df['year']
topics_over_time = topic_model.topics_over_time(titles, years, nr_bins=12)

# Interactive plot of topic frequency over time, limited to the selected ids.
topic_model.visualize_topics_over_time(
    topics_over_time,
    topics=topics,
)

In [7]:
# Persist the fitted model under this name, relative to the working directory.
MODEL_PATH = "TitleRoBERTa"
topic_model.save(MODEL_PATH)