In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px
from collections import Counter
import time

from bertopic import BERTopic
from itertools import chain
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from umap import UMAP
from hdbscan import HDBSCAN

In [2]:
email_df = pd.read_pickle('Email/emails_data_processed.pkl')

In [3]:
flattened_passages = list(chain.from_iterable(email_df['Passages']))

In [4]:
len(flattened_passages)

337215

In [5]:
umap_model = UMAP(n_neighbors=5, min_dist=0.3, random_state=42)

hdbscan_model = HDBSCAN(min_cluster_size=145, gen_min_span_tree=True, prediction_data=True)

In [6]:
topic_model = BERTopic(n_gram_range=(1, 3), top_n_words=10,
                      umap_model=umap_model, hdbscan_model=hdbscan_model, verbose=True)
topics, probs = topic_model.fit_transform(flattened_passages)
topic_model.save("Email/bertopic_model")

Batches:   0%|          | 0/10538 [00:00<?, ?it/s]

2023-11-07 15:20:20,988 - BERTopic - Transformed documents to Embeddings
2023-11-07 15:31:11,859 - BERTopic - Reduced dimensionality
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokeni

In [7]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,199971,-1_for_to_the_and,"[for, to, the, and, you, of, in, by, this, our]",[DONATE: Paid for by LisaBlunt Rochester for C...
1,0,55358,0_we_to_our_the,"[we, to, our, the, and, in, this, you, of, can]","[We've set a goal to raise $200,000 inthe next..."
2,1,6494,1_abortion_roe_wade_roe wade,"[abortion, roe, wade, roe wade, reproductive, ...","[Peter, With the Supreme Court’s devastating d..."
3,2,3239,2_georgia_stacey_abrams_kemp,"[georgia, stacey, abrams, kemp, stacey abrams,...",[Will you rush a donation right now to help el...
4,3,2704,3_pennsylvania_fetterman_john_pa,"[pennsylvania, fetterman, john, pa, to, derek,...",[John John Fetterman Lieutenant Governor of Pe...
...,...,...,...,...,...
162,161,152,161_great maga king_maga king_great maga_status,"[great maga king, maga king, great maga, statu...","[**_ But, I didn't want to send you _JUST_ an..."
163,162,152,162_elissa_slotkin_gosar for congress_gosar for,"[elissa, slotkin, gosar for congress, gosar fo...",[** PAID FOR BY GOSAR FOR CONGRESS------------...
164,163,151,163_king_dr_luther_dr king,"[king, dr, luther, dr king, luther king, marti...",[“Life’s most persistent and urgent question i...
165,164,150,164_can click here_you can click_can click_if ...,"[can click here, you can click, can click, if ...",[We rely on grassrootssupporters like you to f...


In [8]:
topic_model.get_topic_info().to_pickle('Email/email_cluster_bert.pkl')

In [9]:
topic_model.visualize_hierarchy().write_html('Email/email-bert-clust.html')

In [10]:
email_passage_splits = [len(embedding) for embedding in email_df['Passages']]

In [11]:
passage_mapping ={}
j = 0
for i in range(len(email_passage_splits)):
    passages_count = email_passage_splits[i]
    passage_mapping[i] = topics[j:j+passages_count]
    j += passages_count

In [12]:
email_df['cluster_ids'] = passage_mapping.values()

In [13]:
email_df.to_pickle('Email/email_cluster_ids.pkl')

# 