# BERTopic model

In [1]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN

import tensorflow as tf
# from cuml.cluster import HDBSCAN
# from cuml.manifold import UMAP
# from cuml.preprocessing import normalize

from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from nltk.corpus import stopwords
from nltk import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer 
import string
import re
import nltk
from bs4 import BeautifulSoup   
import contractions,unicodedata


# Fetch the NLTK resources used by this notebook: stopwords, the punkt
# tokenizer data and WordNet.
for _nltk_pkg in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(_nltk_pkg)
from nltk.stem import PorterStemmer                     # Stemmer
from nltk.corpus import stopwords                       # Import stopwords.
from nltk.tokenize import word_tokenize, sent_tokenize  # Import Tokenizer.
from nltk.stem.wordnet import WordNetLemmatizer         # Import Lemmatizer.



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\p_uli\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\p_uli\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\p_uli\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Location of the cleaned, thread-split CSV; edit this one constant to point
# the notebook at a different copy of the data.
DATA_PATH = r'C:\Users\p_uli\Desktop\Columbia University\Cursos\Fall 22\Capstone\Data\split_threads_cleaned.csv'
data = pd.read_csv(DATA_PATH)

In [3]:
# Work on a random subset of the rows to keep the fit tractable.
# SAMPLE_SIZE can be raised (e.g. to 20000) for a larger run.
SAMPLE_SIZE = 10000
SAMPLE_SEED = 42  # fixed seed so the same rows are drawn on every run

# Cap the request at the number of available rows: DataFrame.sample raises
# when n exceeds the frame length (with the default replace=False).
data2 = data.sample(n=min(SAMPLE_SIZE, len(data)), random_state=SAMPLE_SEED)

# Selecting a column always yields a pandas Series, never a list, so the
# values are converted with .tolist() unconditionally.
# str() coerces any non-string cell to text (a missing value becomes 'nan').
text = [str(x) for x in data2['clean_text'].tolist()]

## Model

In [4]:
# Candidate values for each hyperparameter explored by the random search.
# Every numeric grid is six evenly spaced points, truncated to plain Python ints.

## BERTopic hyperparameters
# Sentence-transformer checkpoints, taken from
# https://www.sbert.net/docs/pretrained_models.html
s_transf = [
    'all-MiniLM-L6-v2',
    'all-MiniLM-L12-v2',
]

# Top n words per topic; it is recommended to keep this number between 10 and 20.
top_n_words = np.linspace(10, 20, 6).astype(int).tolist()

# n-gram range: unigrams only, up to bigrams, up to trigrams.
ngram_r = [(1, upper) for upper in (1, 2, 3)]

# Minimum topic size.
min_topic_s = np.linspace(10, 200, 6).astype(int).tolist()

## UMAP hyperparameters
# Number of neighbors.
n_neigh = np.linspace(10, 50, 6).astype(int).tolist()

## HDBSCAN hyperparameters
# Minimum cluster size.
min_cl_s = np.linspace(10, 50, 6).astype(int).tolist()



In [5]:
# Random Search 

In [6]:
def create_model(transformer, topwords, n_gram, min_top, nn, min_cluster,
                 min_samples=5, random_state=42):
    """Assemble an unfitted BERTopic model from one hyperparameter combination.

    Args:
        transformer: Model name or path handed to ``SentenceTransformer``.
        topwords: Value for BERTopic's ``top_n_words``.
        n_gram: Tuple used as the ``CountVectorizer`` ``ngram_range``.
        min_top: Value for BERTopic's ``min_topic_size``.
            NOTE(review): the BERTopic documentation states that this setting
            is not used when a custom ``hdbscan_model`` is supplied, as is
            done here, so searching over it is probably a no-op -- confirm
            against the installed version.
        nn: UMAP ``n_neighbors``.
        min_cluster: HDBSCAN ``min_cluster_size``.
        min_samples: HDBSCAN ``min_samples``. Defaults to 5, the value that
            used to be hard-coded.
        random_state: Seed passed to UMAP. Defaults to 42, the value that
            used to be hard-coded.

    Returns:
        A configured ``BERTopic`` instance. Nothing is fitted here; call
        ``fit_transform`` on the result.
    """
    # Embedding: sentence-transformer that encodes the documents.
    sentence_model = SentenceTransformer(transformer)

    # Topic representation: n-gram counts with English stop words removed.
    vectorizer_model = CountVectorizer(ngram_range=n_gram, stop_words="english")

    # UMAP: dimensionality reduction. n_components is not set, so UMAP's own
    # default applies.
    umap_model = UMAP(
        n_neighbors=nn,
        min_dist=0.0,
        metric='cosine',
        random_state=random_state,
    )

    # HDBSCAN: clustering of the reduced embeddings.
    hdbscan_model = HDBSCAN(
        min_cluster_size=min_cluster,
        metric='euclidean',
        cluster_selection_method='eom',
        prediction_data=True,
        min_samples=min_samples,
    )

    # BERTopic: wire the custom sub-models together.
    model = BERTopic(
        top_n_words=topwords,
        min_topic_size=min_top,
        vectorizer_model=vectorizer_model,
        language='english', calculate_probabilities=True,
        embedding_model=sentence_model,
        verbose=True,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
    )

    return model

In [8]:
# Baseline model: the first entry of every hyperparameter grid.
model = create_model(
    transformer=s_transf[0],
    topwords=top_n_words[0],
    n_gram=ngram_r[0],
    min_top=min_topic_s[0],
    nn=n_neigh[0],  # taken from the grid like the other arguments (first entry is 10)
    min_cluster=min_cl_s[0],
)
model

<bertopic._bertopic.BERTopic at 0x1eae499a7f0>

In [9]:
%%time
with tf.device('/GPU:0'):
    topics, probs = model.fit_transform(text)


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

2022-10-22 13:43:42,638 - BERTopic - Transformed documents to Embeddings
2022-10-22 13:44:05,362 - BERTopic - Reduced dimensionality
2022-10-22 13:44:37,275 - BERTopic - Clustered reduced embeddings


CPU times: total: 15min 35s
Wall time: 3min 7s


## Results

In [10]:
# Per-topic summary table from the model; display only its first ten rows.
freq = model.get_topic_info()
freq.iloc[:10]

Unnamed: 0,Topic,Count,Name
0,-1,3459,-1_image_thank_know_email
1,0,172,0_weekend_fun_good_mom
2,1,113,1_clr_agreement_counterparty_credit
3,2,102,2_dbcapsdata_database_error_alias
4,3,78,3_game_fantasy_wr_update
5,4,76,4_perlingiere_dperlin_debra_smith
6,5,76,5_vince_shall_shirley_thank
7,6,75,6_esmtp_receive_returnpath_smtp
8,7,75,7_jeff_best_hee_talk
9,8,74,8_enronxgate_acc_aps_industrials


## Visualization

In [13]:
# Display the result of BERTopic's topic visualization for `model` as the cell output.
model.visualize_topics()

In [49]:
#model.visualize_hierarchy()

In [14]:
# Display the result of BERTopic's bar chart visualization for `model` as the cell output.
model.visualize_barchart()

In [12]:
# # Document visualization (left for the best models)

# from umap import UMAP
# # Prepare embeddings

# sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
# embeddings = sentence_model.encode(text, show_progress_bar=False)

# # Train BERTopic
# topic_model = BERTopic().fit(text, embeddings)

# # Run the visualization with the original embeddings
# topic_model.visualize_documents(text, embeddings=embeddings)

# # Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
# reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
# topic_model.visualize_documents(text, reduced_embeddings=reduced_embeddings)
