In [1]:
import pandas as pd
import numpy as np

from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import umap
import hdbscan

import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
from octis.evaluation_metrics.diversity_metrics import TopicDiversity

import warnings
warnings.filterwarnings('ignore')

In [2]:
# FULL DATA
# askmen: cleaned table, its 'title' column as documents, and the saved embedding matrix
# NOTE(review): embeddings are presumably row-aligned with the CSV rows -- confirm
askmen = pd.read_csv('data/cleaned_askmen.csv')
askmen_docs = askmen['title'].tolist()
askmen_embeddings = np.load('data/askmen_embeddings.npy')

# askwomen: same three artefacts
askwomen = pd.read_csv('data/cleaned_askwomen.csv')
askwomen_docs = askwomen['title'].tolist()
askwomen_embeddings = np.load('data/askwomen_embeddings.npy')

In [18]:
# SAMPLE DATA
n = 50000  # number of rows drawn from each dataset

# askmen: seeded sample, its titles, and the matching embedding rows
askmen_sample = askmen.sample(n=n, random_state=0)
askmen_sample_docs = askmen_sample['title'].tolist()
# the sampled index labels are used as row positions in the embedding matrix
# (assumes `askmen` still has its default 0..N-1 index -- TODO confirm)
askmen_sample_embeddings = askmen_embeddings[askmen_sample.index.to_numpy()]

# askwomen: same procedure, same seed
askwomen_sample = askwomen.sample(n=n, random_state=0)
askwomen_sample_docs = askwomen_sample['title'].tolist()
askwomen_sample_embeddings = askwomen_embeddings[askwomen_sample.index.to_numpy()]

In [40]:
#Citation: https://github.com/MaartenGr/BERTopic/issues/90 
def coherence(docs, topic_model, topics):
    """Score a fitted BERTopic model with gensim's ``c_npmi`` topic coherence.

    All documents assigned to the same topic are joined into one long
    document, tokenised with the model's own vectorizer analyzer, and used as
    the reference texts for gensim's ``CoherenceModel``.

    Parameters
    ----------
    docs : sequence of str
        The documents the model was fitted on.
    topic_model : BERTopic
        Fitted model; its ``vectorizer_model``, ``_preprocess_text``,
        ``_outliers`` and ``get_topic`` are used.
    topics : sequence of int
        Topic id assigned to each entry of ``docs``.

    Returns
    -------
    The value returned by ``CoherenceModel.get_coherence()``.
    """
    documents = pd.DataFrame({"Document": docs,
                              "ID": range(len(docs)),
                              "Topic": topics})
    # One concatenated "document" per topic id.
    documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
    cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

    # Tokenise exactly the way BERTopic's vectorizer does. The vectorizer's
    # feature-name list is not needed here, so it is no longer requested
    # (the old ``get_feature_names()`` accessor is gone in recent scikit-learn).
    analyzer = topic_model.vectorizer_model.build_analyzer()

    # Extract features for Topic Coherence evaluation
    tokens = [analyzer(doc) for doc in cleaned_docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]

    # Keep, per topic, only the words that exist in the dictionary; topics
    # left with no words are skipped.
    topic_words = []
    for topic in range(len(set(topics)) - topic_model._outliers):
        words = [entry[0] for entry in topic_model.get_topic(topic)
                 if entry[0] in dictionary.token2id]
        if words:
            topic_words.append(words)

    # Evaluate Coherence
    coherence_model = CoherenceModel(topics=topic_words,
                                     texts=tokens,
                                     corpus=corpus,
                                     dictionary=dictionary,
                                     coherence='c_npmi')
    return coherence_model.get_coherence()

def diversity(topic_model, topics):
    """Score a fitted BERTopic model with OCTIS ``TopicDiversity``.

    Parameters
    ----------
    topic_model : BERTopic
        Fitted model; ``get_topic(i)`` is queried for topic ids
        ``0 .. num_topics - 1``.
    topics : sequence of int
        Topic id assigned to each document; ``-1`` marks outliers.

    Returns
    -------
    The value returned by ``TopicDiversity().score`` for the first 10 words
    of every non-outlier topic.
    """
    unique_topics = set(topics)
    # Only discount the outlier label when it is actually present; blindly
    # subtracting 1 would drop the last real topic when nothing was an outlier.
    num_topics = len(unique_topics) - (1 if -1 in unique_topics else 0)
    bertopic_topics = [[entry[0] for entry in topic_model.get_topic(i)[:10]]
                       for i in range(num_topics)]
    return TopicDiversity().score({'topics': bertopic_topics})

In [29]:
# c-TF-IDF weighting with the frequent-word reduction switched on
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Unigram + bigram counts, English stop words removed; min_df=10 (>1) keeps
# the vocabulary small to reduce memory
vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),
    min_df=10,
    stop_words="english",
)

umap_model = umap.UMAP(
    n_neighbors=15,    # controls how UMAP balances local versus global structure
    n_components=10,   # dimensionality of the reduced space
    min_dist=0.0,
    low_memory=True,
    random_state=0,    # fixed seed for reproducibility
)

# Seed words for guided topic modelling: one inner list of related words per theme.
# This must be a plain list of lists; a trailing comma after the closing
# bracket would silently turn it into a 1-tuple wrapping the list.
seed_topic_list = [['sexist','stereotype','myth'], #gender biases
                   ['insecurity','confidence','esteem'], #self-esteem
                   ['depression','anxiety','mental','loneliness'], #mental health
                   ['consent','rape','assault','harassment'], #consent
                   ['job','career','workplace'], #career
                   ['looks','ugly','attractive'], #appearance
                   ['hygiene','clean','shower'], #clean
                   ['tall','short','height'], #height
                   ['fat','skinny','weight'], #weight
                   ["penis", "balls", "dick"], #male genitals
                   ['vagina','breasts','boobs'], #female genitals
                   ['breakup','ex','heartbreak','dumped'], #breakup
                   ['cheating','betrayal','trust'], #cheating
                   ['virginity','first','lose'], #first times
                   ['porn','media','watching','video'], #porn
                   ['relationship','girlfriend','boyfriend'], #relationship
                   ['drinking','alcohol','drunk','drugs','weed'], #drugs
                   ['friends','friendship'], #friends
                   ['sex','intercourse','penetration','oral','orgasm','masturbation','fetish'], #sex
                   ['flirt','date','single','crush','tinder'], #dating phase
                  ]

In [46]:
def test_hyperparameters(docs, embeddings, cluster_sizes):
    """Fit one BERTopic model per HDBSCAN ``min_cluster_size`` and tabulate the results.

    Each fit reuses the notebook-level ``umap_model``, ``vectorizer_model`` and
    ``ctfidf_model``; no seed topic list is passed to BERTopic.

    Parameters
    ----------
    docs : list of str
        Documents to cluster.
    embeddings : array-like
        Embeddings passed to ``fit_transform`` alongside ``docs``.
    cluster_sizes : sequence of int
        ``min_cluster_size`` values to try, one model each.

    Returns
    -------
    pandas.DataFrame
        One row per cluster size with columns ``min_topic_size``,
        ``num_topics`` (distinct labels, the outlier label included when
        present), ``num_outliers`` (documents labelled -1),
        ``topic_coherence`` and ``topic_diversity``.
    """
    metrics = {'num_topics': [],
               'num_outliers': [],
               'topic_coherence': [],
               'topic_diversity': []}

    for min_cluster_size in cluster_sizes:
        clusterer = hdbscan.HDBSCAN(
            min_cluster_size=min_cluster_size,
            min_samples=1,
            metric='euclidean',  # default
            cluster_selection_method='eom',
            prediction_data=True,
        )
        model = BERTopic(
            hdbscan_model=clusterer,
            umap_model=umap_model,
            vectorizer_model=vectorizer_model,
            ctfidf_model=ctfidf_model,
            calculate_probabilities=False,  # faster runtime
        )
        assigned, _ = model.fit_transform(docs, embeddings)

        metrics['num_topics'].append(len(set(assigned)))
        metrics['num_outliers'].append(assigned.count(-1))
        metrics['topic_coherence'].append(coherence(docs, model, assigned))
        metrics['topic_diversity'].append(diversity(model, assigned))

    return pd.DataFrame({'min_topic_size': cluster_sizes, **metrics})

In [65]:
# SAMPLE DATA
n = 100000  # number of rows drawn from each dataset

# askmen: seeded sample, its titles, and the matching embedding rows
askmen_sample = askmen.sample(n=n, random_state=0)
askmen_sample_docs = askmen_sample['title'].tolist()
# the sampled index labels are used as row positions in the embedding matrix
# (assumes `askmen` still has its default 0..N-1 index -- TODO confirm)
askmen_sample_embeddings = askmen_embeddings[askmen_sample.index.to_numpy()]

# askwomen: same procedure, same seed
askwomen_sample = askwomen.sample(n=n, random_state=0)
askwomen_sample_docs = askwomen_sample['title'].tolist()
askwomen_sample_embeddings = askwomen_embeddings[askwomen_sample.index.to_numpy()]

In [66]:
%%time
# FULL DATA
cluster_sizes = [350, 500, 1000]
askmen_stats = test_hyperparameters(askmen_sample_docs, askmen_sample_embeddings, cluster_sizes)
askmen_stats #no seed topics

Wall time: 8min 45s


Unnamed: 0,min_topic_size,num_topics,num_outliers,topic_coherence,topic_diversity
0,350,65,35465,0.075413,0.896875
1,500,49,36354,0.105936,0.925
2,1000,28,44312,0.147715,0.944444


### DO NOT DELETE

In [64]:
# Stack the two askmen sweeps into one table and tag it with the sample size.
# NOTE(review): askmen_stats2 is not defined in any visible cell, so this cell
# relies on leftover kernel state -- confirm where it comes from.
# ignore_index=True gives the combined table a fresh 0..N-1 index instead of
# repeating each input's own 0..3 labels.
askmen_50000 = (
    pd.concat([askmen_stats, askmen_stats2], ignore_index=True)
      .assign(num_samples=50000)
)
askmen_50000

Unnamed: 0,min_topic_size,num_topics,num_outliers,topic_coherence,topic_diversity,num_samples
0,200,55,17046,0.007973,0.855556,50000
1,250,46,16398,0.064163,0.891111,50000
2,300,43,16424,0.0713,0.907143,50000
3,350,40,17379,0.077352,0.917949,50000
0,400,35,18105,0.084226,0.920588,50000
1,500,30,19497,0.087683,0.927586,50000
2,800,17,19732,0.117812,0.925,50000
3,1000,13,18825,0.12909,0.916667,50000


In [55]:
%%time
askwomen_stats = test_hyperparameters(askwomen_sample_docs, askwomen_sample_embeddings, cluster_sizes)
askwomen_stats #no seed topics

Wall time: 10min 39s


Unnamed: 0,min_topic_size,num_topics,num_outliers,topic_coherence,topic_diversity
0,200,55,16587,0.027757,0.85
1,250,49,17246,0.028512,0.864583
2,300,44,16639,0.050779,0.883721
3,350,39,17639,0.06091,0.892105


In [52]:
%%time
askmen_stats = test_hyperparameters(askmen_sample_docs, askmen_sample_embeddings, cluster_sizes)
askmen_stats #no seed topics

Wall time: 10min 42s


Unnamed: 0,min_topic_size,num_topics,num_outliers,topic_coherence,topic_diversity
0,100,111,15448,-0.074065,0.795455
1,150,77,16557,-0.016622,0.823684
2,200,55,17046,0.007973,0.855556
3,250,46,16398,0.064163,0.891111


In [50]:
%%time
askwomen_stats = test_hyperparameters(askwomen_sample_docs, askwomen_sample_embeddings, cluster_sizes)
askwomen_stats #no seed topics

Wall time: 14min 22s


Unnamed: 0,min_topic_size,num_topics,num_outliers,topic_coherence,topic_diversity
0,10,1137,15621,-0.240899,0.221391
1,25,470,15886,-0.219038,0.436674
2,50,224,14908,-0.144297,0.653812
3,75,151,15654,-0.107822,0.744
4,100,128,15297,-0.081316,0.767717


In [59]:
#visualize with best parameters
# HDBSCAN clusterer with min_cluster_size=350; other settings match the sweep
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=350,
    min_samples=1,
    metric='euclidean',  # default
    cluster_selection_method='eom',
    prediction_data=True,
)

# Same component models as the sweep; no seed topic list is passed
topic_model = BERTopic(
    hdbscan_model=hdbscan_model,
    umap_model=umap_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    calculate_probabilities=False,  # faster runtime
)

# Fit on the askmen sample, then plot the documents (sample=.1 is passed through)
topics, _ = topic_model.fit_transform(askmen_sample_docs, askmen_sample_embeddings)
topic_model.visualize_documents(
    askmen_sample_docs,
    embeddings=askmen_sample_embeddings,
    sample=.1,
)

In [61]:
# HDBSCAN clusterer with min_cluster_size=400; other settings match the sweep
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=400,
    min_samples=1,
    metric='euclidean',  # default
    cluster_selection_method='eom',
    prediction_data=True,
)

# Same component models as the sweep; no seed topic list is passed
topic_model = BERTopic(
    hdbscan_model=hdbscan_model,
    umap_model=umap_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    calculate_probabilities=False,  # faster runtime
)

# Fit on the askmen sample, then plot the documents (sample=.1 is passed through)
topics, _ = topic_model.fit_transform(askmen_sample_docs, askmen_sample_embeddings)
topic_model.visualize_documents(
    askmen_sample_docs,
    embeddings=askmen_sample_embeddings,
    sample=.1,
)