In [1]:
%%capture
!pip install gdown
!pip install bertopic

In [80]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from bertopic import BERTopic
from sklearn.metrics import normalized_mutual_info_score
from sklearn.preprocessing import LabelEncoder

# Step 1: Load dataset

In [71]:
# News Groups dataset
newsgroups_data = pd.read_csv('data/fetch_7newsgroups.csv')
newsgroups_data.dropna(inplace=True,subset='label')
label_encoder_newsgroups = LabelEncoder()
newsgroups_data['topic'] = label_encoder_newsgroups.fit_transform(newsgroups_data['label'])

newsgroup_text = newsgroups_data['text'].tolist()
newsgroup_labels = newsgroups_data['topic'].tolist()


# BBC News dataset
bbc_news = pd.read_csv("data/bbc-news-data.csv",sep="\t")  # Load the BBC dataset

label_encoder_bbc = LabelEncoder()
bbc_news['topic'] = label_encoder_bbc.fit_transform(bbc_news['category'])

bbc_texts = bbc_news.apply(lambda r:r.title+r.content,axis=1).to_list()
bbc_labels = bbc_news['topic'].to_list()

In [72]:
import re
import string


def preprocess_text(text,preprocess=True):
    if not preprocess:
        return text
    # Convert text to lowercase
    text = text.lower()

    # Replace email addresses with 'EMAIL'
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', 'EMAIL', text)

    # Replace URLs with 'HTTP'
    text = re.sub(r'https?://\S+|www\.\S+', 'HTTP', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove punctuation
    text = text.translate(str.maketrans(' ', ' ', string.punctuation))
    text = ' '.join(text.split())# drop white spaces
    return text


In [110]:
dataset_name = 'newsgroup'

if dataset_name == 'bbc':
    documents = bbc_texts
    true_labels = bbc_labels
    test_ratio = 0.2
    do_preprocess = False
elif dataset_name == 'newsgroup':
    documents = newsgroup_text
    true_labels = newsgroup_labels
    test_ratio = 0.05
    do_preprocess = True


In [111]:
documents = [preprocess_text(doc,do_preprocess) for doc in documents]


In [112]:
# Step 2: Split the dataset 80-20 into train and test sets
train_docs, test_docs, train_labels, test_labels = train_test_split(documents, true_labels, test_size=test_ratio, random_state=42)


# **Topic Modeling with BERTopic**
<img src="https://raw.githubusercontent.com/MaartenGr/BERTopic/master/images/logo.png" width="20%">



## BERTopic
BERTopic is a topic modeling technique that leverages 🤗 transformers and a custom class-based TF-IDF to create dense clusters allowing for easily interpretable topics whilst keeping important words in the topic descriptions.

<br>



In [127]:
from sentence_transformers import SentenceTransformer

# List of sentence transformers
sentence_transformers = [
    "all-mpnet-base-v2",  # Good general-purpose model
    "multi-qa-mpnet-base-dot-v1" #Strong performance on various tasks
]
transformer = sentence_transformers[1]

In [128]:
%%time
start_time = time.time()

topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True, embedding_model=transformer)
topics,probs = topic_model.fit_transform(train_docs)

end_time = time.time()
training_time = end_time - start_time



2024-10-29 17:00:58,894 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/530 [00:00<?, ?it/s]

2024-10-29 17:06:25,207 - BERTopic - Embedding - Completed ✓
2024-10-29 17:06:25,208 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-10-29 17:06:41,261 - BERTopic - Dimensionality - Completed ✓
2024-10-29 17:06:41,263 - BERTopic - Cluster - Start clustering the reduced embeddings

os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.

2024-10-29 17:08:25,375 - BERTopic - Cluster - Completed ✓
2024-10-29 17:08:25,387 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-10-29 17:08:29,348 - BERTopic - Representation - Completed ✓


CPU times: user 7min 38s, sys: 2.39 s, total: 7min 40s
Wall time: 7min 36s


In [129]:
freq = topic_model.get_topic_info(); freq.head(7)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,5818,-1_the_to_email_for,"[the, to, email, for, is, from, of, and, in, t...",[from EMAIL michael siemon subject commandment...
1,0,600,0_gordon_banks_doctor_vitamin,"[gordon, banks, doctor, vitamin, candida, pati...",[from EMAIL subject re candidayeast bloom fact...
2,1,312,1_israel_israeli_jews_arab,"[israel, israeli, jews, arab, jewish, arabs, g...",[from EMAIL jonas flygare subject re israelis ...
3,2,189,2_db_mov_bhb_blbh,"[db, mov, bhb, blbh, byte, address, cosypak, m...",[from EMAIL s n rajesh subject looking for a j...
4,3,178,3_clayton_gay_cramer_homosexual,"[clayton, gay, cramer, homosexual, homosexuals...",[from EMAIL awainwright subject re new study o...
5,4,150,4_card_drivers_diamond_video,"[card, drivers, diamond, video, driver, ati, s...",[from EMAIL oliver p weatherbee subject new wi...
6,5,149,5_format_files_bmp_gif,"[format, files, bmp, gif, file, image, bitmap,...",[from EMAIL jihshin ho subject disp organizati...


# **Visualization**

In [130]:
# topic_model.visualize_topics()

In [131]:
# topic_model.visualize_distribution(probs[200], min_probability=0.005)

In [132]:
topic_model.visualize_hierarchy(top_n_topics=50)

In [133]:
# topic_model.visualize_barchart(top_n_topics=8)

In [134]:
# Save model
topic_model.save(f"model/my_model_{dataset_name}_{transformer}")



In [135]:
# Load model
# my_model = BERTopic.load(f"models/my_model_{dataset_name}_{transformer}")

In [136]:
%%time
start_time = time.time()

test_topics, test_probs = topic_model.transform(test_docs)

end_time = time.time()
testing_time = end_time - start_time

Batches:   0%|          | 0/28 [00:00<?, ?it/s]

2024-10-29 17:09:03,450 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-10-29 17:09:04,533 - BERTopic - Dimensionality - Completed ✓
2024-10-29 17:09:04,536 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-10-29 17:09:04,708 - BERTopic - Probabilities - Start calculation of probabilities with HDBSCAN
2024-10-29 17:09:12,333 - BERTopic - Probabilities - Completed ✓
2024-10-29 17:09:12,337 - BERTopic - Cluster - Completed ✓


CPU times: user 23.4 s, sys: 407 ms, total: 23.8 s
Wall time: 24.8 s


In [137]:
# Step 5: Evaluate the model using Purity and NMI on test data

# 5.1 Helper function to compute purity
def calculate_purity(topics, true_labels):
    df = pd.DataFrame({'Topic': topics, 'True Label': true_labels})
    majority_count = 0
    for topic, group in df.groupby('Topic'):
        majority_class = group['True Label'].mode()[0]
        majority_count += (group['True Label'] == majority_class).sum()
    purity = majority_count / len(df)
    return purity

# Step 6: Compute Purity and NMI for test data
test_purity = calculate_purity(test_topics, test_labels)
test_nmi = normalized_mutual_info_score(test_labels, test_topics)

# Step 7: Output results
print(f"Test Purity: {test_purity}")
print(f"Test NMI: {test_nmi}")



Test Purity: 0.6741321388577828
Test NMI: 0.38227316204751105


In [138]:
from scipy import stats


# Step 1: Create a DataFrame for training data
train_df = pd.DataFrame({
    'topic': topics,
    'label': train_labels
})

# Step 2: Group by 'topic' and count how many of each label there are for each topic in the training data
train_topic_label_counts = train_df.groupby(['topic', 'label']).size().unstack(fill_value=0)

# Step 3: Assign the mode (most frequent label) for each topic in the training data
topic_to_mode_label = train_df.groupby('topic')['label'].agg(lambda x: stats.mode(x)[0])

# Step 4: Map the predicted test topics to actual labels using the mapping from training data
mapped_test_labels = [topic_to_mode_label.get(topic, None) for topic in test_topics]

# Create a DataFrame to combine mapped test labels and actual test labels
test_results_df = pd.DataFrame({
    'predicted_label': mapped_test_labels,
    'true_label': test_labels
})



In [139]:
from sklearn.metrics import classification_report
print(classification_report(test_results_df.true_label,test_results_df.predicted_label,digits=3))

              precision    recall  f1-score   support

           0      0.609     0.824     0.700        34
           1      0.575     0.936     0.712        78
           2      0.561     0.949     0.705        39
           3      0.591     0.969     0.734       127
           4      0.550     1.000     0.710        66
           5      0.977     0.431     0.598       497
           6      0.486     1.000     0.654        52

    accuracy                          0.664       893
   macro avg      0.621     0.872     0.688       893
weighted avg      0.795     0.664     0.647       893



In [140]:
# Generate classification report
report = classification_report(test_results_df.predicted_label,test_results_df.true_label,digits=3, output_dict=True)

report['purity'] = test_purity
report['nmi'] = test_nmi
report['training_time'] = training_time
report['testing_time'] = testing_time
# Convert the report to a DataFrame
report_df = pd.DataFrame(report).transpose()

# Specify the file path and sheet name
file_path = 'Topic_modeling_classification_report.xlsx'
sheet_name = f'{dataset_name}_{transformer}'

# Write the DataFrame to an Excel sheet
with pd.ExcelWriter(file_path, engine='openpyxl',  mode='a', if_sheet_exists='new') as writer:
    report_df.to_excel(writer, sheet_name=sheet_name)

print(f"Classification report saved to '{file_path}' in sheet '{sheet_name}'")


Classification report saved to 'Topic_modeling_classification_report.xlsx' in sheet 'newsgroup_multi-qa-mpnet-base-dot-v1'
