## Reddit

In [6]:
import os
import numpy as np
import pandas as pd
import optuna
import spacy
from bs4 import BeautifulSoup
from optuna.samplers import TPESampler
from spacy_cleaner import processing, Cleaner
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
from octis.evaluation_metrics.diversity_metrics import InvertedRBO
from umap import UMAP

# Set the TOKENIZERS_PARALLELISM environment variable to avoid warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Function to split documents into chunks with a maximum word count
def split_documents_by_words(documents, max_words=512):
    split_documents = []
    for doc in documents:
        words = doc.split()
        num_words = len(words)
        if num_words <= max_words:
            split_documents.append(doc)
        else:
            num_segments = num_words // max_words
            for i in range(num_segments + 1):
                start_idx = i * max_words
                end_idx = (i + 1) * max_words
                segment = ' '.join(words[start_idx:end_idx])
                if segment.strip():
                    split_documents.append(segment)
    return split_documents 

# Function to calculate topic coherence using Gensim's CoherenceModel
def calculate_coherence(topic_model, topics, documents):
    vectoriser = topic_model.vectorizer_model
    analyser = vectoriser.build_analyzer()
    tokens = [analyser(doc) for doc in documents]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]
    topic_words = [[words for words, _ in topic_model.get_topic(topic)] 
                   for topic in range(len(set(topics))-1)]
    
    coherence_model = CoherenceModel(
        topics=topic_words, 
        texts=tokens, 
        corpus=corpus,
        dictionary=dictionary, 
        coherence='c_npmi'
    )
    return coherence_model.get_coherence()

# Function to calculate diversity using InvertedRBO
def calculate_diversity(topic_model):
    topics = topic_model.get_topics()
    topic_words = [[word for word, _ in words] for _, words in topics.items()]
    model_output = {"topics": topic_words}
    diversity_model = InvertedRBO()
    return diversity_model.score(model_output)

# Function to calculate perplexity from probabilities
def calculate_perplexity(probs):
    if probs is None or probs.size == 0:
        return float('inf')
    
    probs = np.clip(probs, 1e-10, None)
    log_perplexity = -1 * np.mean(np.log(np.sum(probs, axis=1)))
    return np.exp(log_perplexity)

# Objective function for Optuna optimization
def objective(trial):
    """
    Optuna objective function for hyperparameter optimization of BERTopic model.
    """
    # Fixed hyperparameters
    n_gram = 1
    n_clusters = 14
    n_components = 13
    n_neighbors = 15
    
    embedding_model = SentenceTransformer("BAAI/bge-base-en-v1.5")
    print(f'Starting trial with n_gram={n_gram}, n_clusters={n_clusters}, n_components={n_components}, n_neighbors={n_neighbors}')
    print('Model supported the max length of a document: ', embedding_model.max_seq_length)
    
    umap_model = UMAP(n_neighbors=n_neighbors, n_components=n_components, random_state=42)
    cluster_model = KMeans(n_clusters=n_clusters, random_state=42)
    vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, n_gram))
    representation_model = KeyBERTInspired(top_n_words=10, random_state=42)
    
    topic_model = BERTopic(
        embedding_model=embedding_model,
        top_n_words=10,
        umap_model=umap_model,
        hdbscan_model=cluster_model,
        vectorizer_model=vectorizer_model,
        representation_model=representation_model,
        calculate_probabilities=True
    )
    
    # Fit model and calculate scores
    topics, _ = topic_model.fit_transform(input_data)
    probs, _ = topic_model.approximate_distribution(input_data)

    coherence = calculate_coherence(topic_model, topics, input_data)
    diversity = calculate_diversity(topic_model)
    perplexity = calculate_perplexity(probs)
    
    # Save the topic information to a CSV file
    topic_model.get_topic_info().to_csv("bertopic_moo_topics_reddits.csv", index=False)
    
    return coherence, diversity, perplexity

# Load and preprocess data
print('Loading data...')
df = pd.read_json('/datasets/reddit/dcee_reddit.json')
df.drop_duplicates(subset=['title', 'selftext'], inplace=True)
data = [row.title + ' ' + str(row.selftext) for index, row in df.iterrows()]

print('Starting preprocessing...')
cleaned_data = []
model = spacy.load("en_core_web_sm")
cleaner = Cleaner(
    model,
    processing.remove_stopword_token,
    processing.remove_punctuation_token,
    processing.remove_email_token,
    processing.remove_url_token,
    processing.mutate_lemma_token,
)

# Clean and preprocess text
for html_text in data:
    soup = BeautifulSoup(html_text, 'html.parser')
    soup_text = soup.get_text().lower()
    cleaned_data.append(soup_text)

print('spaCy preprocess start!')
cleaned_data = cleaner.clean(cleaned_data)
print('spaCy preprocess done!')

# Split documents into chunks of max 512 words
input_data = split_documents_by_words(cleaned_data, max_words=512)
print('Document splitting complete.')

# Start the hyperparameter optimization
print('Starting hyperparameter optimisation...')
study = optuna.create_study(sampler=TPESampler(), directions=["maximize", "maximize", "minimize"])
study.optimize(objective, n_trials=1)


Loading data...
Starting preprocessing...
spaCy preprocess start!


Cleaning Progress: 100%|██████████| 708/708 [00:02<00:00, 279.46it/s]
[I 2024-09-04 15:49:14,104] A new study created in memory with name: no-name-7b8771c1-e523-478b-b9cb-c66568e9e7ee


spaCy preprocess done!
Document splitting complete.
Starting hyperparameter optimisation...
Starting trial with n_gram=1, n_clusters=14, n_components=13, n_neighbors=15
Model supported the max length of a document:  512


[I 2024-09-04 15:49:21,184] Trial 0 finished with values: [-0.2626720706778148, 0.855847886967033, 8.182748959518293] and parameters: {}. 


## Twitter

In [7]:
import os
import numpy as np
import pandas as pd
import optuna
import spacy
from bs4 import BeautifulSoup
from optuna.samplers import TPESampler
from spacy_cleaner import processing, Cleaner
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
from octis.evaluation_metrics.diversity_metrics import InvertedRBO
from umap import UMAP

# Set the TOKENIZERS_PARALLELISM environment variable to avoid warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Function to split documents into chunks with a maximum word count
def split_documents_by_words(documents, max_words=512):
    split_documents = []
    for doc in documents:
        words = doc.split()
        num_words = len(words)
        if num_words <= max_words:
            split_documents.append(doc)
        else:
            for i in range(0, num_words, max_words):
                segment = ' '.join(words[i:i + max_words])
                if segment.strip():
                    split_documents.append(segment)
    return split_documents 

# Function to calculate topic coherence using Gensim's CoherenceModel
def calculate_coherence(topic_model, topics, documents):
    vectoriser = topic_model.vectorizer_model
    analyser = vectoriser.build_analyzer()
    tokens = [analyser(doc) for doc in documents]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]
    topic_words = [[words for words, _ in topic_model.get_topic(topic)] for topic in range(len(set(topics))-1)]
    
    coherence_model = CoherenceModel(
        topics=topic_words, 
        texts=tokens, 
        corpus=corpus,
        dictionary=dictionary, 
        coherence='c_npmi'
    )
    return coherence_model.get_coherence()

# Function to calculate diversity using InvertedRBO
def calculate_diversity(topic_model):
    topics = topic_model.get_topics()
    topic_words = [[word for word, _ in words] for _, words in topics.items()]
    model_output = {"topics": topic_words}
    diversity_model = InvertedRBO()
    return diversity_model.score(model_output)

# Function to calculate perplexity from probabilities
def calculate_perplexity(probs):
    if probs is None or probs.size == 0:
        return float('inf')
    
    probs = np.clip(probs, 1e-10, None)
    log_perplexity = -1 * np.mean(np.log(np.sum(probs, axis=1)))
    return np.exp(log_perplexity)

# Objective function for Optuna optimization
def objective(trial):
    """
    Optuna objective function for hyperparameter optimization of BERTopic model.
    """
    n_gram = 1
    n_clusters = 8
    n_components = 7
    n_neighbors = 15
    
    embedding_model = SentenceTransformer("BAAI/bge-base-en-v1.5")
    print(f'Starting trial with n_gram={n_gram}, n_clusters={n_clusters}, n_components={n_components}, n_neighbors={n_neighbors}')
    print('Model supports the max length of a document: ', embedding_model.max_seq_length)
    
    umap_model = UMAP(n_neighbors=n_neighbors, n_components=n_components, random_state=42)
    cluster_model = KMeans(n_clusters=n_clusters, random_state=42)
    vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, n_gram))
    representation_model = KeyBERTInspired(top_n_words=10, random_state=42)
    
    topic_model = BERTopic(
        embedding_model=embedding_model,
        top_n_words=10,
        umap_model=umap_model,
        hdbscan_model=cluster_model,
        vectorizer_model=vectorizer_model,
        representation_model=representation_model,
    )
    
    topics, _ = topic_model.fit_transform(input_data)
    probs, _ = topic_model.approximate_distribution(input_data)
    
    coherence = calculate_coherence(topic_model, topics, input_data)
    diversity = calculate_diversity(topic_model)
    perplexity = calculate_perplexity(probs)
    
    topic_model.get_topic_info().to_csv("bertopic_moo_topics_twitter.csv", index=False)
    return coherence, diversity, perplexity

# Load and preprocess data
print('Loading data...')
df = pd.read_csv('/datasets/twitter/cleaned_tweets.csv', encoding='unicode_escape')
data = df['full_text']

# Preprocess
print('Starting preprocessing...')
cleaned_data = []
model = spacy.load("en_core_web_sm")
cleaner = Cleaner(
    model,
    processing.remove_stopword_token,
    processing.remove_punctuation_token,
    processing.remove_email_token,
    processing.remove_url_token,
    processing.mutate_lemma_token,
)

for html_text in data:
    soup = BeautifulSoup(html_text, 'html.parser')
    soup_text = soup.get_text().lower()
    cleaned_data.append(soup_text)

print('spaCy preprocess start!')
cleaned_data = cleaner.clean(cleaned_data)
print(f'Number of cleaned documents: {len(cleaned_data)}')

# Split documents into chunks
input_data = split_documents_by_words(cleaned_data, max_words=512)
print('Document splitting complete.')

# Start the hyperparameter optimization
print('Starting hyperparameter optimisation...')
study = optuna.create_study(sampler=TPESampler(), directions=["maximize", "maximize", "minimize"])
study.optimize(objective, n_trials=1)


Starting preprocessing...
spaCy preprocess start!


Cleaning Progress: 100%|██████████| 3921/3921 [00:06<00:00, 653.46it/s]
[I 2024-09-04 15:52:10,651] A new study created in memory with name: no-name-1a665a26-5fb0-44f0-ad49-a15476364159


Number of cleaned documents: 3921
Document splitting complete.
Starting hyperparameter optimisation...
Starting trial with n_gram=1, n_clusters=8, n_components=7, n_neighbors=15
Model supported the max length of a document:  512


[I 2024-09-04 15:52:36,672] Trial 0 finished with values: [-0.011112568487815349, 0.9486033017372449, 74.92023928231484] and parameters: {}. 


## Guardian

In [8]:
import os
import numpy as np
import pandas as pd
import optuna
import spacy
from bs4 import BeautifulSoup
from spacy_cleaner import processing, Cleaner
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.diversity_metrics import InvertedRBO
from umap import UMAP

# Set environment variable to avoid parallelism warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Function to split documents into chunks with a maximum word count
def split_documents_by_words(documents, max_words=512):
    split_documents = []
    for doc in documents:
        words = doc.split()
        if len(words) <= max_words:
            split_documents.append(doc)
        else:
            for i in range(0, len(words), max_words):
                segment = ' '.join(words[i:i + max_words])
                if segment.strip():
                    split_documents.append(segment)
    return split_documents

# Function to calculate topic coherence using c_npmi metric
def calculate_coherence(topic_model, documents):
    topics = topic_model.get_topics()
    topic_words = [[word for word, _ in words] for _, words in topics.items()]
    
    coherence_model = Coherence(
        texts=[doc.split() for doc in documents], 
        topk=10, 
        measure='c_npmi'
    )
    model_output = {"topics": topic_words}
    coherence_score = coherence_model.score(model_output)
    
    return coherence_score

# Function to calculate topic diversity using InvertedRBO metric
def calculate_diversity(topic_model):
    topics = topic_model.get_topics()
    topic_words = [[word for word, _ in words] for _, words in topics.items()]
    
    model_output = {"topics": topic_words}
    diversity_model = InvertedRBO()
    diversity_score = diversity_model.score(model_output)
    
    return diversity_score

# Function to calculate perplexity
def calculate_perplexity(probs):
    if probs is None or probs.size == 0:
        return float('inf')
    
    probs = np.clip(probs, 1e-10, None)  # Avoid division by zero
    log_perplexity = -1 * np.mean(np.log(np.sum(probs, axis=1)))
    
    return np.exp(log_perplexity)

# Objective function for Optuna optimization
def objective(trial):
    n_gram = 1
    n_clusters = 20
    n_components = 11
    n_neighbors = 19
    
    embedding_model = SentenceTransformer("BAAI/bge-base-en-v1.5")
    umap_model = UMAP(n_neighbors=n_neighbors, n_components=n_components, random_state=42)
    cluster_model = KMeans(n_clusters=n_clusters, random_state=42)
    vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, n_gram))
    representation_model = KeyBERTInspired(top_n_words=10, random_state=42)
    
    topic_model = BERTopic(
        embedding_model=embedding_model,
        top_n_words=10,
        umap_model=umap_model,
        hdbscan_model=cluster_model,
        vectorizer_model=vectorizer_model,
        representation_model=representation_model,
        calculate_probabilities=True
    )
    
    topic_model.fit_transform(input_data)
    probs, _ = topic_model.approximate_distribution(input_data)
    
    coherence = calculate_coherence(topic_model, input_data)
    diversity = calculate_diversity(topic_model)
    perplexity = calculate_perplexity(probs)
    
    topic_model.get_topic_info().to_csv("bertopic_moo_topics_guardian.csv", index=False)
    
    return coherence, diversity, perplexity

# Load and preprocess data
df = pd.read_json('/datasets/theguardian/dcee_guardian', lines=True)
df.drop_duplicates(subset=['title'], inplace=True)
data = [f"{row.title} {str(row.content['body'])}" for _, row in df.iterrows()]

# Text cleaning and preprocessing
cleaned_data = []
model = spacy.load("en_core_web_sm")
cleaner = Cleaner(
    model,
    processing.remove_stopword_token,
    processing.remove_punctuation_token,
    processing.remove_email_token,
    processing.remove_url_token,
    processing.mutate_lemma_token,
)

for html_text in data:
    soup = BeautifulSoup(html_text, 'html.parser')
    soup_text = soup.get_text().lower()
    cleaned_data.append(soup_text)

cleaned_data = cleaner.clean(cleaned_data)
input_data = split_documents_by_words(cleaned_data, max_words=512)

# Hyperparameter optimization using Optuna
study = optuna.create_study(
    directions=["maximize", "maximize", "minimize"], 
    sampler=optuna.samplers.TPESampler()
)
study.optimize(objective, n_trials=1)


Cleaning Progress: 100%|██████████| 17477/17477 [37:31<00:00,  7.76it/s]  
[I 2024-09-04 16:34:40,987] A new study created in memory with name: no-name-180a9c0c-7ac5-458d-af61-9ed7ed38019a
[I 2024-09-04 16:41:56,737] Trial 0 finished with values: [0.13809031691180196, 0.9886139268051504, 1.4469154722396265] and parameters: {}. 
