In [None]:
import re
import pandas as pd
from bs4 import BeautifulSoup
import optuna
import spacy
import spacy_fastlang
from spacy_cleaner import processing, Cleaner
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired
from octis.evaluation_metrics.coherence_metrics import Coherence
from umap import UMAP
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

In [3]:
def split_documents_by_words(documents, max_words=512):
    """
    Split documents if one document's word count is over than max_words.
    
    Args:
        documents (list): List of documents as strings.
        max_words (int): Maximum number of words for each document.
    
    Returns:
        list: List of split documents.
    """
    split_documents = []
    for doc in documents:
        words = doc.split()
        num_words = len(words)
        if num_words <= max_words:
            split_documents.append(doc)
        else:
            # Split document into segments of max_words
            num_segments = num_words // max_words
            for i in range(num_segments + 1):
                start_idx = i * max_words
                end_idx = (i + 1) * max_words
                if ' '.join(words[start_idx:end_idx]) != '' or ' '.join(words[start_idx:end_idx]) != ' ':
                    split_documents.append(' '.join(words[start_idx:end_idx]))
    return split_documents 


df = pd.read_csv('/home/yy2046/Workspace/DCEE2023/datasets/twitter/twitter_junhao.csv', encoding='unicode_escape')
data = df['full_text']
    
''' preprocess '''
cleaned_data = []
model = spacy.load("en_core_web_sm")
cleaner = Cleaner( 
    model,
    processing.remove_stopword_token,
    processing.remove_punctuation_token,
    processing.remove_email_token,
    processing.remove_url_token,
    processing.mutate_lemma_token,
    
)

for html_text in data:
    soup = BeautifulSoup(html_text, 'html.parser')
    soup_text = soup.get_text().lower()
    cleaned_data.append(soup_text)

print('spaCy preprocess start!')
cleaned_data = cleaner.clean(cleaned_data)
print(len(cleaned_data))

input_data = split_documents_by_words(cleaned_data, max_words=512)
embedding_model = SentenceTransformer("BAAI/bge-base-en-v1.5")
print('Model supported the max length of a document: ', embedding_model.max_seq_length)
umap_model = UMAP(n_neighbors=10, n_components=10, random_state=42)
cluster_model = KMeans(n_clusters=10, random_state=42)
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 2))
representation_model = KeyBERTInspired(top_n_words=10, random_state=42)

topic_model = BERTopic(
    embedding_model=embedding_model,
    top_n_words=10,
    umap_model=umap_model,
    hdbscan_model=cluster_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,

)
topics, probs = topic_model.fit_transform(input_data)
print(topic_model.get_topic_info())

'''coherence computation'''
vectorizer = topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()
words = vectorizer.get_feature_names()
tokens = [analyzer(doc) for doc in cleaned_data]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]
topic_words = [[words for words, _ in topic_model.get_topic(topic)] 
            for topic in range(len(set(topics))-1)]
coherence_model = CoherenceModel(topics=topic_words, 
                                texts=tokens, 
                                corpus=corpus,
                                dictionary=dictionary, 
                                coherence='c_npmi')
coherence = coherence_model.get_coherence()

print(coherence)

spaCy preprocess start!


Cleaning Progress: 100%|██████████| 3922/3922 [00:05<00:00, 660.17it/s]


3922
Model supported the max length of a document:  512
   Topic  Count                                               Name
0      0   1264  0_circular economy_sustainability circularecon...
1      1   1065  1_circulareconomy_sustainability circularecono...
2      2    862  2_sustainablepackage recyclability_recyclabili...
3      3    484  3_monday charitynee_supportlocal mondaythought...
4      4    112  4_ban plastic_plasticwaste export_circularecon...
5      5     34  5_router processor_processor cisco_cisco 12000...
6      6     30          6_forsale cisco_c3650 48td_ws c3650_c3650
7      7     30      7_10xge port_adapter xfp_xfp port_spa 1xtenge
8      8     25  8_extreme network_24pt poe_switch extreme_summ...
9      9     16    9_roast cup_roaster chef_roaster_coffee roaster


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

0.13538543090342803


In [4]:
topic_model.get_topics()
for i in range(10):
    tmp = ''
    for j in topic_model.get_topic(i):
        tmp = tmp + j[0] + ','
    print(tmp)

circular economy,sustainability circulareconomy,circulareconomy sustainability,recycling,circulareconomy,recycle,wastemanagement,circular,sustainable,sustainability,
circulareconomy,sustainability circulareconomy,circulareconomy sustainability,circulareconomy circulareconomy,circular economy,sustainable future,circularity,circular,create sustainable,sustainable,
sustainablepackage recyclability,recyclability package,circulareconomy recycledmaterial,sustainablepackage,recycledmaterial resourceefficiency,package sustainability,recycle sustainability,sustainability circulareconomy,circulareconomy sustainability,circulareconomy recycle,
monday charitynee,supportlocal mondaythought,charity need,need charity,charity circulareconomy,work charity,support need,need monday,support amazing,help support,
ban plastic,plasticwaste export,circulareconomy endwastecolonialism,plastic waste,endwastecolonialism tell,wasteshipment breakfreefromplastic,mess wastetrade,waste export,petition wastetrade,waste

In [4]:
topic_model.save('bert_twitter_model', save_embedding_model=True)