In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import optuna
import spacy
from spacy_cleaner import processing, Cleaner
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired
from octis.evaluation_metrics.coherence_metrics import Coherence
from umap import UMAP
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

In [None]:
def split_documents_by_words(documents, max_words=512):
    """
    Split every document that has more than ``max_words`` words into chunks.

    Words are obtained with ``str.split()``, i.e. any run of whitespace is a
    separator. A document with at most ``max_words`` words is kept unchanged
    (its original whitespace included). A longer document is replaced, in
    place order, by consecutive chunks of up to ``max_words`` words, each
    chunk re-joined with single spaces.

    Args:
        documents (list): List of documents as strings.
        max_words (int): Maximum number of words for each returned document.
            Must be at least 1.

    Returns:
        list: List of documents, none of which is longer than ``max_words``
        words. No empty chunk is ever produced by the splitting.

    Raises:
        ValueError: If ``max_words`` is smaller than 1.
    """
    if max_words < 1:
        raise ValueError("max_words must be a positive integer")

    split_documents = []
    for doc in documents:
        words = doc.split()
        if len(words) <= max_words:
            split_documents.append(doc)
            continue
        # Stepping through the word list in strides of max_words never yields
        # an empty slice, so a document whose length is an exact multiple of
        # max_words does not produce a trailing '' chunk.
        for start_idx in range(0, len(words), max_words):
            split_documents.append(' '.join(words[start_idx:start_idx + max_words]))
    return split_documents
 

# Load the Guardian dump (one JSON record per line) and keep a single article
# per title.
df = pd.read_json('/home/yy2046/Workspace/DCEE2023/datasets/theguardian/all_keywords_data/guardian_all_data', lines=True)
df = df.drop_duplicates(subset=['title'])

# One raw document per article: the title followed by the article body.
data = [
    title + ' ' + str(content['body'])
    for title, content in zip(df['title'], df['content'])
]

''' preprocess '''
# Step 1: strip the HTML markup and lower-case the remaining text.
cleaned_data = [
    BeautifulSoup(html_text, 'html.parser').get_text().lower()
    for html_text in data
]

# Step 2: token-level cleaning with spaCy. The processors are applied in the
# order they are listed here.
model = spacy.load("en_core_web_sm")
token_processors = (
    processing.remove_stopword_token,
    processing.remove_punctuation_token,
    processing.remove_email_token,
    processing.remove_url_token,
    processing.mutate_lemma_token,
)
cleaner = Cleaner(model, *token_processors)

print('spaCy preprocess start!')
cleaned_data = cleaner.clean(cleaned_data)
print('spaCy preprocess done!')

# Step 3: cap every document at 512 words by splitting the longer ones.
input_data = split_documents_by_words(cleaned_data, max_words=512)

# Number of topics. Shared by the KMeans clusterer and the per-topic export
# loop below so the two cannot drift apart.
N_TOPICS = 20

# --- Model components -------------------------------------------------------
embedding_model = SentenceTransformer("BAAI/bge-base-en-v1.5")
print('Model supported the max length of a document: ', embedding_model.max_seq_length)
umap_model = UMAP(n_neighbors=10, n_components=15, random_state=42)
cluster_model = KMeans(n_clusters=N_TOPICS, random_state=42)
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 1))
representation_model = KeyBERTInspired(top_n_words=10, random_state=42)

# KMeans is handed to BERTopic through the `hdbscan_model` argument, which is
# the slot BERTopic uses for its clustering step.
topic_model = BERTopic(
    embedding_model=embedding_model,
    top_n_words=10,
    umap_model=umap_model,
    hdbscan_model=cluster_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
)

# --- Fit ----------------------------------------------------------------------
topics, probs = topic_model.fit_transform(input_data)

# --- Export -------------------------------------------------------------------
# Build the topic overview once and reuse it for both the printout and the CSV.
topic_info = topic_model.get_topic_info()
print(topic_info)
topic_info.to_csv('topic_modelling_topic_info.csv')
topic_model.get_document_info(input_data).to_csv(
    'topic_modelling_docs_info.csv')

# One spreadsheet per topic holding its (word, score) representation.
for topic_id in range(N_TOPICS):
    topic = topic_model.get_topic(topic_id)
    pd.DataFrame(topic, columns=['word', 'prob']).to_excel(
        'topic_bert' + str(topic_id) + '.xlsx')
    print(topic)
# Coherence computation (c_npmi) for the fitted topics.
# Documents are tokenised with the same analyzer the topic model's
# CountVectorizer uses.
vectorizer = topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()
tokens = [analyzer(doc) for doc in cleaned_data]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in tokens]

# Score every topic that was actually assigned. Only the outlier label -1 is
# skipped when it is present; subtracting one from the topic count
# unconditionally would drop a real topic when the clusterer (KMeans here)
# never produces -1.
topic_ids = sorted(topic_id for topic_id in set(topics) if topic_id != -1)
topic_words = [
    [word for word, _ in topic_model.get_topic(topic_id)]
    for topic_id in topic_ids
]

coherence_model = CoherenceModel(topics=topic_words,
                                 texts=tokens,
                                 corpus=corpus,
                                 dictionary=dictionary,
                                 coherence='c_npmi')
coherence = coherence_model.get_coherence()

print(coherence)

In [6]:
# Gather the word list of every exported topic spreadsheet into one table.
results_dir = '/home/yy2046/Workspace/DCEE2023/results/bert_selected_hps_guardian_res/'

results = []
for i in range(20):
    path = results_dir + 'topic_bert' + str(i) + '.xlsx'
    # A dedicated name is used so the corpus DataFrame `df` is not overwritten.
    topic_df = pd.read_excel(path)
    top_words = topic_df['word'].tolist()
    print(top_words)
    results.append(top_words)

# Write the combined table once, after all topics have been read, instead of
# rewriting the same file on every iteration.
pd.DataFrame(results).to_csv(results_dir + '111.csv')

['australia', 'australian', 'nsw', 'election', 'labor', 'bst', 'senator', 'coalition', 'need', 'leader']
['renewable', 'carbon', 'energy', 'emission', 'international', 'environment', 'world', 'resource', 'goal', 'climate']
['energy', 'renewable', 'uk', 'gas', 'fuel', 'carbon', 'solar', 'britain', 'emission', 'electricity']
['mp', 'corbyn', 'eu', 'britain', 'tory', 'uk', 'brexit', 'labour', 'conservative', 'british']
['recycling', 'recycle', 'recyclable', 'plastic', 'waste', 'reuse', 'circular', 'packaging', 'environment', 'environmental']
['sustainable', 'environment', 'farming', 'green', 'land', 'grow', 'plan', 'uk', 'farmer', 'produce']
['sustainability', 'sustainable', 'environment', 'approach', 'resource', 'ethical', 'consumer', 'environmental', 'growth', 'future']
['australia', 'energy', 'renewable', 'nsw', 'australian', '2050', 'emission', 'gas', 'fuel', 'carbon']
['candidate', 'biden', 'trump', 'election', 'voter', 'republicans', 'official', 'democrat', 'campaign', 'senator']
['