# In [None]:
import re
import numpy as np
import pandas as pd
import scipy.sparse as ss
from bs4 import BeautifulSoup
import spacy
from spacy_cleaner import processing, Cleaner
from corextopic import corextopic as ct
from corextopic import vis_topic as vt
from gensim.corpora.dictionary import Dictionary
from gensim import corpora
from sklearn.feature_extraction.text import CountVectorizer
import optuna
from octis.evaluation_metrics.coherence_metrics import Coherence


def split_documents_by_words(documents, max_words=512):
    """
    Split every document whose word count exceeds max_words into chunks.

    Words are delimited by arbitrary whitespace (``str.split()``). A document
    with at most ``max_words`` words is returned unchanged (original
    whitespace included). A longer document is cut into consecutive chunks of
    at most ``max_words`` words, each chunk re-joined with single spaces.
    No empty chunk is ever produced, including when the word count is an
    exact multiple of ``max_words``.

    Args:
        documents (list): List of documents as strings.
        max_words (int): Maximum number of words for each document.

    Returns:
        list: List of split documents, in the original document order.

    Raises:
        ValueError: If max_words is less than 1.
    """
    if max_words < 1:
        raise ValueError("max_words must be a positive integer")

    split_documents = []
    for doc in documents:
        words = doc.split()
        if len(words) <= max_words:
            split_documents.append(doc)
            continue
        # Step through the word list in strides of max_words; the range stops
        # before len(words), so the final slice is never empty.
        for start_idx in range(0, len(words), max_words):
            segment = words[start_idx:start_idx + max_words]
            split_documents.append(' '.join(segment))
    return split_documents

# Load the article dump (JSON Lines: one record per line) and keep only the
# first row for each distinct title.
df = pd.read_json(
    '/home/yy2046/Workspace/DCEE2023/datasets/theguardian/all_keywords_data/guardian_all_data',
    lines=True,
)
df.drop_duplicates(subset=['title'], inplace=True)

# One raw text per article: the title, a space, then the stringified body
# taken from the 'body' entry of the 'content' column.
data = [
    title + ' ' + str(content['body'])
    for title, content in zip(df['title'], df['content'])
]

# --- preprocess ---
# spaCy-based cleaner configured with stop-word, punctuation, e-mail and URL
# removal steps followed by a lemma mutation step.
model = spacy.load("en_core_web_sm")
cleaner = Cleaner(
    model,
    processing.remove_stopword_token,
    processing.remove_punctuation_token,
    processing.remove_email_token,
    processing.remove_url_token,
    processing.mutate_lemma_token,
)

# Strip HTML markup from every raw article and lowercase the remaining text.
cleaned_data = [
    BeautifulSoup(raw_html, 'html.parser').get_text().lower()
    for raw_html in data
]

print('spaCy preprocess start!')
cleaned_data = cleaner.clean(cleaned_data)
print('spaCy preprocess done!')

# Cap each cleaned document at 512 words; longer ones become several entries.
docs_list = split_documents_by_words(cleaned_data, max_words=512)

# Binary bag-of-words features (presence/absence), limited to 20000 terms,
# with scikit-learn's English stop-word list applied.
vectorizer = CountVectorizer(stop_words='english', max_features=20000, binary=True)
doc_word = ss.csr_matrix(vectorizer.fit_transform(docs_list))
words = list(np.asarray(vectorizer.get_feature_names_out()))

# One anchor word for each of the first three topics.
seed_topic_list = [["reduce"], ["reuse"], ["recycle"]]

# 20-topic CorEx model, fitted with the anchors above.
topic_model = ct.Corex(n_hidden=20, words=words, verbose=False, seed=42)
topic_model.fit(
    doc_word,
    words=words,
    anchors=seed_topic_list,
    anchor_strength=5.5,
)

# --- coherence computation ---
# Reference corpus for the coherence metric: every cleaned (unsplit) document
# tokenised on single spaces.
corpus = [doc.split(' ') for doc in cleaned_data]
npmi = Coherence(texts=corpus, topk=10, measure='c_npmi')

# Fetch the topics once; `topics` is kept as a second name for the same list
# so that any later code using either name keeps working.
results = topic_model.get_topics()
topics = results

# Each topic entry is a tuple whose first field is the word; keep words only.
extracted_words = [[item[0] for item in topic] for topic in results]

# Print from the extracted word lists so a topic with no words prints an
# empty line instead of failing on tuple unpacking.
for n, topic_words in enumerate(extracted_words):
    print('{}: '.format(n) + ', '.join(topic_words))

# Scoring is best-effort: on failure fall back to the -99 sentinel, but report
# the reason and let KeyboardInterrupt / SystemExit propagate.
# NOTE(review): presumably fails when a topic has fewer than `topk` words or
# contains a word missing from `corpus` — confirm against the metric's docs.
try:
    npmi_score = npmi.score({'topics': extracted_words})
except Exception as exc:
    print('NPMI coherence could not be computed: {!r}'.format(exc))
    npmi_score = -99

print(npmi_score)