In [None]:
import os
import re
import json
import pandas as pd
import gensim
import nltk
from sklearn.model_selection import train_test_split
from octis.models.NMF import NMF
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.coherence_metrics import Coherence
from sklearn import metrics
import numpy as np

# Load Data

In [None]:
# Load the BBC News dataset
bbc_news = pd.read_csv("/home/patsias/Essential Text/Comparing-Different-Topic-Modeling-Methods-on-News/bbc-news-data.csv", sep="\t")
bbc_news['text'] = bbc_news['title'] + " " + bbc_news['content']

In [None]:
# Split the dataset into training and test sets
train, test = train_test_split(bbc_news, test_size=0.2, random_state=42)
train['div'] = 'train'
test['div'] = 'test'


bbc_news_split = pd.concat([train, test]).reset_index(drop=True)

In [80]:
bbc_news_split.groupby(['div','category']).size().reset_index()

Unnamed: 0,div,category,0
0,test,business,115
1,test,entertainment,72
2,test,politics,76
3,test,sport,102
4,test,tech,80
5,train,business,395
6,train,entertainment,314
7,train,politics,341
8,train,sport,409
9,train,tech,321


In [81]:
import re
import string
import nltk
from nltk.corpus import stopwords


nltk.download('stopwords')

# Get the list of English stopwords
stop_words = set(stopwords.words('english')) 

# Define the preprocessing function
def preprocess_text(text):
    # Remove punctuation using regex
    text = text.lower()
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text) 
    # Remove wite spaces
    text = ' '.join(text.split())
    # Tokenize words, remove stopwords, and convert back to string
    words = text.split() 
    words = [word for word in words if word not in stop_words]  
    
    # Return preprocessed text as a single string
    return " ".join(words)   




[nltk_data] Downloading package stopwords to
[nltk_data]     /home/patsias/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [82]:
bbc_news_split['preprocessed_text'] = bbc_news_split['text'].apply(preprocess_text)


In [None]:
bbc_news_split['tok'] = bbc_news_split['preprocessed_text'].apply(lambda x: set(x.split()))
	

train_docs = bbc_news_split[bbc_news_split['div']=='train']['tok'].to_numpy() 
dictionary = gensim.corpora.Dictionary(train_docs) 

bbc_news_split['corpus'] = [dictionary.doc2bow(doc) for doc in bbc_news_split['tok'].to_numpy()]

# Train the Model

In [84]:
from sklearn import metrics
def q_metrics(y_true1, y_pred1,my_model=None):
    contigency_matrix = metrics.cluster.contingency_matrix(y_true1, y_pred1)
    purity = np.sum(np.amax(contigency_matrix, axis=0)) / np.sum(contigency_matrix)
    print('purity_score:',purity)
    print('NMI:',metrics.normalized_mutual_info_score(y_true1, y_pred1))

In [None]:

os.makedirs('bbc_octis', exist_ok=True)

# Save the vocabulary (for OCTIS)
vocab_length = len(dictionary)
with open("bbc_octis/vocabulary.txt", "w", encoding='utf8') as f:
    for i in range(vocab_length):
        f.write(dictionary[i] + '\n')

# Create Bag-of-Words (BoW) representation
bow_corpus = [dictionary.doc2bow(doc) for doc in bbc_news_split['tok']]
bbc_news_split['corpus'] = bow_corpus


# Save the corpus as 'corpus.tsv'
bbc_news_split[['preprocessed_text', 'div', 'category']].to_csv("bbc_octis/corpus.tsv", sep='\t', index=False, header=False)

# Create metadata.json for categories
labels = bbc_news_split['category'].tolist()

# Write labels to metadata.json
metadata = {"labels": labels}
with open("bbc_octis/metadata.json", "w") as outfile:
    json.dump(metadata, outfile)

print("metadata.json has been successfully created.")

metadata.json has been successfully created.


In [None]:
import time
def calculate_coherence_score(num_topics, kappa, w_max_iter, h_max_iter):
    # Initialize model
    model = NMF(num_topics=num_topics, chunksize=2000, passes=10, kappa=kappa,
                minimum_probability=0.01, w_max_iter=w_max_iter,
                w_stop_condition=0.0001, h_max_iter=h_max_iter, h_stop_condition=0.001,
                eval_every=10, normalize=True)

    # Load dataset
    bbc_dataset = Dataset()
    bbc_dataset.load_custom_dataset_from_folder('bbc_octis')

    # Measure training time
    start_train = time.time()
    nmf_output = model.train_model(bbc_dataset)
    end_train = time.time()
    training_time = end_train - start_train
    print(f"Training Time: {training_time:.2f} seconds")

    # Measure response time
    start_response = time.time()
    test_res = nmf_output['test-topic-document-matrix'].T
    pred_test = [np.argmax(res) for res in test_res]
    end_response = time.time()
    response_time = end_response - start_response
    print(f"Response Time: {response_time:.2f} seconds")

    # Load true labels from the dataset
    df = pd.read_csv("bbc_octis/corpus.tsv", sep='\t', header=None)
    y_true = df[df[1] == 'test'][2].values  

    # Evaluate metrics
    q_metrics(y_true, pred_test)

    # Calculate Coherence
    coherence = Coherence(texts=bbc_dataset.get_corpus(), topk=10, measure='c_v')
    coherence_score = coherence.score(nmf_output)
    print(f"Coherence Score: {coherence_score}")

    return training_time, response_time, coherence_score


# List containing various hyperparameters
kappa = [0.5, 1.0, 1.5, 2.0]
w_max_iter = [50, 100, 150, 200]
h_max_iter = [50, 100, 150, 200]

for w in w_max_iter:
    for h in h_max_iter:
        for k in kappa:
            training_time, response_time, coherence_score = calculate_coherence_score(num_topics=5, kappa=k, w_max_iter=w, h_max_iter=h)
            print(f"w_max_iter : {w} ; kappa : {k} ; h_max_iter : {h} ")
            print(f"Training Time: {training_time:.2f} seconds, Response Time: {response_time:.2f} seconds, Coherence Score: {coherence_score}\n")

Training Time: 11.11 seconds
Response Time: 0.00 seconds
purity_score: 0.5370786516853933
NMI: 0.2996307242242828
Coherence Score: 0.5217211469517644
w_max_iter : 50 ; kappa : 0.5 ; h_max_iter : 50 
Training Time: 11.11 seconds, Response Time: 0.00 seconds, Coherence Score: 0.5217211469517644

Training Time: 11.74 seconds
Response Time: 0.00 seconds
purity_score: 0.501123595505618
NMI: 0.3123076630433977
Coherence Score: 0.5102706198211748
w_max_iter : 50 ; kappa : 1.0 ; h_max_iter : 50 
Training Time: 11.74 seconds, Response Time: 0.00 seconds, Coherence Score: 0.5102706198211748

Training Time: 9.46 seconds
Response Time: 0.00 seconds
purity_score: 0.5685393258426966
NMI: 0.3107148370175145
Coherence Score: 0.5034062864780905
w_max_iter : 50 ; kappa : 1.5 ; h_max_iter : 50 
Training Time: 9.46 seconds, Response Time: 0.00 seconds, Coherence Score: 0.5034062864780905

Training Time: 10.78 seconds
Response Time: 0.00 seconds
purity_score: 0.3595505617977528
NMI: 0.06001254845283924
Coh

# Best Model
### best parameter:
w_max_iter = 100,  kappa = 1.0,  h_max_iter = 200

In [110]:
w_max_iter = 100 ; kappa = 1.0 ; h_max_iter = 200
model=NMF(num_topics=5, chunksize=2000, passes=15, kappa=kappa,
                minimum_probability=0.02, w_max_iter=w_max_iter,
                w_stop_condition=0.0001, h_max_iter= h_max_iter, h_stop_condition=0.001,
                eval_every=10, normalize=True)

bbc_dataset = Dataset()
bbc_dataset.load_custom_dataset_from_folder('bbc_octis')
nmf_output = model.train_model(bbc_dataset)

test_res = nmf_output['test-topic-document-matrix'].T
pred_test = [np.argmax(res) for res in test_res]

# Load true labels from the dataset
df = pd.read_csv("bbc_octis/corpus.tsv", sep='\t', header=None)
y_true = df[df[1] == 'test'][2].values  # Assuming 2nd column is 'category'

y_pred = pred_test
q_metrics(y_true, y_pred)


# evaluate model using Topic Coherence score
coherence = Coherence(texts=bbc_dataset.get_corpus(), topk=10, measure='c_v')
coherence_score = coherence.score(nmf_output)
print(f"Coherence Score: {coherence_score}")

purity_score: 0.7640449438202247
NMI: 0.5192765460661736
Coherence Score: 0.5209538731358855


In [111]:
train_res = nmf_output['topic-document-matrix'].T
pred = [np.argmax(res) for res in train_res]


In [None]:
from scipy import stats
  
#Create a DataFrame for training data
train_df = pd.DataFrame({
    'topic': pred,
    'label': bbc_news_split[bbc_news_split['div']=='train']['category']
})

#Group by 'topic' and count how many of each label there are for each topic in the training data
train_topic_label_counts = train_df.groupby(['topic', 'label']).size().unstack(fill_value=0)

#Assign the mode (most frequent label) for each topic in the training data
topic_to_mode_label = train_df.groupby('topic')['label'].agg(lambda x: x.mode().iloc[0])

#Map the predicted test topics to actual labels using the mapping from training data
mapped_test_labels = [topic_to_mode_label.get(topic, None) for topic in pred_test]

# Create a DataFrame to combine mapped test labels and actual test labels
test_results_df = pd.DataFrame({
    'predicted_label': mapped_test_labels,
    'true_label':bbc_news_split[bbc_news_split['div']=='test']['category']
})

# Metrics

In [113]:
from sklearn.metrics import classification_report


q_metrics(test_results_df['predicted_label'], test_results_df['true_label'])
print(classification_report(test_results_df['predicted_label'], test_results_df['true_label']))

purity_score: 0.7640449438202247
NMI: 0.5192765460661736
               precision    recall  f1-score   support

     business       0.83      0.69      0.75       137
entertainment       0.38      0.93      0.53        29
     politics       0.88      0.71      0.78        95
        sport       0.86      0.79      0.83       111
         tech       0.79      0.86      0.82        73

     accuracy                           0.76       445
    macro avg       0.75      0.80      0.74       445
 weighted avg       0.81      0.76      0.78       445

