In [82]:
import os
import re
import json
import pandas as pd
import gensim
import nltk
from sklearn.model_selection import train_test_split
from octis.models.NMF import NMF
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.coherence_metrics import Coherence
from sklearn import metrics
import numpy as np

### Load Data

In [83]:
# Load the BBC News dataset.
# The location defaults to the original local path but can be overridden with the
# BBC_NEWS_PATH environment variable, so the notebook is not tied to one machine.
bbc_news_path = os.environ.get(
    "BBC_NEWS_PATH",
    "/home/patsias/Essential Text/Comparing-Different-Topic-Modeling-Methods-on-News/bbc-news-data.csv",
)
bbc_news = pd.read_csv(bbc_news_path, sep="\t")

# Concatenate title and body. A missing title or content is treated as an empty
# string so 'text' is always a str (a NaN here would break text.lower() later).
bbc_news['text'] = bbc_news['title'].fillna("") + " " + bbc_news['content'].fillna("")

### Splitting the Data into Training and Test Sets (20% Test Portion)

In [84]:
# Split the dataset into training and test sets (20% test, fixed seed)
train, test = train_test_split(bbc_news, test_size=0.2, random_state=42)

# Tag each partition with its split name. assign() returns a new frame, so nothing
# is written into a frame derived from bbc_news (item assignment on such a frame
# can raise pandas' SettingWithCopyWarning).
train = train.assign(div='train')
test = test.assign(div='test')

# Training rows first, then test rows, re-indexed 0..n-1.
bbc_news_split = pd.concat([train, test]).reset_index(drop=True)

In [85]:
# Number of articles per (split, category) pair; the unnamed count column is labelled 0.
bbc_news_split.groupby(['div','category']).size().reset_index()

Unnamed: 0,div,category,0
0,test,business,115
1,test,entertainment,72
2,test,politics,76
3,test,sport,102
4,test,tech,80
5,train,business,395
6,train,entertainment,314
7,train,politics,341
8,train,sport,409
9,train,tech,321


In [86]:
import re
import string
import nltk
from nltk.corpus import stopwords


# Fetch the NLTK stopword corpus (the captured log shows it is a no-op when already up to date).
nltk.download('stopwords')

# Get the list of English stopwords, held as a set because preprocess_text
# does one membership test per word.
stop_words = set(stopwords.words('english')) 

# Compiled once: matches any single character from string.punctuation.
_PUNCTUATION_RE = re.compile(f"[{re.escape(string.punctuation)}]")


def preprocess_text(text, stopword_set=None):
    """Lower-case ``text``, replace punctuation with spaces and drop stopwords.

    Parameters
    ----------
    text : str
        Raw document text.
    stopword_set : collection of str, optional
        Words to remove. When omitted, the notebook-level ``stop_words`` set is
        used, which is the original behaviour.

    Returns
    -------
    str
        The remaining lower-cased words joined by single spaces.
    """
    if stopword_set is None:
        stopword_set = stop_words

    # Punctuation becomes a space rather than being deleted, so "a.b" splits
    # into two words instead of fusing into "ab".
    text = _PUNCTUATION_RE.sub(" ", text.lower())

    # split() with no argument already collapses runs of whitespace, so no
    # separate normalisation pass is needed before tokenising.
    kept_words = [word for word in text.split() if word not in stopword_set]

    return " ".join(kept_words)




[nltk_data] Downloading package stopwords to
[nltk_data]     /home/patsias/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [87]:
# Clean every article with preprocess_text and store the resulting string.
bbc_news_split['preprocessed_text'] = bbc_news_split['text'].map(preprocess_text)


In [88]:
# Tokenise into lists, not sets: doc2bow counts occurrences, and a set would
# collapse every term frequency to 1.
bbc_news_split['tok'] = bbc_news_split['preprocessed_text'].apply(str.split)

# The vocabulary is built from the training documents only.
train_docs = bbc_news_split.loc[bbc_news_split['div'] == 'train', 'tok'].to_numpy()
dictionary = gensim.corpora.Dictionary(train_docs)

# Bag-of-words (token id, count) pairs for every document, train and test alike.
bbc_news_split['corpus'] = [dictionary.doc2bow(doc) for doc in bbc_news_split['tok'].to_numpy()]

In [89]:
from sklearn import metrics
def q_metrics(y_true1, y_pred1,my_model=None):
    """Print the purity and NMI of predicted cluster ids against true labels.

    ``my_model`` is accepted but not used. Nothing is returned.
    """
    # Contingency table of true labels against predicted clusters.
    table = metrics.cluster.contingency_matrix(y_true1, y_pred1)

    # Purity: take the largest entry along axis 0, sum those maxima and divide
    # by the total number of samples in the table.
    dominant_counts = table.max(axis=0)
    purity = dominant_counts.sum() / table.sum()
    print('purity_score:',purity)

    nmi = metrics.normalized_mutual_info_score(y_true1, y_pred1)
    print('NMI:',nmi)

In [90]:

# Folder that is later loaded with load_custom_dataset_from_folder('bbc_octis').
os.makedirs('bbc_octis', exist_ok=True)

# Save the vocabulary (for OCTIS): one token per line, in dictionary-id order.
vocab_length = len(dictionary)
with open("bbc_octis/vocabulary.txt", "w", encoding='utf8') as f:
    f.writelines(dictionary[i] + '\n' for i in range(vocab_length))

# Create Bag-of-Words (BoW) representation
bow_corpus = list(map(dictionary.doc2bow, bbc_news_split['tok']))
bbc_news_split['corpus'] = bow_corpus

# Save the corpus as 'corpus.tsv': text, split name, category -- no header, no index.
bbc_news_split.loc[:, ['preprocessed_text', 'div', 'category']].to_csv(
    "bbc_octis/corpus.tsv",
    sep='\t',
    index=False,
    header=False,
)

# Write the category of every document (in corpus order) to metadata.json.
labels = bbc_news_split['category'].to_list()
metadata = {"labels": labels}
with open("bbc_octis/metadata.json", "w") as outfile:
    json.dump(metadata, outfile)

print("metadata.json has been successfully created.")

metadata.json has been successfully created.


### NMF

In [91]:
# Number of topics passed to NMF as num_topics; the split summary above lists five categories.
TOPICS = 5

### Optimizing Parameter Selection for Best Model Performance

In [109]:
import time
def calculate_coherence_score(num_topics, kappa, w_max_iter, h_max_iter):
    """Train one OCTIS NMF model on the 'bbc_octis' dataset and evaluate it.

    Prints purity and NMI for the test partition (through ``q_metrics``) and
    the c_v coherence of the top-10 words per topic.

    Parameters
    ----------
    num_topics : int
        Number of topics handed to NMF.
    kappa, w_max_iter, h_max_iter
        Forwarded unchanged to the NMF constructor; every other NMF setting is
        fixed inside this function.

    Returns
    -------
    tuple
        ``(training_time, response_time, coherence_score)``. Both times are in
        seconds. ``response_time`` covers only the argmax topic assignment of
        the already computed test matrix, not any model inference.
    """
    # Initialize model
    model = NMF(num_topics=num_topics, chunksize=2000, passes=15, kappa=kappa,
                minimum_probability=0.02, w_max_iter=w_max_iter,
                w_stop_condition=0.0001, h_max_iter=h_max_iter, h_stop_condition=0.001,
                eval_every=10, normalize=True, random_state=42)

    # Load dataset
    bbc_dataset = Dataset()
    bbc_dataset.load_custom_dataset_from_folder('bbc_octis')

    # Measure training time. perf_counter() is a monotonic, high-resolution
    # clock; time.time() follows the wall clock and can jump if it is adjusted.
    start_train = time.perf_counter()
    nmf_output = model.train_model(bbc_dataset)
    training_time = time.perf_counter() - start_train

    # Measure response time: pick the strongest topic for each test document.
    start_response = time.perf_counter()
    test_res = nmf_output['test-topic-document-matrix'].T
    pred_test = [np.argmax(res) for res in test_res]
    response_time = time.perf_counter() - start_response

    # Load true labels from the exported corpus: column 1 holds the split name,
    # column 2 the category.
    df = pd.read_csv("bbc_octis/corpus.tsv", sep='\t', header=None)
    y_true = df.loc[df[1] == 'test', 2].values

    # Evaluate metrics
    q_metrics(y_true, pred_test)

    # Calculate Coherence
    coherence = Coherence(texts=bbc_dataset.get_corpus(), topk=10, measure='c_v')
    coherence_score = coherence.score(nmf_output)
    print(f"Coherence Score: {coherence_score}")

    return training_time, response_time, coherence_score


# Hyperparameter grid. The lists carry a _grid suffix so they do not share names
# with the scalar kappa / w_max_iter / h_max_iter assigned for the final model;
# with shared names this cell failed when re-run after the best-model cell.
kappa_grid = [0.5, 1.0, 1.5]
w_max_iter_grid = [50, 100, 150, 200]
h_max_iter_grid = [50, 100, 150, 200]

# One record per configuration, so the runs can be compared as a table
# (e.g. pd.DataFrame(grid_results)) instead of by reading the printed log.
grid_results = []

for w in w_max_iter_grid:
    for h in h_max_iter_grid:
        for k in kappa_grid:
            training_time, response_time, coherence_score = calculate_coherence_score(num_topics=TOPICS, kappa=k, w_max_iter=w, h_max_iter=h)
            grid_results.append({
                'w_max_iter': w,
                'h_max_iter': h,
                'kappa': k,
                'training_time': training_time,
                'response_time': response_time,
                'coherence_score': coherence_score,
            })
            print(f"w_max_iter : {w} ; kappa : {k} ; h_max_iter : {h} ")
            print(f"Training Time: {training_time:.2f} seconds, Response Time: {response_time:.2f} seconds, Coherence Score: {coherence_score}\n")

purity_score: 0.7033707865168539
NMI: 0.4511664352718256
Coherence Score: 0.4870866925406613
w_max_iter : 50 ; kappa : 0.5 ; h_max_iter : 50 
Training Time: 11.40 seconds, Response Time: 0.00 seconds, Coherence Score: 0.4870866925406613

purity_score: 0.6786516853932584
NMI: 0.4415069933786194
Coherence Score: 0.5436698981188819
w_max_iter : 50 ; kappa : 1.0 ; h_max_iter : 50 
Training Time: 10.62 seconds, Response Time: 0.00 seconds, Coherence Score: 0.5436698981188819

purity_score: 0.6022471910112359
NMI: 0.30306942737919684
Coherence Score: 0.5236676454036637
w_max_iter : 50 ; kappa : 1.5 ; h_max_iter : 50 
Training Time: 10.38 seconds, Response Time: 0.00 seconds, Coherence Score: 0.5236676454036637

purity_score: 0.7033707865168539
NMI: 0.4511664352718256
Coherence Score: 0.4870866925406613
w_max_iter : 50 ; kappa : 0.5 ; h_max_iter : 100 
Training Time: 10.77 seconds, Response Time: 0.00 seconds, Coherence Score: 0.4870866925406613

purity_score: 0.6674157303370787
NMI: 0.418877

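### Best Model
### Best parameters:
w_max_iter = 50,  kappa = 0.5,  h_max_iter = 50 \
purity_score: 0.703 \
NMI: 0.451 \
Coherence Score: 0.487 \
(Highest purity and NMI among the runs shown above; kappa = 1.0 with the same iteration limits reaches a higher coherence, 0.544.)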

In [114]:
# Configuration reported as best in the section above.
w_max_iter = 50
kappa = 0.5
h_max_iter = 50

# num_topics comes from TOPICS so this cell stays consistent with the grid search.
model = NMF(num_topics=TOPICS, chunksize=2000, passes=15, kappa=kappa,
            minimum_probability=0.02, w_max_iter=w_max_iter,
            w_stop_condition=0.0001, h_max_iter=h_max_iter, h_stop_condition=0.001,
            eval_every=10, normalize=True, random_state=42)

bbc_dataset = Dataset()
bbc_dataset.load_custom_dataset_from_folder('bbc_octis')
nmf_output = model.train_model(bbc_dataset)

# Strongest topic for each test document (rows of the transposed matrix are documents).
test_res = nmf_output['test-topic-document-matrix'].T
pred_test = [np.argmax(res) for res in test_res]

# Load true labels from the dataset
df = pd.read_csv("bbc_octis/corpus.tsv", sep='\t', header=None)
# corpus.tsv has no header: column 1 is the split name, column 2 (the third column) is the category.
y_true = df[df[1] == 'test'][2].values

y_pred = pred_test
q_metrics(y_true, y_pred)


# evaluate model using Topic Coherence score
coherence = Coherence(texts=bbc_dataset.get_corpus(), topk=10, measure='c_v')
coherence_score = coherence.score(nmf_output)
print(f"Coherence Score: {coherence_score}")

purity_score: 0.7033707865168539
NMI: 0.4511664352718256
Coherence Score: 0.4870866925406613


In [115]:
# Transpose so that each row is one training document, then keep the index of
# its largest topic weight.
train_res = nmf_output['topic-document-matrix'].T
pred = list(map(np.argmax, train_res))


In [116]:
from scipy import stats
  
# Training documents: dominant topic next to the true category.
train_df = pd.DataFrame({
    'topic': pred,
    'label': bbc_news_split.loc[bbc_news_split['div'] == 'train', 'category'],
})

# Topic x label count table for the training data (0 where a pair never occurs).
train_topic_label_counts = (
    train_df
    .groupby(['topic', 'label'])
    .size()
    .unstack(fill_value=0)
)

# Each topic is named after its most frequent training label; when several labels
# tie, the first entry returned by mode() is taken.
topic_to_mode_label = (
    train_df
    .groupby('topic')['label']
    .agg(lambda labels_in_topic: labels_in_topic.mode().iat[0])
)

# Translate predicted test topics into labels; a topic with no training
# documents maps to None.
mapped_test_labels = [topic_to_mode_label.get(topic) for topic in pred_test]

# Predicted and true labels of the test split, side by side.
test_results_df = pd.DataFrame({
    'predicted_label': mapped_test_labels,
    'true_label': bbc_news_split.loc[bbc_news_split['div'] == 'test', 'category'],
})

### Metrics

In [117]:
from sklearn.metrics import classification_report

# Per-category precision / recall / F1 on the test split: true labels first, topic-derived labels second.
print(classification_report( test_results_df['true_label'],test_results_df['predicted_label']))

               precision    recall  f1-score   support

     business       0.53      0.82      0.64       115
entertainment       0.90      0.50      0.64        72
     politics       0.72      0.72      0.72        76
        sport       0.86      0.84      0.85       102
         tech       0.84      0.53      0.65        80

     accuracy                           0.70       445
    macro avg       0.77      0.68      0.70       445
 weighted avg       0.75      0.70      0.70       445

