In [2]:
import os
import re
import json
import pandas as pd
import gensim
import nltk
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import string
from octis.models.NMF import NMF
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.coherence_metrics import Coherence
from sklearn import metrics
import numpy as np

In [3]:
# Load the BBC News dataset (tab-separated file).
# The location can be overridden through the BBC_NEWS_CSV environment variable so the
# notebook is not tied to a single machine; the fallback is the original local path.
BBC_NEWS_CSV = os.environ.get(
    "BBC_NEWS_CSV",
    "/home/patsias/Essential Text/Comparing-Different-Topic-Modeling-Methods-on-News/bbc-news-data.csv",
)
bbc_news = pd.read_csv(BBC_NEWS_CSV, sep="\t")

In [4]:

# Fetch the NLTK stopword corpus (NLTK logs "already up-to-date" when it is cached locally).
nltk.download('stopwords')

# English stopwords, stored as a set so the membership test in preprocess_text is a hash lookup.
stop_words = set(stopwords.words('english'))
# Translation table that deletes every ASCII punctuation character. It is built once
# here instead of re-deriving a regex pattern on every call of preprocess_text.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


# Define the preprocessing function
def preprocess_text(text, stopword_set=None):
    """Lower-case ``text``, delete ASCII punctuation and drop stopwords.

    Punctuation is removed without inserting a space, so "with-profits" becomes
    "withprofits" and "there's" becomes "theres".

    Parameters
    ----------
    text : str
        Raw document text.
    stopword_set : collection of str, optional
        Words to remove (compared against the lower-cased tokens). Defaults to
        the module-level ``stop_words`` set.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ("" if nothing remains).
    """
    if stopword_set is None:
        stopword_set = stop_words
    # The text is lower-cased once up front, so tokens need no second .lower().
    cleaned = text.lower().translate(_PUNCTUATION_TABLE)
    return " ".join(token for token in cleaned.split() if token not in stopword_set)

In [5]:

# Combine 'title' and 'content' into a 'text' column
bbc_news['text'] = bbc_news['title'] + " " + bbc_news['content']

# Apply the preprocessing function
bbc_news['processed_text'] = bbc_news['text'].apply(preprocess_text)

# Tokenize the text into the set of distinct words of each document.
# NOTE(review): a set discards term counts, so any bag-of-words built from 'tok'
# only records presence/absence of a word, not its frequency.
bbc_news['tok'] = bbc_news['processed_text'].apply(lambda x: set(x.split()))

# Split the dataset into training and test sets
train, test = train_test_split(bbc_news, test_size=0.2, random_state=42)
# .assign returns new frames, so the partition tag is never written into a slice
# of bbc_news (the original item assignment is the SettingWithCopyWarning pattern).
train = train.assign(div='train')
test = test.assign(div='test')

# Merge back train and test into one dataframe (train rows first, then test rows)
bbc_news_split = pd.concat([train, test]).reset_index(drop=True)

# Create a Gensim dictionary from the tokens of the training documents only
train_docs = bbc_news_split[bbc_news_split['div'] == 'train'].tok.to_numpy()
dictionary = gensim.corpora.Dictionary(train_docs)




[nltk_data] Downloading package stopwords to
[nltk_data]     /home/patsias/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Inspect the merged frame: train rows come first, then test rows, tagged by the 'div' column.
bbc_news_split

Unnamed: 0,category,filename,title,content,text,processed_text,tok,div
0,sport,178.txt,Cole refuses to blame van Persie,Ashley Cole has refused to blame Robin van Pe...,Cole refuses to blame van Persie Ashley Cole ...,cole refuses blame van persie ashley cole refu...,"{leaving, ljungberg, marys, sprain, refused, c...",train
1,tech,178.txt,Slimmer PlayStation triple sales,Sony PlayStation 2's slimmer shape has proved...,Slimmer PlayStation triple sales Sony PlaySta...,slimmer playstation triple sales sony playstat...,"{really, original, battlefield, may, broke, sl...",train
2,sport,260.txt,Bellamy fined after row,Newcastle have fined their Welsh striker Crai...,Bellamy fined after row Newcastle have fined ...,bellamy fined row newcastle fined welsh strike...,"{admitted, refused, game, arsenal, theres, apo...",train
3,tech,017.txt,Finding new homes for old phones,Re-using old mobile phones is not just good f...,Finding new homes for old phones Re-using old...,finding new homes old phones reusing old mobil...,"{really, buy, enjoy, friends, environmental, h...",train
4,entertainment,101.txt,Sundance to honour foreign films,International films will be given the same pr...,Sundance to honour foreign films Internationa...,sundance honour foreign films international fi...,"{independent, korea, carruth, china, include, ...",train
...,...,...,...,...,...,...,...,...
2220,entertainment,232.txt,Connick Jr to lead Broadway show,Singer and actor Harry Connick Jr is to star ...,Connick Jr to lead Broadway show Singer and a...,connick jr lead broadway show singer actor har...,"{original, ice, game, age, steam, wrote, raqui...",test
2221,business,206.txt,Standard Life cuts policy bonuses,"Standard Life, Europe's largest mutual life i...",Standard Life cuts policy bonuses Standard Li...,standard life cuts policy bonuses standard lif...,"{time, pensions, withprofits, sticking, payout...",test
2222,politics,207.txt,February poll claim 'speculation',Reports that Tony Blair is planning a snap ge...,February poll claim 'speculation' Reports tha...,february poll claim speculation reports tony b...,"{may, ultimately, around, include, government,...",test
2223,entertainment,159.txt,Band Aid 20 single storms to No 1,The new version of the Band Aid song Do They ...,Band Aid 20 single storms to No 1 The new ver...,band aid 20 single storms 1 new version band a...,"{original, grammy, chart, dido, website, singl...",test


In [7]:
# Make sure the OCTIS dataset folder exists before anything is written into it
os.makedirs('bbc_octis', exist_ok=True)

# vocabulary.txt: one token per line, in dictionary-id order
vocab_length = len(dictionary)
with open("bbc_octis/vocabulary.txt", "w", encoding='utf8') as vocab_file:
    vocab_file.writelines(dictionary[token_id] + '\n' for token_id in range(vocab_length))

# Bag-of-Words (BoW) representation of every document, kept next to the text
bow_corpus = list(map(dictionary.doc2bow, bbc_news_split['tok']))
bbc_news_split['corpus'] = bow_corpus

# corpus.tsv: <processed text> TAB <partition> TAB <category>, without header or index
bbc_news_split.loc[:, ['processed_text', 'div', 'category']].to_csv(
    "bbc_octis/corpus.tsv", sep='\t', index=False, header=False
)

# metadata.json: the category of every document, in corpus order
labels = bbc_news_split['category'].tolist()
metadata = {"labels": labels}
with open("bbc_octis/metadata.json", "w") as outfile:
    outfile.write(json.dumps(metadata))

print("metadata.json has been successfully created.")

metadata.json has been successfully created.


In [8]:
# Metrics for purity and normalized mutual information (NMI)
def q_metrics(y_true, y_pred,my_model=None):
    """Print the purity and the normalized mutual information of a clustering.

    Parameters
    ----------
    y_true : array-like
        Reference label of each sample.
    y_pred : array-like
        Cluster (topic) assigned to each sample, in the same order as ``y_true``.
    my_model : optional
        Accepted but not used by this function.

    Returns
    -------
    None
        Both scores are only printed.
    """
    # Counts of samples per (true label, predicted cluster) pair.
    table = metrics.cluster.contingency_matrix(y_true, y_pred)
    # Purity: credit every cluster with its most frequent true label.
    majority_hits = table.max(axis=0).sum()
    purity = majority_hits / table.sum()
    print('Purity Score:', purity)
    nmi = metrics.normalized_mutual_info_score(y_true, y_pred)
    print('NMI:', nmi)

In [23]:


# Load the dataset prepared for OCTIS from the 'bbc_octis' folder written earlier
# (corpus.tsv, vocabulary.txt and metadata.json).
bbc_dataset = Dataset()
bbc_dataset.load_custom_dataset_from_folder('bbc_octis')

# Define and train NMF model
def calculate_coherence_score(num_topics, kappa, w_max_iter, h_max_iter, random_state=42):
    """Train an OCTIS NMF model on ``bbc_dataset`` and report its quality.

    Prints purity and NMI of the test-set topic assignment (via ``q_metrics``)
    and the c_v topic coherence, then returns the coherence.

    Parameters
    ----------
    num_topics : int
        Number of topics to extract.
    kappa : float
        NMF ``kappa`` hyperparameter, forwarded unchanged.
    w_max_iter : int
        NMF ``w_max_iter`` hyperparameter, forwarded unchanged.
    h_max_iter : int
        NMF ``h_max_iter`` hyperparameter, forwarded unchanged.
    random_state : int, optional
        Seed forwarded to NMF. A fixed default keeps runs repeatable, so that
        differences between configurations are not just initialisation noise.

    Returns
    -------
    The coherence score computed by OCTIS ``Coherence`` (measure ``c_v``, top 10 words).
    """
    model = NMF(num_topics=num_topics, chunksize=2000, passes=10, kappa=kappa,
                minimum_probability=0.01, w_max_iter=w_max_iter,
                w_stop_condition=0.0001, h_max_iter=h_max_iter, h_stop_condition=0.001,
                eval_every=10, normalize=True, random_state=random_state)

    # Train the model
    nmf_output = model.train_model(bbc_dataset)

    # Transpose so each row is one test document, then take its highest-weight topic
    test_res = nmf_output['test-topic-document-matrix'].T
    pred = np.argmax(test_res, axis=1)

    # Load true labels from the corpus file: column 1 is the partition and
    # column 2 (the third column) is the category
    df = pd.read_csv("bbc_octis/corpus.tsv", sep='\t', header=None)
    y_true = df[df[1] == 'test'][2].values

    # Evaluate clustering metrics (printed by q_metrics)
    q_metrics(y_true, pred)

    # Calculate topic coherence
    coherence = Coherence(texts=bbc_dataset.get_corpus(), topk=10, measure='c_v')
    coherence_score = coherence.score(nmf_output)
    print(f"Coherence Score: {coherence_score}")

    return coherence_score

# Lists containing the hyperparameter values explored by the grid search
kappa = [1.0, 2.0,3.0]
w_max_iter = [50, 100, 200]
h_max_iter = [50, 100, 200]

# Coherence measures how interpretable the topics generated by the model are.
# A higher coherence score generally means better, more interpretable topics.
# Every score is kept so the configurations can be ranked after the search
# instead of being read off the printed log.
grid_results = []
for w in w_max_iter:
    for h in h_max_iter:
        for k in kappa:
            score = calculate_coherence_score(num_topics=5, kappa=k, w_max_iter=w, h_max_iter=h)
            print(f"w_max_iter : {w} ; kappa : {k} ; h_max_iter : {h} ")
            grid_results.append({"w_max_iter": w, "h_max_iter": h, "kappa": k, "coherence": score})

# Report the configuration with the highest coherence
best_run = max(grid_results, key=lambda run: run["coherence"])
print(f"Best coherence: {best_run}")


Purity Score: 0.40224719101123596
NMI: 0.16238347180567467
Coherence Score: 0.4912102512292873
w_max_iter : 50 ; kappa : 1.0 ; h_max_iter : 50 
Purity Score: 0.4044943820224719
NMI: 0.08961338414097103
Coherence Score: 0.531511401881635
w_max_iter : 50 ; kappa : 2.0 ; h_max_iter : 50 
Purity Score: 0.27415730337078653
NMI: 0.009434547618065304
Coherence Score: 0.34809037467731574
w_max_iter : 50 ; kappa : 3.0 ; h_max_iter : 50 
Purity Score: 0.5258426966292135
NMI: 0.2930929034349417
Coherence Score: 0.41293875270369584
w_max_iter : 50 ; kappa : 1.0 ; h_max_iter : 100 
Purity Score: 0.36179775280898874
NMI: 0.08367728613870756
Coherence Score: 0.5297993785859021
w_max_iter : 50 ; kappa : 2.0 ; h_max_iter : 100 
Purity Score: 0.27415730337078653
NMI: 0.013832161458728886
Coherence Score: 0.3489134118838092
w_max_iter : 50 ; kappa : 3.0 ; h_max_iter : 100 
Purity Score: 0.597752808988764
NMI: 0.3351795054673556
Coherence Score: 0.5050475854559323
w_max_iter : 50 ; kappa : 1.0 ; h_max_ite

In [12]:
# Configuration picked from the grid search: w_max_iter = 100 ; kappa = 1.0 ; h_max_iter = 50
# (this line used to be pasted output that ran as three bare annotation statements).
# random_state pins the NMF initialisation: topic ids are arbitrary, and the cells
# below refer to topics by id, so an unseeded re-run would silently reshuffle them.
model=NMF(num_topics=5, chunksize=2000, passes=10, kappa=1.0,
                minimum_probability=0.01, w_max_iter=100,
                w_stop_condition=0.0001, h_max_iter=50, h_stop_condition=0.001,
                eval_every=10, normalize=True, random_state=42)

bbc_dataset = Dataset()
bbc_dataset.load_custom_dataset_from_folder('bbc_octis')
nmf_output = model.train_model(bbc_dataset)

# Transpose so each row is one test document, then take its highest-weight topic
test_res = nmf_output['test-topic-document-matrix'].T
pred = [np.argmax(res) for res in test_res]

# Load true labels from the corpus file: column 1 is the partition and
# column 2 (the third column) is the category
df = pd.read_csv("bbc_octis/corpus.tsv", sep='\t', header=None)
y_true = df[df[1] == 'test'][2].values

y_pred = pred
q_metrics(y_true, y_pred)


# evaluate model using Topic Coherence score
coherence = Coherence(texts=bbc_dataset.get_corpus(), topk=10, measure='c_v')
coherence_score = coherence.score(nmf_output)
print(f"Coherence Score: {coherence_score}")


Purity Score: 0.6
NMI: 0.33122808347575877
Coherence Score: 0.5169698059202658


In [13]:
from collections import Counter

# Transpose so each row is one test document, then take its highest-weight topic
test_res = nmf_output['test-topic-document-matrix'].T
pred = [np.argmax(res) for res in test_res]

# Pair every test document's true category with its predicted topic
temp = pd.DataFrame({'y_true': y_true, 'y_pred': pred})

# One line per topic with the true categories of the documents assigned to it.
# The topic count comes from the matrix itself rather than a hard-coded 5, so
# this cell keeps working when the model is trained with another num_topics.
for i in range(test_res.shape[1]):
    print(i,'\t',Counter(temp[temp['y_pred']==i]['y_true']))

0 	 Counter({'sport': 81, 'entertainment': 19, 'tech': 9, 'business': 3, 'politics': 1})
1 	 Counter({'tech': 54, 'business': 28, 'entertainment': 8, 'politics': 6})
2 	 Counter({'politics': 42, 'business': 13, 'entertainment': 4, 'tech': 4})
3 	 Counter({'business': 71, 'politics': 27, 'entertainment': 22, 'sport': 20, 'tech': 12})
4 	 Counter({'entertainment': 19, 'tech': 1, 'sport': 1})


In [14]:
# Show corpus.tsv as read back with header=None: column 0 = processed text, 1 = partition, 2 = category.
df

Unnamed: 0,0,1,2
0,cole refuses blame van persie ashley cole refu...,train,sport
1,slimmer playstation triple sales sony playstat...,train,tech
2,bellamy fined row newcastle fined welsh strike...,train,sport
3,finding new homes old phones reusing old mobil...,train,tech
4,sundance honour foreign films international fi...,train,entertainment
...,...,...,...
2220,connick jr lead broadway show singer actor har...,test,entertainment
2221,standard life cuts policy bonuses standard lif...,test,business
2222,february poll claim speculation reports tony b...,test,politics
2223,band aid 20 single storms 1 new version band a...,test,entertainment


In [15]:
from sklearn.metrics import classification_report

y_true = df[df[1]=='test'][2].to_list()

# Name every topic after the category that is most frequent among the test
# documents assigned to it. NMF topic ids are arbitrary, so a hand-written
# id -> category table is only valid for the one training run it was read from.
# Two topics can end up with the same name when one category dominates both.
topic_name = (
    pd.crosstab(pd.Series(pred, name='topic'), pd.Series(y_true, name='category'))
    .idxmax(axis=1)
    .to_dict()
)
y_pred = [*map(topic_name.get, pred)]

q_metrics(y_true, pred)
print(classification_report(y_true,y_pred))

Purity Score: 0.6
NMI: 0.33122808347575877
               precision    recall  f1-score   support

     business       0.47      0.62      0.53       115
entertainment       0.90      0.26      0.41        72
     politics       0.67      0.55      0.60        76
        sport       0.72      0.79      0.75       102
         tech       0.56      0.68      0.61        80

     accuracy                           0.60       445
    macro avg       0.66      0.58      0.58       445
 weighted avg       0.65      0.60      0.59       445

