In [1]:
import numpy as np
import pandas as pd
import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF
from gensim.models import CoherenceModel
import gensim
from gensim.models.ldamodel import LdaModel as LDA
from sklearn.metrics import normalized_mutual_info_score, f1_score, accuracy_score, confusion_matrix
from sklearn.datasets import fetch_20newsgroups
from sklearn.preprocessing import LabelEncoder
from scipy.optimize import linear_sum_assignment
import matplotlib.pyplot as plt

# Evaluation functions

In [2]:
def purity_score(y_true, y_pred):
    # Confusion matrix
    contingency_matrix = confusion_matrix(y_true, y_pred)
    # Find optimal one-to-one mapping between labels and clusters
    row_ind, col_ind = linear_sum_assignment(-contingency_matrix)
    return contingency_matrix[row_ind, col_ind].sum() / np.sum(contingency_matrix)


# Load Data

In [2]:

# Load Datasets
# For BBC News dataset
bbc_news = pd.read_csv("/home/patsias/Essential Text/Comparing-Different-Topic-Modeling-Methods-on-News/bbc-news-data.csv",sep="\t")  # Load the BBC dataset
print(bbc_news.columns)
bbc_texts = bbc_news['content']
print(bbc_texts.head())
bbc_labels = LabelEncoder().fit_transform(bbc_news['category']) 

# For 20 Newsgroups dataset
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_texts = newsgroups_train.data
newsgroups_labels = newsgroups_train.target

df_news = pd.DataFrame({'content': newsgroups_texts, 'category': newsgroups_labels})

# Show the first few rows of the newsgroups dataset
print(df_news.head())


Index(['category', 'filename', 'title', 'content'], dtype='object')
0     Quarterly profits at US media giant TimeWarne...
1     The dollar has hit its highest level against ...
2     The owners of embattled Russian oil giant Yuk...
3     British Airways has blamed high fuel prices f...
4     Shares in UK drinks and food firm Allied Dome...
Name: content, dtype: object
                                             content  category
0  From: lerxst@wam.umd.edu (where's my thing)\nS...         7
1  From: guykuo@carson.u.washington.edu (Guy Kuo)...         4
2  From: twillis@ec.ecn.purdue.edu (Thomas E Will...         4
3  From: jgreen@amber (Joe Green)\nSubject: Re: W...         1
4  From: jcm@head-cfa.harvard.edu (Jonathan McDow...        14


In [3]:
from collections import Counter
Counter(bbc_labels)

Counter({3: 511, 0: 510, 2: 417, 4: 401, 1: 386})

In [4]:
from collections import Counter
Counter(newsgroups_labels)

Counter({10: 600,
         15: 599,
         8: 598,
         9: 597,
         11: 595,
         7: 594,
         13: 594,
         14: 593,
         5: 593,
         2: 591,
         12: 591,
         3: 590,
         6: 585,
         1: 584,
         4: 578,
         17: 564,
         16: 546,
         0: 480,
         18: 465,
         19: 377})

# Preprocess (not need)

In [7]:
# Preprocessing function
def preprocess(texts):
    vectorizer = CountVectorizer(max_df=0.9, min_df=2, stop_words='english')  # Using BoW instead of TF-IDF
    X = vectorizer.fit_transform(texts)
    return X, vectorizer#


In [8]:
# Preprocess BBC dataset
X_bbc, vectorizer_bbc = preprocess(bbc_texts)



In [9]:

# Number of topics to extract 
n_topics = 10
no_top_words = 10

In [5]:
bbc_news

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...
...,...,...,...,...
2220,tech,397.txt,BT program to beat dialler scams,BT is introducing two initiatives to help bea...
2221,tech,398.txt,Spam e-mails tempt net shoppers,Computer users across the world continue to i...
2222,tech,399.txt,Be careful how you code,A new European directive could put software w...
2223,tech,400.txt,US cyber security chief resigns,The man making sure US computer networks are ...


In [6]:
import re
import string
import nltk
from nltk.corpus import stopwords


nltk.download('stopwords')

# Get the list of English stopwords
stop_words = set(stopwords.words('english')) 

# Define the preprocessing function
def preprocess_text(text):
    # Remove punctuation using regex
    text = text.lower()
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text) 
    
    # Tokenize words, remove stopwords, and convert back to string
    words = text.split() 
    words = [word for word in words if word.lower() not in stop_words]  
    
    # Return preprocessed text as a single string
    return " ".join(words)   




[nltk_data] Downloading package stopwords to
[nltk_data]     /home/patsias/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
import re
bbc_news['text']=bbc_news.apply(lambda r: r.title + r.content, axis=1) 
bbc_news['preprocessed_text'] = bbc_news['text'].apply(preprocess_text)

In [8]:
bbc_news['tok'] = bbc_news['preprocessed_text'].apply(lambda x: set(x.split()))  
	

In [9]:
from sklearn.model_selection import train_test_split

# Split the dataset into train and test sets (80% train, 20% test as an example)
train, test = train_test_split(bbc_news, test_size=0.2, random_state=42)

# Create a new column indicating 'train' or 'test'
train['div'] = 'train'
test['div'] = 'test'

# Concatenate the two sets back into a single DataFrame
bbc_news_split = pd.concat([train, test])

# Optional: Reset index if needed
bbc_news_split = bbc_news_split.reset_index(drop=True)


In [10]:
train_docs = bbc_news_split[bbc_news_split['div']=='train'].tok.to_numpy() 
dictionary = gensim.corpora.Dictionary(train_docs)  


In [11]:
# assume  the dictionary looks like this: {'machine': 0, 'learning': 1, 'data': 2, 'science': 3, 'deep': 4}   The doc2bow() function will convert each document into a list of tuples: [(0, 1), (1, 1), (2, 1)]  # 'machine' appears 1 time, 'learning' appears 1 time, 'data' appears 1 time

bow_corpus = [dictionary.doc2bow(doc) for doc in bbc_news_split.tok]

In [12]:

bbc_news_split['corpus']=bow_corpus

In [13]:
from sklearn import metrics
def q_metrics(y_true, y_pred,my_model=None):
    contigency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
    purity = np.sum(np.amax(contigency_matrix, axis=0)) / np.sum(contigency_matrix)
    print('purity_score:',purity)
    print('NMI:',metrics.normalized_mutual_info_score(y_true, y_pred))
    
    if my_model!=None:
        cm = CoherenceModel(model=my_model, corpus=bow_corpus, dictionary=dictionary, coherence='u_mass')
        print('Coherence:',cm.get_coherence())


In [14]:
bbc_news_split['label']=bbc_news_split.category

# LDA


In [15]:

from sklearn.metrics import classification_report
df = bbc_news_split
TOPICS=5
def calculate_coherence_score(i, alpha, beta):
    lda_result=LDA(corpus=df[df['div']=='train']['corpus'], id2word=dictionary,
               iterations=i , num_topics=TOPICS, 
               chunksize=2000, random_state=42, gamma_threshold=0.001,
               passes=10, update_every=1,
               alpha=alpha,eta = beta)

    test_corpus_bow = df[df['div']=='test']['corpus'].to_numpy()
    test_res = lda_result[test_corpus_bow]

    pred=[]
    for x in test_res:
        x={k[0]:k[1] for k in x}
        pred.append(max(x,key=x.get) )

    y_true = df[df['div']=='test']['label'] 
    y_pred = pred
    q_metrics(y_true, y_pred)    


    cm_lda = CoherenceModel(model=lda_result,
                          dictionary=dictionary, 
                          corpus=df[(df['div']=='train')]['corpus'], 
                          texts=df[df['div']=='train']['tok'].to_numpy(), 
                          coherence='c_v')

    # get coherence value
    coherence_lda = cm_lda.get_coherence()
        
    print('coherence_lda:', coherence_lda)
    return coherence_lda





#list containing various hyperparameters
no_of_iteration = [10,30]
alpha_list = ['symmetric',0.4,0.7]
beta_list = ['auto',0.4,0.7]


for i in no_of_iteration:
    for alpha in alpha_list:
        for beta in beta_list:
            calculate_coherence_score(i, alpha, beta)   
            print(f"i : {i} ; alpha : {alpha} ; beta : {beta} ")


purity_score: 0.8112359550561797
NMI: 0.6458833678162469
coherence_lda: 0.2847665471143438
i : 10 ; alpha : symmetric ; beta : auto 
purity_score: 0.8584269662921349
NMI: 0.7161478212910702
coherence_lda: 0.29878421813641165
i : 10 ; alpha : symmetric ; beta : 0.4 
purity_score: 0.8337078651685393
NMI: 0.6920769684914702
coherence_lda: 0.3137286281602882
i : 10 ; alpha : symmetric ; beta : 0.7 
purity_score: 0.8134831460674158
NMI: 0.6484089842431501
coherence_lda: 0.28458481130182767
i : 10 ; alpha : 0.4 ; beta : auto 
purity_score: 0.8584269662921349
NMI: 0.7157919961258384
coherence_lda: 0.29878421813641165
i : 10 ; alpha : 0.4 ; beta : 0.4 
purity_score: 0.8359550561797753
NMI: 0.6958581305399223
coherence_lda: 0.3137286281602882
i : 10 ; alpha : 0.4 ; beta : 0.7 
purity_score: 0.802247191011236
NMI: 0.6378413562694111
coherence_lda: 0.27935916887479706
i : 10 ; alpha : 0.7 ; beta : auto 
purity_score: 0.8539325842696629
NMI: 0.7115318640961922
coherence_lda: 0.29232716137484355
i 

In [21]:
# i : 30 ; alpha : 0.4 ; beta : 0.7 
i : 10 ; alpha : 0.4 ; beta : 0.7
lda_result=LDA(corpus=df[(df['div']=='train')]['corpus'], id2word=dictionary,
               iterations=i , num_topics=TOPICS,
               chunksize=2000, random_state=42, gamma_threshold=0.001,
               passes=10, update_every=1,
               alpha=alpha,eta = beta)

test_corpus_bow = df[df['div']=='test']['corpus'].to_numpy()
test_res = lda_result[test_corpus_bow]

pred=[]
for x in test_res:
    x={k[0]:k[1] for k in x}
    pred.append(max(x,key=x.get) )

y_true = df[df['div']=='test']['label'] 
y_pred = pred
q_metrics(y_true, y_pred)


# evaluate model using Topic Coherence score
cm_lda = CoherenceModel(model=lda_result,
                          dictionary=dictionary, 
                          corpus=df[(df['div']=='train')]['corpus'], 
                          texts=df[df['div']=='train']['tok'].to_numpy(), 
                          coherence='c_v')

# get coherence value
coherence_lda = cm_lda.get_coherence()
print('coherence_lda:', coherence_lda)

purity_score: 0.7123595505617978
NMI: 0.5248798401515109
coherence_lda: 0.23049792783523815


In [22]:
pred_test=[]
for x in test_res:
    x={k[0]:k[1] for k in x}
    pred_test.append(max(x,key=x.get) )
    
temp = pd.DataFrame()
temp['y_true'] = y_true
temp['y_pred'] = pred_test
for i in range(TOPICS):
    print(i,'\t',Counter(temp[temp['y_pred']==i]['y_true']))

0 	 Counter({'entertainment': 59, 'sport': 38, 'tech': 2})
1 	 Counter({'sport': 56, 'tech': 2, 'business': 1})
2 	 Counter({'business': 86, 'tech': 28, 'politics': 3, 'entertainment': 1})
3 	 Counter({'tech': 40, 'entertainment': 6, 'business': 5})
4 	 Counter({'politics': 73, 'business': 23, 'sport': 8, 'tech': 8, 'entertainment': 6})


In [23]:
topic_words=[]
for i in range(TOPICS):
    tt = lda_result.get_topic_terms(i,20)
    topic_words.append([dictionary[pair[0]] for pair in tt])
df_topwords=pd.DataFrame(topic_words)
# df_topwords=df_topwords.T
df_topwords

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,said,also,one,last,new,year,three,would,years,two,first,make,us,people,world,win,get,could,time,made
1,said,one,also,last,first,us,years,mr,would,time,made,two,year,could,set,going,added,world,three,new
2,said,would,also,new,could,one,year,two,us,first,last,years,people,time,mr,way,next,back,many,told
3,said,would,also,new,one,year,years,us,two,people,last,could,make,time,first,mr,world,three,made,back
4,said,would,mr,also,new,could,told,year,people,time,last,us,government,first,one,two,say,made,added,expected


In [24]:
df

Unnamed: 0,category,filename,title,content,text,preprocessed_text,tok,div,corpus,label
0,sport,178.txt,Cole refuses to blame van Persie,Ashley Cole has refused to blame Robin van Pe...,Cole refuses to blame van Persie Ashley Cole h...,cole refuses blame van persie ashley cole refu...,"{cup, jeremie, things, added, arsenal, aliadie...",train,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...",sport
1,tech,178.txt,Slimmer PlayStation triple sales,Sony PlayStation 2's slimmer shape has proved...,Slimmer PlayStation triple sales Sony PlayStat...,slimmer playstation triple sales sony playstat...,"{obviously, games, comparison, runup, seen, gt...",train,"[(1, 1), (5, 1), (26, 1), (49, 1), (59, 1), (7...",tech
2,sport,260.txt,Bellamy fined after row,Newcastle have fined their Welsh striker Crai...,Bellamy fined after row Newcastle have fined t...,bellamy fined row newcastle fined welsh strike...,"{theres, boiled, media, asked, souness, 25, re...",train,"[(5, 1), (9, 1), (24, 1), (26, 1), (42, 1), (5...",sport
3,tech,017.txt,Finding new homes for old phones,Re-using old mobile phones is not just good f...,Finding new homes for old phones Re-using old ...,finding new homes old phones reusing old mobil...,"{collect, theres, future, romania, divide, 90,...",train,"[(1, 1), (26, 1), (35, 1), (36, 1), (52, 1), (...",tech
4,entertainment,101.txt,Sundance to honour foreign films,International films will be given the same pr...,Sundance to honour foreign films International...,sundance honour foreign films international fi...,"{theme, redford, directed, siege, benjamin, 20...",train,"[(72, 1), (77, 1), (96, 1), (98, 1), (131, 1),...",entertainment
...,...,...,...,...,...,...,...,...,...,...
2220,entertainment,232.txt,Connick Jr to lead Broadway show,Singer and actor Harry Connick Jr is to star ...,Connick Jr to lead Broadway show Singer and ac...,connick jr lead broadway show singer actor har...,"{raquin, unrest, shows, grace, added, starred,...",test,"[(1, 1), (77, 1), (98, 1), (110, 1), (167, 1),...",entertainment
2221,business,206.txt,Standard Life cuts policy bonuses,"Standard Life, Europe's largest mutual life i...",Standard Life cuts policy bonuses Standard Lif...,standard life cuts policy bonuses standard lif...,"{feel, 2006, stand, 25, trim, bonus, added, pr...",test,"[(1, 1), (36, 1), (77, 1), (96, 1), (98, 1), (...",business
2222,politics,207.txt,February poll claim 'speculation',Reports that Tony Blair is planning a snap ge...,February poll claim 'speculation' Reports that...,february poll claim speculation reports tony b...,"{britain, less, reports, government, lead, maj...",test,"[(8, 1), (74, 1), (77, 1), (78, 1), (97, 1), (...",politics
2223,entertainment,159.txt,Band Aid 20 single storms to No 1,The new version of the Band Aid song Do They ...,Band Aid 20 single storms to No 1 The new vers...,band aid 20 single storms 1 new version band a...,"{hiv, 1985, dionne, number, appear, company, d...",test,"[(5, 1), (35, 1), (59, 1), (66, 1), (77, 1), (...",entertainment


In [25]:
topic_name={0:'entertainment',1:'sport',2:'business',3:'tech',
            4:'politics'}

y_true = df[df['div']=='test']['label'].to_list()
y_pred = [*map(topic_name.get, pred)]

q_metrics(y_true, pred)
print(classification_report(y_true,y_pred))

purity_score: 0.7123595505617978
NMI: 0.5248798401515109
               precision    recall  f1-score   support

     business       0.73      0.72      0.73       115
entertainment       0.61      0.83      0.71        72
     politics       0.63      0.97      0.77        76
        sport       0.98      0.52      0.68       102
         tech       0.75      0.59      0.66        80

     accuracy                           0.71       445
    macro avg       0.74      0.73      0.71       445
 weighted avg       0.76      0.71      0.71       445

