In [1]:
import numpy as np
import pandas as pd
import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from gensim.models import CoherenceModel
import gensim
from gensim.models.ldamodel import LdaModel as LDA
from sklearn.metrics import confusion_matrix
from sklearn.datasets import fetch_20newsgroups
from sklearn.preprocessing import LabelEncoder
from scipy.optimize import linear_sum_assignment

from collections import Counter

### Load Data

In [2]:

# Load Datasets
# For BBC News dataset
bbc_news = pd.read_csv("/home/patsias/Essential Text/Comparing-Different-Topic-Modeling-Methods-on-News/bbc-news-data.csv",sep="\t")  
bbc_news['text'] = bbc_news.apply(lambda r:r.title+r.content,axis=1).to_list()
bbc_news['label'] = LabelEncoder().fit_transform(bbc_news['category']) 

bbc_news.head()


Unnamed: 0,category,filename,title,content,text,label
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...,Ad sales boost Time Warner profit Quarterly pr...,0
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...,Dollar gains on Greenspan speech The dollar ha...,0
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...,Yukos unit buyer faces loan claim The owners o...,0
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...,High fuel prices hit BA's profits British Airw...,0
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...,Pernod takeover talk lifts Domecq Shares in UK...,0


### Splitting the Data into Training and Test Sets 
### with a 20% Test Portion

In [3]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(bbc_news, test_size=0.2, random_state=42)

train['div'] = 'train'
test['div'] = 'test'

bbc_news_split = pd.concat([train, test],ignore_index=True)



In [4]:
bbc_news_split.groupby(['div','category']).size().reset_index()

Unnamed: 0,div,category,0
0,test,business,115
1,test,entertainment,72
2,test,politics,76
3,test,sport,102
4,test,tech,80
5,train,business,395
6,train,entertainment,314
7,train,politics,341
8,train,sport,409
9,train,tech,321


In [5]:
import re
import string
import nltk
from nltk.corpus import stopwords


nltk.download('stopwords')

# Get the list of English stopwords
stop_words = set(stopwords.words('english')) 

def preprocess_text(text):
    # Remove punctuation using regex
    text = text.lower()
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text) 

    text = ' '.join(text.split())
    # Tokenize words, remove stopwords, and convert back to string
    words = text.split() 
    words = [word for word in words if word not in stop_words]  

    return " ".join(words)   



[nltk_data] Downloading package stopwords to
[nltk_data]     /home/patsias/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
bbc_news_split['preprocessed_text'] = bbc_news_split['text'].apply(preprocess_text)

In [7]:
bbc_news_split['tok'] = bbc_news_split['preprocessed_text'].apply(lambda x: set(x.split()))
	

train_docs = bbc_news_split[bbc_news_split['div']=='train']['tok'].to_numpy() 
dictionary = gensim.corpora.Dictionary(train_docs) 

bbc_news_split['corpus'] = [dictionary.doc2bow(doc) for doc in bbc_news_split['tok'].to_numpy()]
 


In [8]:
from sklearn import metrics
def q_metrics(y_true1, y_pred1):
    contigency_matrix = metrics.cluster.contingency_matrix(y_true1, y_pred1)
    purity = np.sum(np.amax(contigency_matrix, axis=0)) / np.sum(contigency_matrix)
    print('purity_score:',purity)
    print('NMI:',metrics.normalized_mutual_info_score(y_true1, y_pred1))
    
    

### LDA

In [9]:
TOPICS=5

### Optimizing Parameter Selection for Best Model Performance

In [None]:
import time
from sklearn.metrics import classification_report
df = bbc_news_split

def calculate_coherence_score(i, alpha, beta):
    # Measure training time
    start_train = time.time()
    lda_result = LDA(corpus=df[df['div']=='train']['corpus'], id2word=dictionary,
                     iterations=i, num_topics=TOPICS, 
                     chunksize=2000, random_state=42, gamma_threshold=0.001,
                     passes=10, update_every=1,
                     alpha=alpha, eta=beta)
    end_train = time.time()
    training_time = end_train - start_train
    print(f"Training Time: {training_time:.2f} seconds")

    # Measure response time
    start_response = time.time()
    test_corpus_bow = df[df['div']=='test']['corpus'].to_numpy()
    test_res = lda_result[test_corpus_bow]

    # Get predictions for test set
    pred = []
    for x in test_res:
        x = {k[0]: k[1] for k in x}
        pred.append(max(x, key=x.get))
    end_response = time.time()
    response_time = end_response - start_response
    print(f"Response Time: {response_time:.2f} seconds")

    # Load true labels and calculate metrics
    y_true = df[df['div']=='test']['label']
    y_pred = pred
    q_metrics(y_true, y_pred)

    # Calculate and print coherence score
    cm_lda = CoherenceModel(model=lda_result, dictionary=dictionary, 
                            corpus=df[(df['div']=='train')]['corpus'], 
                            texts=df[df['div']=='train']['tok'].to_numpy(), 
                            coherence='c_v')
    coherence_lda = cm_lda.get_coherence()
    print(f"Coherence Score: {coherence_lda}")

    return training_time, response_time, coherence_lda

# List of various hyperparameters
no_of_iteration = [10, 30]
alpha_list = ['symmetric', 0.4, 0.7]
beta_list = ['auto', 0.4, 0.7]

# Running the parameter grid search with timing
for i in no_of_iteration:
    for alpha in alpha_list:
        for beta in beta_list:
            training_time, response_time, coherence_lda = calculate_coherence_score(i, alpha, beta)
            print(f"i: {i} ; alpha: {alpha} ; beta: {beta}")
            print(f"Training Time: {training_time:.2f} seconds, Response Time: {response_time:.2f} seconds, Coherence Score: {coherence_lda}\n")


Training Time: 17.71 seconds
Response Time: 0.10 seconds
purity_score: 0.7033707865168539
NMI: 0.5497146199720506
Coherence Score: 0.2515690193383192
i: 10 ; alpha: symmetric ; beta: auto
Training Time: 17.71 seconds, Response Time: 0.10 seconds, Coherence Score: 0.2515690193383192

Training Time: 15.63 seconds
Response Time: 0.09 seconds
purity_score: 0.7168539325842697
NMI: 0.5826848693518991
Coherence Score: 0.2879051648279455
i: 10 ; alpha: symmetric ; beta: 0.4
Training Time: 15.63 seconds, Response Time: 0.09 seconds, Coherence Score: 0.2879051648279455

Training Time: 15.89 seconds
Response Time: 0.10 seconds
purity_score: 0.7213483146067415
NMI: 0.5911339409905699
Coherence Score: 0.2966718439488291
i: 10 ; alpha: symmetric ; beta: 0.7
Training Time: 15.89 seconds, Response Time: 0.10 seconds, Coherence Score: 0.2966718439488291

Training Time: 16.12 seconds
Response Time: 0.18 seconds
purity_score: 0.7033707865168539
NMI: 0.5497146199720506
Coherence Score: 0.2554444516387922


### Best params:

 i : 10 ; alpha : 'symmetric' ; beta : 0.7 

purity_score: 0.721\
NMI: 0.591\
coherence_lda: 0.296

### Best Model 

In [10]:
i= 10 ; alpha= 'symmetric' ; beta= 0.7
df = bbc_news_split

lda_result=LDA(corpus=df[(df['div']=='train')]['corpus'], id2word=dictionary,
               iterations=i , num_topics=TOPICS,
               chunksize=2000, random_state=42, gamma_threshold=0.001,
               passes=10, update_every=1,
               alpha=alpha,eta = beta)

test_corpus_bow = df[df['div']=='test']['corpus'].to_numpy()
test_res = lda_result[test_corpus_bow]

pred_test=[]
for x in test_res:
    x={k[0]:k[1] for k in x}
    pred_test.append(max(x,key=x.get) )

y_true = df[df['div']=='test']['label'] 
y_pred = pred_test
q_metrics(y_true, y_pred)


# evaluate model using Topic Coherence score
cm_lda = CoherenceModel(model=lda_result,
                          dictionary=dictionary, 
                          corpus=df[(df['div']=='train')]['corpus'], 
                          texts=df[df['div']=='train']['tok'].to_numpy(), 
                          coherence='c_v')


coherence_lda = cm_lda.get_coherence()
print('coherence_lda:', coherence_lda)

purity_score: 0.7213483146067415
NMI: 0.5911339409905699
coherence_lda: 0.2301904451522192


In [11]:
train_corpus_bow = df[df['div']=='train']['corpus'].to_numpy()
train_res = lda_result[train_corpus_bow]
pred = []
for x in train_res:
    x = {k[0]: k[1] for k in x}
    pred.append(max(x, key=x.get))

In [12]:
from scipy import stats
  
train_df = pd.DataFrame({
    'topic': pred,
    'label': bbc_news_split[bbc_news_split['div']=='train']['label']
})

train_topic_label_counts = train_df.groupby(['topic', 'label']).size().unstack(fill_value=0)
topic_to_mode_label = train_df.groupby('topic')['label'].agg(lambda x: stats.mode(x)[0])

mapped_test_labels = [topic_to_mode_label.get(topic, None) for topic in pred_test]

test_results_df = pd.DataFrame({
    'predicted_label': mapped_test_labels,
    'true_label':bbc_news_split[bbc_news_split['div']=='test']['label']
})

### Metrics

In [13]:
from sklearn.metrics import classification_report
print(classification_report(test_results_df['true_label'], test_results_df['predicted_label']))

              precision    recall  f1-score   support

           0       0.67      0.80      0.73       115
           1       0.98      0.81      0.89        72
           2       0.51      0.92      0.66        76
           3       0.91      0.99      0.95       102
           4       0.00      0.00      0.00        80

    accuracy                           0.72       445
   macro avg       0.61      0.70      0.64       445
weighted avg       0.63      0.72      0.66       445



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
