Finding the best parameters for the `Newsgroup` dataset was challenging. We tested a wide range of values for the hyperparameters and experimented with several topic numbers. In the notebook below, you’ll see the results for topics 7 and 10, which were among the best and yielded closely competitive outcomes

In [83]:
import numpy as np
import pandas as pd
import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF
from gensim.models import CoherenceModel
import gensim
from gensim.models.ldamodel import LdaModel as LDA
from sklearn.metrics import normalized_mutual_info_score, f1_score, accuracy_score, confusion_matrix
from sklearn.datasets import fetch_20newsgroups
from sklearn.preprocessing import LabelEncoder
from scipy.optimize import linear_sum_assignment
import matplotlib.pyplot as plt

### Load Data

In [84]:
df = pd.read_csv('fetch_7newsgroups.csv')
df.dropna(subset=['label'],inplace = True, ignore_index= True)
df['label'].isna().sum()

0

### Splitting the Data into Training and Test Sets 
### with a 5% Test Portion

In [85]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.05, random_state=42)
train['div'] = 'train'
test['div'] = 'test'

df_newsgroups_split = pd.concat([train, test]).reset_index(drop=True)

In [None]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()

    text = re.sub(r"http\S+|www\S+|https\S+", "<URL>", text, flags=re.MULTILINE)
    text = re.sub(r'\S+@\S+', '<EMAIL>', text)
    
    text = re.sub(r'\b\d+\b', '', text)
    
    text = re.sub(r"[!\"#$%&'()*+,/:;<=>?@[\\]^_`{|}~]", " ", text)
    
    # Tokenize and selectively remove stopwords
    words = text.split()
    words = [word for word in words if word not in stop_words]
    
    words = [lemmatizer.lemmatize(word) for word in words]
    
    return " ".join(words)

In [87]:
import re
df_newsgroups_split['preprocessed_text'] = df_newsgroups_split['text'].apply(preprocess_text)

In [88]:
df_newsgroups_split['tok'] = df_newsgroups_split['preprocessed_text'].apply(lambda x: set(x.split()))  
train_docs = df_newsgroups_split[df_newsgroups_split['div']=='train']['tok'].to_numpy() 
dictionary = gensim.corpora.Dictionary(train_docs) 

df_newsgroups_split['corpus'] = [dictionary.doc2bow(doc) for doc in df_newsgroups_split['tok'].to_numpy()]

In [89]:
from sklearn import metrics
def q_metrics(y_true, y_pred1):
    contigency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred1)
    purity = np.sum(np.amax(contigency_matrix, axis=0)) / np.sum(contigency_matrix)
    print('purity_score:',purity)
    print('NMI:',metrics.normalized_mutual_info_score(y_true, y_pred1))



### LDA


### Optimizing Parameter Selection for Best Model Performance Topics 7

In [76]:
TOPICS=7

In [77]:
import time
df = df_newsgroups_split

# Define the function with added timing
def calculate_coherence_score(i, alpha, beta):
    # Measure training time
    start_train = time.time()
    lda_result = LDA(corpus=df[df['div']=='train']['corpus'], id2word=dictionary,
                     iterations=i, num_topics=TOPICS, 
                     chunksize=2000, random_state=42, gamma_threshold=0.001,
                     passes=10, update_every=1,
                     alpha=alpha, eta=beta)
    end_train = time.time()
    training_time = end_train - start_train

    # Measure response time
    start_response = time.time()
    test_corpus_bow = df[df['div']=='test']['corpus'].to_numpy()
    test_res = lda_result[test_corpus_bow]

    # Get predictions for test set
    pred = []
    for x in test_res:
        x = {k[0]: k[1] for k in x}
        pred.append(max(x, key=x.get))
    end_response = time.time()
    response_time = end_response - start_response

    # Load true labels and calculate metrics
    y_true = df[df['div']=='test']['label']
    y_pred = pred
    q_metrics(y_true, y_pred)

    # Calculate and print coherence score
    cm_lda = CoherenceModel(model=lda_result, dictionary=dictionary, 
                            corpus=df[(df['div']=='train')]['corpus'], 
                            texts=df[df['div']=='train']['tok'].to_numpy(), 
                            coherence='c_v')
    coherence_lda = cm_lda.get_coherence()
    print(f"Coherence Score: {coherence_lda}")

    return training_time, response_time, coherence_lda

# List of various hyperparameters
no_of_iteration = [10, 30]
alpha_list = ['symmetric', 0.4, 0.7]
beta_list = ['auto', 0.4, 0.7]

# Running the parameter grid search with timing
for i in no_of_iteration:
    for alpha in alpha_list:
        for beta in beta_list:
            training_time, response_time, coherence_lda = calculate_coherence_score(i, alpha, beta)
            print(f"i: {i} ; alpha: {alpha} ; beta: {beta}")
            print(f"Training Time: {training_time:.2f} seconds, Response Time: {response_time:.2f} seconds, Coherence Score: {coherence_lda}\n")


purity_score: 0.42665173572228443
NMI: 0.3006627233441655
Coherence Score: 0.5310053515309129
i: 10 ; alpha: symmetric ; beta: auto
Training Time: 75.27 seconds, Response Time: 0.25 seconds, Coherence Score: 0.5310053515309129

purity_score: 0.38633818589025753
NMI: 0.3054491138005524
Coherence Score: 0.5645498176533866
i: 10 ; alpha: symmetric ; beta: 0.4
Training Time: 64.84 seconds, Response Time: 0.18 seconds, Coherence Score: 0.5645498176533866

purity_score: 0.24748040313549832
NMI: 0.006474976925318856
Coherence Score: 0.49704369545519406
i: 10 ; alpha: symmetric ; beta: 0.7
Training Time: 61.30 seconds, Response Time: 0.17 seconds, Coherence Score: 0.49704369545519406

purity_score: 0.47928331466965285
NMI: 0.3767308977816532
Coherence Score: 0.4145923169148483
i: 10 ; alpha: 0.4 ; beta: auto
Training Time: 89.03 seconds, Response Time: 0.21 seconds, Coherence Score: 0.4145923169148483

purity_score: 0.3885778275475924
NMI: 0.3092693538513533
Coherence Score: 0.5760583263582043

### Optimizing Parameter Selection for Best Model Performance Topics 10

In [98]:
TOPICS=10

In [99]:
import time
df = df_newsgroups_split

# Define the function with added timing
def calculate_coherence_score(i, alpha, beta):
    # Measure training time
    start_train = time.time()
    lda_result = LDA(corpus=df[df['div']=='train']['corpus'], id2word=dictionary,
                     iterations=i, num_topics=TOPICS, 
                     chunksize=2000, random_state=42, gamma_threshold=0.001,
                     passes=10, update_every=1,
                     alpha=alpha, eta=beta)
    end_train = time.time()
    training_time = end_train - start_train

    # Measure response time
    start_response = time.time()
    test_corpus_bow = df[df['div']=='test']['corpus'].to_numpy()
    test_res = lda_result[test_corpus_bow]

    # Get predictions for test set
    pred = []
    for x in test_res:
        x = {k[0]: k[1] for k in x}
        pred.append(max(x, key=x.get))
    end_response = time.time()
    response_time = end_response - start_response

    # Load true labels and calculate metrics
    y_true = df[df['div']=='test']['label']
    y_pred = pred
    q_metrics(y_true, y_pred)

    # Calculate and print coherence score
    cm_lda = CoherenceModel(model=lda_result, dictionary=dictionary, 
                            corpus=df[(df['div']=='train')]['corpus'], 
                            texts=df[df['div']=='train']['tok'].to_numpy(), 
                            coherence='c_v')
    coherence_lda = cm_lda.get_coherence()
    print(f"Coherence Score: {coherence_lda}")

    return training_time, response_time, coherence_lda

# List of various hyperparameters
no_of_iteration = [10, 30]
alpha_list = ['symmetric', 0.4, 0.7]
beta_list = ['auto', 0.4, 0.7]

# Running the parameter grid search with timing
for i in no_of_iteration:
    for alpha in alpha_list:
        for beta in beta_list:
            training_time, response_time, coherence_lda = calculate_coherence_score(i, alpha, beta)
            print(f"i: {i} ; alpha: {alpha} ; beta: {beta}")
            print(f"Training Time: {training_time:.2f} seconds, Response Time: {response_time:.2f} seconds, Coherence Score: {coherence_lda}\n")


purity_score: 0.49832026875699886
NMI: 0.38364380043795926
Coherence Score: 0.3569278850799392
i: 10 ; alpha: symmetric ; beta: auto
Training Time: 82.29 seconds, Response Time: 0.26 seconds, Coherence Score: 0.3569278850799392

purity_score: 0.4244120940649496
NMI: 0.33677407434736106
Coherence Score: 0.4738549416604432
i: 10 ; alpha: symmetric ; beta: 0.4
Training Time: 73.53 seconds, Response Time: 0.25 seconds, Coherence Score: 0.4738549416604432

purity_score: 0.3964165733482643
NMI: 0.24623218656545398
Coherence Score: 0.4620343987803395
i: 10 ; alpha: symmetric ; beta: 0.7
Training Time: 70.15 seconds, Response Time: 0.27 seconds, Coherence Score: 0.4620343987803395

purity_score: 0.49496080627099664
NMI: 0.3680480902851045
Coherence Score: 0.4244610080067536
i: 10 ; alpha: 0.4 ; beta: auto
Training Time: 84.59 seconds, Response Time: 0.26 seconds, Coherence Score: 0.4244610080067536

purity_score: 0.42105263157894735
NMI: 0.3301952618330405
Coherence Score: 0.4676875530419857
i

### Best Model with Topic 10 
i= 10 ; alpha= 0.7 ; beta= auto \
purity_score: 0.558 \
NMI: 0.383 \
Coherence Score: 0.459 


In [104]:
i= 10 ; alpha= 0.7 ; beta= 'auto'
TOPICS= 10
# i= 10 ; alpha= 'symmetric' ; beta= 'auto'
lda_result = LDA(corpus=df[df['div']=='train']['corpus'], id2word=dictionary,
                     iterations=i, num_topics=TOPICS, 
                     chunksize=2000, random_state=42, gamma_threshold=0.001,
                     passes=10, update_every=1,
                     alpha=alpha, eta=beta)

test_corpus_bow = df[df['div']=='test']['corpus'].to_numpy()
test_res = lda_result[test_corpus_bow]

pred_test=[]
for x in test_res:
    x={k[0]:k[1] for k in x}
    pred_test.append(max(x,key=x.get) )

y_true = df[df['div']=='test']['label'] 
y_pred = pred_test
q_metrics(y_true, y_pred)


# evaluate model using Topic Coherence score
cm_lda = CoherenceModel(model=lda_result,
                          dictionary=dictionary, 
                          corpus=df[(df['div']=='train')]['corpus'], 
                          texts=df[df['div']=='train']['tok'].to_numpy(), 
                          coherence='c_v')

coherence_lda = cm_lda.get_coherence()
    
print('coherence_lda:', coherence_lda)

purity_score: 0.5587905935050392
NMI: 0.3837598974432982
coherence_lda: 0.4591859492795353


In [105]:
train_corpus_bow = df[df['div']=='train']['corpus'].to_numpy()
train_res = lda_result[train_corpus_bow]


In [106]:
pred_train = []
for x in train_res:
    x = {k[0]: k[1] for k in x}
    pred_train.append(max(x, key=x.get))

In [None]:
from scipy import stats
  
# Create a DataFrame for training data
train_df = pd.DataFrame({
    'topic': pred_train,
    'label': df_newsgroups_split[df_newsgroups_split['div']=='train']['label']
})

# Group by 'topic' and count how many of each label there are for each topic in the training data
train_topic_label_counts = train_df.groupby(['topic', 'label']).size().unstack(fill_value=0)

# Assign the mode (most frequent label) for each topic in the training data
topic_to_mode_label = train_df.groupby('topic')['label'].agg(lambda x: x.mode().iloc[0])

# Map the predicted test topics to actual labels using the mapping from training data
mapped_test_labels = [topic_to_mode_label.get(topic, None) for topic in pred_test]

# Create a DataFrame to combine mapped test labels and actual test labels
test_results_df = pd.DataFrame({
    'predicted_label': mapped_test_labels,
    'true_label':df_newsgroups_split[df_newsgroups_split['div']=='test']['label']
})

### Metrics

In [108]:
from sklearn.metrics import classification_report

print(classification_report(test_results_df['true_label'],test_results_df['predicted_label']))

               precision    recall  f1-score   support

Miscellaneous       0.00      0.00      0.00        46
     Politics       0.43      0.69      0.53       127
     Religion       0.00      0.00      0.00        66
      Science       0.44      0.46      0.45       208
        Sport       0.90      0.60      0.72       120
   Technology       0.62      0.96      0.76       219
     Vehicles       0.60      0.31      0.41       107

     accuracy                           0.56       893
    macro avg       0.43      0.43      0.41       893
 weighted avg       0.51      0.56      0.51       893



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
