Determining the best parameters for the Newsgroup dataset was challenging. We explored a variety of hyperparameter values and experimented with different topic counts. In the notebook below, youâ€™ll find results for topics 7 and 10, which performed well and showed closely comparable results

In [16]:
import os
import re
import json
import pandas as pd
import gensim
import nltk
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import string
from octis.models.NMF import NMF
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.coherence_metrics import Coherence
from sklearn import metrics
import numpy as np

### Load Data

In [17]:
df = pd.read_csv('fetch_7newsgroups.csv')
df.dropna(subset=['label'],inplace = True, ignore_index= True)
df['label'].isna().sum()

0

### Splitting the Data into Training and Test Sets 
### with a 5% Test Portion

In [18]:
# Split the dataset into training and test sets
train, test = train_test_split(df, test_size=0.05, random_state=42)
train['div'] = 'train'
test['div'] = 'test'

group_news_split = pd.concat([train, test]).reset_index(drop=True)

In [None]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Ensure the necessary NLTK resources are downloaded
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    
    # Remove links and email addresses
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    text = re.sub(r'\S+@\S+', ' ', text)
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove punctuation
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)

    words = text.split()
    words = [word for word in words if word not in stop_words]

    words = [lemmatizer.lemmatize(word) for word in words]
    
    return " ".join(words)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/patsias/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/patsias/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
group_news_split['preprocessed_text'] = group_news_split['text'].apply(preprocess_text)

In [None]:
group_news_split['tok'] = group_news_split['preprocessed_text'].apply(lambda x: set(x.split()))

train_docs = group_news_split[group_news_split['div']=='train']['tok'].to_numpy()
dictionary = gensim.corpora.Dictionary(train_docs)

In [22]:

os.makedirs('newgroups_octis', exist_ok=True)

# Save the vocabulary (for OCTIS)
vocab_length = len(dictionary)
with open("newgroups_octis/vocabulary.txt", "w", encoding='utf8') as f:
    for i in range(vocab_length):
        f.write(dictionary[i] + '\n')

# Create Bag-of-Words (BoW) representation
bow_corpus = [dictionary.doc2bow(doc) for doc in group_news_split['tok']]
group_news_split['corpus'] = bow_corpus


# Save the corpus as 'corpus.tsv'
group_news_split[['preprocessed_text', 'div', 'label']].to_csv("newgroups_octis/corpus.tsv", sep='\t', index=False, header=False)

# Create metadata.json for categories
# Extract labels from your dataframe
labels = group_news_split['label'].tolist()

# Write labels to metadata.json
metadata = {"labels": labels}
with open("newgroups_octis/metadata.json", "w") as outfile:
    json.dump(metadata, outfile)

print("metadata.json has been successfully created.")

metadata.json has been successfully created.


In [23]:
from sklearn import metrics
def q_metrics(y_true1, y_pred1):
    contigency_matrix = metrics.cluster.contingency_matrix(y_true1, y_pred1)
    purity = np.sum(np.amax(contigency_matrix, axis=0)) / np.sum(contigency_matrix)
    print('purity_score:',purity)
    print('NMI:',metrics.normalized_mutual_info_score(y_true1, y_pred1))

### NMF

### Optimizing Parameter Selection for Best Model Performance Topics 7 & 10

In [None]:
import time
from octis.dataset.dataset import Dataset
from octis.models.NMF import NMF
from octis.evaluation_metrics.coherence_metrics import Coherence

# Model Training and Evaluation Function
def calculate_coherence_score(num_topics, kappa, w_max_iter, h_max_iter):
    model = NMF(num_topics=num_topics, chunksize=2000, passes=10, kappa=kappa,
                minimum_probability=0.01, w_max_iter=w_max_iter,
                w_stop_condition=0.0001, h_max_iter=h_max_iter, h_stop_condition=0.001,
                eval_every=10, normalize=True, random_state=42)

    # Load dataset
    newgroups_dataset = Dataset()
    newgroups_dataset.load_custom_dataset_from_folder('newgroups_octis')

    # Measure training time
    start_train = time.time()
    nmf_output = model.train_model(newgroups_dataset)
    end_train = time.time()
    training_time = end_train - start_train

    # Measure response time
    start_response = time.time()
    test_res = nmf_output['test-topic-document-matrix'].T
    pred = [np.argmax(res) for res in test_res]
    end_response = time.time()
    response_time = end_response - start_response

    # Load true labels for evaluation
    df = pd.read_csv("newgroups_octis/corpus.tsv", sep='\t', header=None)
    y_true = df[df[1] == 'test'][2].values  

    # Evaluate metrics
    q_metrics(y_true, pred) 

    # Calculate Coherence
    coherence = Coherence(texts=newgroups_dataset.get_corpus(), topk=10, measure='c_v')
    coherence_score = coherence.score(nmf_output)
    print(f"Coherence Score: {coherence_score}")

    return training_time, response_time, coherence_score


# Hyperparameter combinations
kappa = [0.1, 0.5, 1.0]
w_max_iter = [100, 150, 200]
h_max_iter = [100, 150, 200]
num_topics_list = [7, 10]  

# Run and measure each configuration for each topic count
for num_topics in num_topics_list:
    for w in w_max_iter:
        for h in h_max_iter:
            for k in kappa:
                training_time, response_time, coherence_score = calculate_coherence_score(
                    num_topics=num_topics, kappa=k, w_max_iter=w, h_max_iter=h
                )
                print(f"num_topics: {num_topics}, w_max_iter: {w}, kappa: {k}, h_max_iter: {h}")
                print(f"Training Time: {training_time:.2f} seconds, Response Time: {response_time:.2f} seconds, Coherence Score: {coherence_score}\n")


purity_score: 0.3505039193729003
NMI: 0.14898979705851056
Coherence Score: 0.746767365363229
num_topics: 7, w_max_iter: 100, kappa: 0.1, h_max_iter: 100
Training Time: 57.28 seconds, Response Time: 0.00 seconds, Coherence Score: 0.746767365363229

purity_score: 0.2777155655095185
NMI: 0.07089776329915427
Coherence Score: 0.7280642758056081
num_topics: 7, w_max_iter: 100, kappa: 0.5, h_max_iter: 100
Training Time: 58.31 seconds, Response Time: 0.00 seconds, Coherence Score: 0.7280642758056081

purity_score: 0.3594624860022396
NMI: 0.16457369623299933
Coherence Score: 0.7529301994069479
num_topics: 7, w_max_iter: 100, kappa: 1.0, h_max_iter: 100
Training Time: 52.72 seconds, Response Time: 0.00 seconds, Coherence Score: 0.7529301994069479

purity_score: 0.3505039193729003
NMI: 0.14898979705851056
Coherence Score: 0.746767365363229
num_topics: 7, w_max_iter: 100, kappa: 0.1, h_max_iter: 150
Training Time: 53.58 seconds, Response Time: 0.00 seconds, Coherence Score: 0.746767365363229

puri

### Best Model 
Topics = 7 \
purity_score: 0.370 \
NMI: 0.161 \
Coherence Score: 0.775

w_max_iter =  200, kappa =  0.1, h_max_iter = 100 

In [32]:
w_max_iter= 200; kappa= 0.1; h_max_iter= 100
model = NMF(num_topics=7, chunksize=2000, passes=10, kappa=kappa,
                minimum_probability=0.01, w_max_iter=w_max_iter,
                w_stop_condition=0.0001, h_max_iter=h_max_iter, h_stop_condition=0.001,
                eval_every=10, normalize=True, random_state=42)

newgroups_dataset = Dataset()
newgroups_dataset.load_custom_dataset_from_folder('newgroups_octis')
nmf_output = model.train_model(newgroups_dataset)

test_res = nmf_output['test-topic-document-matrix'].T

pred_test = [np.argmax(res) for res in test_res]

# Load true labels from the dataset
df = pd.read_csv("newgroups_octis/corpus.tsv", sep='\t', header=None)
y_true = df[df[1] == 'test'][2].values  

y_pred = pred_test
q_metrics(y_true, y_pred)


# evaluate model using Topic Coherence score
coherence = Coherence(texts=newgroups_dataset.get_corpus(), topk=10, measure='c_v')
coherence_score = coherence.score(nmf_output)
print(f"Coherence Score: {coherence_score}")

purity_score: 0.3706606942889138
NMI: 0.16099259248539854
Coherence Score: 0.7750548282706351


In [33]:
train_res = nmf_output['topic-document-matrix'].T
pred = [np.argmax(res) for res in train_res]

In [34]:
from scipy import stats
  
# Create a DataFrame for training data
train_df = pd.DataFrame({
    'topic': pred,
    'label': group_news_split[group_news_split['div']=='train']['label']
})

# Group by 'topic' and count how many of each label there are for each topic in the training data
train_topic_label_counts = train_df.groupby(['topic', 'label']).size().unstack(fill_value=0)

# Assign the mode (most frequent label) for each topic in the training data
topic_to_mode_label = train_df.groupby('topic')['label'].agg(lambda x: x.mode().iloc[0])

# Map the predicted test topics to actual labels using the mapping from training data
mapped_test_labels = [topic_to_mode_label.get(topic, None) for topic in pred_test]

# Create a DataFrame to combine mapped test labels and actual test labels
test_results_df = pd.DataFrame({
    'predicted_label': mapped_test_labels,
    'true_label':group_news_split[group_news_split['div']=='test']['label']
})

In [35]:
from sklearn.metrics import classification_report
print(classification_report(test_results_df['predicted_label'], test_results_df['true_label']))

               precision    recall  f1-score   support

Miscellaneous       0.00      0.00      0.00         0
     Politics       0.00      0.00      0.00         0
     Religion       0.00      0.00      0.00         0
      Science       0.86      0.27      0.41       669
        Sport       0.00      0.00      0.00         0
   Technology       0.69      0.68      0.69       224
     Vehicles       0.00      0.00      0.00         0

     accuracy                           0.37       893
    macro avg       0.22      0.13      0.16       893
 weighted avg       0.82      0.37      0.48       893



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
