In [2]:
import os
import re
import json
import pandas as pd
import gensim
import nltk
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import string
from octis.models.NMF import NMF
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.coherence_metrics import Coherence
from sklearn import metrics
import numpy as np
from sklearn.datasets import fetch_20newsgroups

In [3]:
# For 20 Newsgroups dataset
newsgroups_train = fetch_20newsgroups(subset='train')

# Convert the newsgroups data into a pandas DataFrame with 'content' and 'category' columns
df_newsgroups_train = pd.DataFrame({
    'content': newsgroups_train.data,    
    'category': newsgroups_train.target  
})


df_labels = df_newsgroups_train['category']  
df_texts = df_newsgroups_train['content']

# Display the first few rows to check the structure
print(df_newsgroups_train.head())

                                             content  category
0  From: lerxst@wam.umd.edu (where's my thing)\nS...         7
1  From: guykuo@carson.u.washington.edu (Guy Kuo)...         4
2  From: twillis@ec.ecn.purdue.edu (Thomas E Will...         4
3  From: jgreen@amber (Joe Green)\nSubject: Re: W...         1
4  From: jcm@head-cfa.harvard.edu (Jonathan McDow...        14


In [4]:
# For 20 Newsgroups dataset
newsgroups_test = fetch_20newsgroups(subset='test')

# Convert the newsgroups data into a pandas DataFrame with 'content' and 'category' columns
df_newsgroups_test = pd.DataFrame({
    'content': newsgroups_train.data,    
    'category': newsgroups_train.target  
})

In [5]:
import re
import string
import nltk
from nltk.corpus import stopwords

# Download stopwords if you haven't already
nltk.download('stopwords')

# Get the list of English stopwords
stop_words = set(stopwords.words('english')) 

# Define the preprocessing function
def preprocess_text(text):
    # Remove punctuation using regex
    text = text.lower()
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text) 
    
    # Tokenize words, remove stopwords, and convert back to string
    words = text.split() 
    words = [word for word in words if word.lower() not in stop_words]  
    
    # Return preprocessed text as a single string
    return " ".join(words)  

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/patsias/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Apply the preprocessing function
df_newsgroups_train['processed_text'] = df_newsgroups_train['content'].apply(preprocess_text)

# Tokenize the text into sets of words
df_newsgroups_train['tok'] = df_newsgroups_train['processed_text'].apply(lambda x: set(x.split()))

In [7]:
# Apply the preprocessing function
df_newsgroups_test['processed_text'] = df_newsgroups_test['content'].apply(preprocess_text)

# Tokenize the text into sets of words
df_newsgroups_test['tok'] = df_newsgroups_test['processed_text'].apply(lambda x: set(x.split()))

In [8]:
df_newsgroups_train['div'] = 'train'
df_newsgroups_test['div'] = 'test'

# Merge back train and test into one dataframe
group_news_split = pd.concat([df_newsgroups_train, df_newsgroups_test]).reset_index(drop=True)

# Create a Gensim dictionary for tokenized words
train_docs = group_news_split[group_news_split['div'] == 'train'].tok.to_numpy()
dictionary = gensim.corpora.Dictionary(train_docs)

In [9]:

os.makedirs('newgroups_octis', exist_ok=True)

# Save the vocabulary (for OCTIS)
vocab_length = len(dictionary)
with open("newgroups_octis/vocabulary.txt", "w", encoding='utf8') as f:
    for i in range(vocab_length):
        f.write(dictionary[i] + '\n')

# Create Bag-of-Words (BoW) representation
bow_corpus = [dictionary.doc2bow(doc) for doc in group_news_split['tok']]
group_news_split['corpus'] = bow_corpus

# Save the corpus as 'corpus.tsv'
group_news_split[['processed_text', 'div', 'category']].to_csv("newgroups_octis/corpus.tsv", sep='\t', index=False, header=False)

# Create metadata.json for categories
# Extract labels from your dataframe
labels = group_news_split['category'].tolist()

# Write labels to metadata.json
metadata = {"labels": labels}
with open("newgroups_octis/metadata.json", "w") as outfile:
    json.dump(metadata, outfile)

print("metadata.json has been successfully created.")

metadata.json has been successfully created.


In [10]:
# Metrics for purity and normalized mutual information (NMI)
def q_metrics(y_true, y_pred):
    contigency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
    purity = np.sum(np.amax(contigency_matrix, axis=0)) / np.sum(contigency_matrix)
    print('Purity Score:', purity)
    print('NMI:', metrics.normalized_mutual_info_score(y_true, y_pred))

In [12]:


# Load the dataset prepared for OCTIS
newgroups_dataset = Dataset()
newgroups_dataset.load_custom_dataset_from_folder('newgroups_octis')

# Define and train NMF model
def calculate_coherence_score(num_topics, kappa, w_max_iter, h_max_iter):
    model = NMF(num_topics=num_topics, chunksize=2000, passes=10, kappa=kappa,
                minimum_probability=0.01, w_max_iter=w_max_iter,
                w_stop_condition=0.0001, h_max_iter=h_max_iter, h_stop_condition=0.001,
                eval_every=10, normalize=True)

    # Train the model
    nmf_output = model.train_model(newgroups_dataset)

    # Get the test set results (document-topic matrix)
    test_res = nmf_output['test-topic-document-matrix'].T
    pred = [np.argmax(res) for res in test_res]

    # Load true labels from the dataset
    df = pd.read_csv("newgroups_octis/corpus.tsv", sep='\t', header=None)
    y_true = df[df[1] == 'test'][2].values 

    # Evaluate metrics
    q_metrics(y_true, pred)

    # # Calculate Coherence
    coherence = Coherence(texts=newgroups_dataset.get_corpus(), topk=10, measure='c_v')
    coherence_score = coherence.score(nmf_output)
    print(f"Coherence Score: {coherence_score}")

    return coherence_score

#list containing various hyperparameters
kappa = [1.0, 2.0,3.0]
w_max_iter = [50, 100, 200]
h_max_iter = [50, 100, 200]


for w in w_max_iter:
    for h in h_max_iter:
        for k in kappa:
            calculate_coherence_score(num_topics=20, kappa=k, w_max_iter=w, h_max_iter=h)   
            print(f"w_max_iter : {w} ; kappa : {k} ; h_max_iter : {h} ")


Purity Score: 0.1789817924695068
NMI: 0.16567547968480673
Coherence Score: 0.6193236452839318
w_max_iter : 50 ; kappa : 1.0 ; h_max_iter : 50 
Purity Score: 0.10544458193388721
NMI: 0.04972415962855697
Coherence Score: 0.6745162259631792
w_max_iter : 50 ; kappa : 2.0 ; h_max_iter : 50 
Purity Score: 0.07274173590242178
NMI: 0.011530095253093928
Coherence Score: 0.6411036142308827
w_max_iter : 50 ; kappa : 3.0 ; h_max_iter : 50 
Purity Score: 0.10995227152200814
NMI: 0.09426075586114431
Coherence Score: 0.6798130037164912
w_max_iter : 50 ; kappa : 1.0 ; h_max_iter : 100 
Purity Score: 0.09483825349124977
NMI: 0.03194744875526283
Coherence Score: 0.6731224063858837
w_max_iter : 50 ; kappa : 2.0 ; h_max_iter : 100 
Purity Score: 0.0751281598020152
NMI: 0.013003094267862699
Coherence Score: 0.6075570114700269
w_max_iter : 50 ; kappa : 3.0 ; h_max_iter : 100 
Purity Score: 0.1750928053738731
NMI: 0.15260730264775357
Coherence Score: 0.6309174890089836
w_max_iter : 50 ; kappa : 1.0 ; h_max_i

In [23]:
# w_max_iter : 10 ; kappa : 1.0 ; h_max_iter : 20
model=NMF(num_topics=20, chunksize=2000, passes=10, kappa=3.0,
                minimum_probability=0.01, w_max_iter=100,
                w_stop_condition=0.0001, h_max_iter=200, h_stop_condition=0.001,
                eval_every=10, normalize=True)

newgroups = Dataset()
newgroups.load_custom_dataset_from_folder('newgroups_octis')
nmf_output = model.train_model(newgroups)

test_res = nmf_output['test-topic-document-matrix'].T
pred = [np.argmax(res) for res in test_res]

# Load true labels from the dataset
df = pd.read_csv("newgroups_octis/corpus.tsv", sep='\t', header=None)
y_true = df[df[1] == 'test'][2].values  

y_pred = pred
q_metrics(y_true, y_pred)


# evaluate model using Topic Coherence score
coherence = Coherence(texts=newgroups_dataset.get_corpus(), topk=10, measure='c_v')
coherence_score = coherence.score(nmf_output)
print(f"Coherence Score: {coherence_score}")


Purity Score: 0.07795651405338519
NMI: 0.014185867877603306
Coherence Score: 0.6520518553728814


In [24]:
from collections import Counter
test_res = nmf_output['test-topic-document-matrix'].T
pred = [np.argmax(res) for res in test_res]
    
temp = pd.DataFrame()
temp['y_true'] = y_true
temp['y_pred'] = pred
for i in range(20):
    print(i,'\t',Counter(temp[temp['y_pred']==i]['y_true']))

0 	 Counter({9: 13, 15: 12, 7: 12, 4: 12, 11: 12, 0: 11, 5: 10, 17: 10, 19: 10, 14: 9, 16: 9, 12: 8, 6: 8, 1: 8, 10: 7, 3: 7, 18: 7, 2: 6, 8: 5, 13: 5})
1 	 Counter({12: 102, 10: 99, 3: 95, 7: 94, 9: 91, 6: 91, 17: 90, 14: 86, 8: 85, 0: 84, 13: 83, 16: 80, 15: 79, 18: 78, 11: 74, 1: 72, 4: 71, 2: 71, 5: 61, 19: 48})
2 	 Counter({5: 19, 8: 9, 9: 8, 15: 7, 1: 7, 18: 6, 11: 5, 2: 5, 12: 4, 6: 4, 0: 4, 7: 3, 4: 3, 17: 3, 10: 3, 14: 3, 3: 3, 19: 3, 13: 2, 16: 1})
3 	 Counter({14: 11, 16: 10, 4: 6, 8: 6, 13: 6, 10: 6, 7: 5, 15: 5, 17: 5, 2: 5, 12: 4, 1: 4, 19: 4, 3: 4, 5: 3, 9: 3, 6: 3, 11: 3, 18: 3, 0: 2})
4 	 Counter({8: 111, 17: 99, 13: 98, 16: 98, 11: 97, 15: 95, 2: 94, 3: 92, 9: 88, 0: 88, 12: 88, 4: 87, 6: 84, 14: 79, 5: 78, 7: 76, 10: 74, 1: 74, 18: 67, 19: 61})
5 	 Counter({12: 61, 2: 41, 6: 40, 11: 36, 10: 35, 4: 34, 5: 34, 9: 30, 8: 30, 7: 29, 14: 28, 3: 28, 1: 27, 17: 25, 15: 24, 18: 22, 13: 22, 0: 21, 19: 19, 16: 9})
6 	 Counter({16: 3, 11: 3, 17: 2, 13: 2, 0: 2, 12: 2, 15: 2, 9:

In [25]:
df

Unnamed: 0,0,1,2
0,lerxstwamumdedu wheres thing subject car nntpp...,train,7
1,guykuocarsonuwashingtonedu guy kuo subject si ...,train,4
2,twillisececnpurdueedu thomas e willis subject ...,train,4
3,jgreenamber joe green subject weitek p9000 org...,train,1
4,jcmheadcfaharvardedu jonathan mcdowell subject...,train,14
...,...,...,...
22623,jimzisfeinfactorycom jim zisfein subject migra...,test,13
22624,ebodinpearltuftsedu subject screen death mac p...,test,4
22625,westesnetcomcom estes subject mounting cpu coo...,test,3
22626,stevehcrlgw steven collins subject sphere 4 po...,test,1


In [27]:
from sklearn.metrics import classification_report
topic_name={0:9,1:12,2:5,3:14,4:8,5:12,6:16,7:10,8:17,9:2,10:6,11:7,12:5,13:10,14:2,15:15,16:4,17:5,18:5,19:8}

y_true = df[df[1]=='test'][2].to_list()
y_pred = [*map(topic_name.get, pred)]

q_metrics(y_true, pred)
print(classification_report(y_true,y_pred))

Purity Score: 0.07795651405338519
NMI: 0.014185867877603306
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       480
           1       0.00      0.00      0.00       584
           2       0.11      0.15      0.13       591
           3       0.00      0.00      0.00       590
           4       0.13      0.03      0.05       578
           5       0.11      0.21      0.15       593
           6       0.11      0.02      0.03       585
           7       0.10      0.04      0.05       594
           8       0.06      0.42      0.11       598
           9       0.07      0.02      0.03       597
          10       0.11      0.05      0.07       600
          11       0.00      0.00      0.00       595
          12       0.07      0.28      0.12       591
          13       0.00      0.00      0.00       594
          14       0.11      0.02      0.03       593
          15       0.06      0.24      0.10       599
          16       0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
