In [1]:
import numpy as np
import pandas as pd
import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF
from gensim.models import CoherenceModel
import gensim
from gensim.models.ldamodel import LdaModel as LDA
from sklearn.metrics import normalized_mutual_info_score, f1_score, accuracy_score, confusion_matrix
from sklearn.datasets import fetch_20newsgroups
from sklearn.preprocessing import LabelEncoder
from scipy.optimize import linear_sum_assignment
import matplotlib.pyplot as plt

In [36]:
# For 20 Newsgroups dataset
newsgroups_train = fetch_20newsgroups(subset='train')

# Convert the newsgroups data into a pandas DataFrame with 'content' and 'category' columns
df_newsgroups_train = pd.DataFrame({
    'content': newsgroups_train.data,    
    'category': newsgroups_train.target  
})

df_labels = df_newsgroups_train['category']  
df_texts = df_newsgroups_train['content']

# Display the first few rows to check the structure
print(df_newsgroups_train.head())

                                             content  category
0  From: lerxst@wam.umd.edu (where's my thing)\nS...         7
1  From: guykuo@carson.u.washington.edu (Guy Kuo)...         4
2  From: twillis@ec.ecn.purdue.edu (Thomas E Will...         4
3  From: jgreen@amber (Joe Green)\nSubject: Re: W...         1
4  From: jcm@head-cfa.harvard.edu (Jonathan McDow...        14


In [3]:
from collections import Counter
Counter(df_labels)

Counter({10: 600,
         15: 599,
         8: 598,
         9: 597,
         11: 595,
         7: 594,
         13: 594,
         14: 593,
         5: 593,
         2: 591,
         12: 591,
         3: 590,
         6: 585,
         1: 584,
         4: 578,
         17: 564,
         16: 546,
         0: 480,
         18: 465,
         19: 377})

In [4]:
import re
import string
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

# Get the list of English stopwords
stop_words = set(stopwords.words('english')) 

# Define the preprocessing function
def preprocess_text(text):
    # Remove punctuation using regex
    text = text.lower()
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text) 
    
    # Tokenize words, remove stopwords, and convert back to string
    words = text.split() 
    words = [word for word in words if word.lower() not in stop_words]  
    
    # Return preprocessed text as a single string
    return " ".join(words)  

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/patsias/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [37]:
df_newsgroups_train

Unnamed: 0,content,category
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14
...,...,...
11309,From: jim.zisfein@factory.com (Jim Zisfein) \n...,13
11310,From: ebodin@pearl.tufts.edu\nSubject: Screen ...,4
11311,From: westes@netcom.com (Will Estes)\nSubject:...,3
11312,From: steve@hcrlgw (Steven Collins)\nSubject: ...,1


In [38]:
import re
df_newsgroups_train['preprocessed_text'] = df_newsgroups_train['content'].apply(preprocess_text)

In [39]:
df_newsgroups_train

Unnamed: 0,content,category,preprocessed_text
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,lerxstwamumdedu wheres thing subject car nntpp...
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,guykuocarsonuwashingtonedu guy kuo subject si ...
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,twillisececnpurdueedu thomas e willis subject ...
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,jgreenamber joe green subject weitek p9000 org...
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,jcmheadcfaharvardedu jonathan mcdowell subject...
...,...,...,...
11309,From: jim.zisfein@factory.com (Jim Zisfein) \n...,13,jimzisfeinfactorycom jim zisfein subject migra...
11310,From: ebodin@pearl.tufts.edu\nSubject: Screen ...,4,ebodinpearltuftsedu subject screen death mac p...
11311,From: westes@netcom.com (Will Estes)\nSubject:...,3,westesnetcomcom estes subject mounting cpu coo...
11312,From: steve@hcrlgw (Steven Collins)\nSubject: ...,1,stevehcrlgw steven collins subject sphere 4 po...


In [40]:
df_newsgroups_train['tok'] = df_newsgroups_train['preprocessed_text'].apply(lambda x: set(x.split()))  
	

In [41]:
df_newsgroups_train

Unnamed: 0,content,category,preprocessed_text,tok
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,lerxstwamumdedu wheres thing subject car nntpp...,"{years, anyone, name, early, late, 60s, made, ..."
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,guykuocarsonuwashingtonedu guy kuo subject si ...,"{message, shelley1qvfo9innc3s, floppies, two, ..."
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,twillisececnpurdueedu thomas e willis subject ...,"{twillisececnpurdueedu, back, time, purdue, us..."
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,jgreenamber joe green subject weitek p9000 org...,"{anyone, fill, id, article, jgreenamber, like,..."
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,jcmheadcfaharvardedu jonathan mcdowell subject...,"{software, previously, ok, pack, shuttle, clea..."
...,...,...,...,...
11309,From: jim.zisfein@factory.com (Jim Zisfein) \n...,13,jimzisfeinfactorycom jim zisfein subject migra...,"{migraine, years, died, try, factorys, never, ..."
11310,From: ebodin@pearl.tufts.edu\nSubject: Screen ...,4,ebodinpearltuftsedu subject screen death mac p...,"{logic, blanking, need, 22, physical, jolt, wi..."
11311,From: westes@netcom.com (Will Estes)\nSubject:...,3,westesnetcomcom estes subject mounting cpu coo...,"{pins, work, pl8, back, ensure, tried, case, m..."
11312,From: steve@hcrlgw (Steven Collins)\nSubject: ...,1,stevehcrlgw steven collins subject sphere 4 po...,"{bolsoncarsonuwashingtonedu, 4, edward, far, b..."


In [43]:
newsgroups_test = fetch_20newsgroups(subset='test')
# Convert the newsgroups data into a pandas DataFrame with 'content' and 'category' columns
df_newsgroups_test = pd.DataFrame({
    'content': newsgroups_test.data,    
    'category': newsgroups_test.target  
})

In [52]:
df_newsgroups_test['preprocessed_text'] = df_newsgroups_test['content'].apply(preprocess_text)
df_newsgroups_test['tok'] = df_newsgroups_test['preprocessed_text'].apply(lambda x: set(x.split()))  

In [53]:
from sklearn.model_selection import train_test_split

# Create a new column indicating 'train' or 'test'
df_newsgroups_train['div'] = 'train'
df_newsgroups_test['div'] = 'test'


# Concatenate the two sets back into a single DataFrame
df_newsgroups_split = pd.concat([df_newsgroups_train, df_newsgroups_test])

# Optional: Reset index if needed
df_newsgroups_split = df_newsgroups_split.reset_index(drop=True)


In [54]:
df_newsgroups_split

Unnamed: 0,content,category,preprocessed_text,tok,div
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,lerxstwamumdedu wheres thing subject car nntpp...,"{years, anyone, name, early, late, 60s, made, ...",train
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,guykuocarsonuwashingtonedu guy kuo subject si ...,"{message, shelley1qvfo9innc3s, floppies, two, ...",train
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,twillisececnpurdueedu thomas e willis subject ...,"{twillisececnpurdueedu, back, time, purdue, us...",train
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,jgreenamber joe green subject weitek p9000 org...,"{anyone, fill, id, article, jgreenamber, like,...",train
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,jcmheadcfaharvardedu jonathan mcdowell subject...,"{software, previously, ok, pack, shuttle, clea...",train
...,...,...,...,...,...
18841,From: richmond@spiff.Princeton.EDU (Stupendous...,14,richmondspiffprincetonedu stupendous man subje...,"{jr, try, disk, thermal, richmondspiffprinceto...",test
18842,From: smytonj@murr11.alleg.edu (Jim Smyton)\nS...,4,smytonjmurr11allegedu jim smyton subject monit...,"{columbus, drives, ya, 432101174, nevaimpsohio...",test
18843,From: hhenderson@vax.clarku.edu\nSubject: RE: ...,9,hhendersonvaxclarkuedu subject game length bra...,"{years, opinion, 1115, oh, nichols, arguments,...",test
18844,From: b859zam@utarlg.uta.edu \nSubject: INTEL ...,6,b859zamutarlgutaedu subject intel chmos 808680...,"{work, complete, 4, 8, texas, erasable, offer,...",test


In [55]:
train_docs = df_newsgroups_split[df_newsgroups_split['div']=='train'].tok.to_numpy() 
dictionary = gensim.corpora.Dictionary(train_docs)  

In [56]:
# assume  the dictionary looks like this: {'machine': 0, 'learning': 1, 'data': 2, 'science': 3, 'deep': 4}   The doc2bow() function will convert each document into a list of tuples: [(0, 1), (1, 1), (2, 1)]  # 'machine' appears 1 time, 'learning' appears 1 time, 'data' appears 1 time

bow_corpus = [dictionary.doc2bow(doc) for doc in df_newsgroups_split.tok]

In [57]:

df_newsgroups_split['corpus']=bow_corpus

In [58]:
df_newsgroups_split

Unnamed: 0,content,category,preprocessed_text,tok,div,corpus
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,lerxstwamumdedu wheres thing subject car nntpp...,"{years, anyone, name, early, late, 60s, made, ...",train,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1..."
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,guykuocarsonuwashingtonedu guy kuo subject si ...,"{message, shelley1qvfo9innc3s, floppies, two, ...",train,"[(14, 1), (29, 1), (37, 1), (38, 1), (40, 1), ..."
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,twillisececnpurdueedu thomas e willis subject ...,"{twillisececnpurdueedu, back, time, purdue, us...",train,"[(13, 1), (17, 1), (24, 1), (25, 1), (29, 1), ..."
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,jgreenamber joe green subject weitek p9000 org...,"{anyone, fill, id, article, jgreenamber, like,...",train,"[(5, 1), (25, 1), (29, 1), (37, 1), (38, 1), (..."
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,jcmheadcfaharvardedu jonathan mcdowell subject...,"{software, previously, ok, pack, shuttle, clea...",train,"[(29, 1), (38, 1), (43, 1), (50, 1), (57, 1), ..."
...,...,...,...,...,...,...
18841,From: richmond@spiff.Princeton.EDU (Stupendous...,14,richmondspiffprincetonedu stupendous man subje...,"{jr, try, disk, thermal, richmondspiffprinceto...",test,"[(5, 1), (29, 1), (31, 1), (37, 1), (38, 1), (..."
18842,From: smytonj@murr11.alleg.edu (Jim Smyton)\nS...,4,smytonjmurr11allegedu jim smyton subject monit...,"{columbus, drives, ya, 432101174, nevaimpsohio...",test,"[(12, 1), (14, 1), (25, 1), (38, 1), (50, 1), ..."
18843,From: hhenderson@vax.clarku.edu\nSubject: RE: ...,9,hhendersonvaxclarkuedu subject game length bra...,"{years, opinion, 1115, oh, nichols, arguments,...",test,"[(5, 1), (29, 1), (38, 1), (50, 1), (54, 1), (..."
18844,From: b859zam@utarlg.uta.edu \nSubject: INTEL ...,6,b859zamutarlgutaedu subject intel chmos 808680...,"{work, complete, 4, 8, texas, erasable, offer,...",test,"[(5, 1), (25, 1), (29, 1), (37, 1), (38, 1), (..."


In [59]:
from sklearn import metrics
def q_metrics(y_true, y_pred,my_model=None):
    contigency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
    purity = np.sum(np.amax(contigency_matrix, axis=0)) / np.sum(contigency_matrix)
    print('purity_score:',purity)
    print('NMI:',metrics.normalized_mutual_info_score(y_true, y_pred))
    
    if my_model!=None:
        cm = CoherenceModel(model=my_model, corpus=bow_corpus, dictionary=dictionary, coherence='u_mass')
        print('Coherence:',cm.get_coherence())


In [60]:
df_newsgroups_split['label']=df_newsgroups_split.category

In [61]:
df_newsgroups_split

Unnamed: 0,content,category,preprocessed_text,tok,div,corpus,label
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,lerxstwamumdedu wheres thing subject car nntpp...,"{years, anyone, name, early, late, 60s, made, ...",train,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...",7
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,guykuocarsonuwashingtonedu guy kuo subject si ...,"{message, shelley1qvfo9innc3s, floppies, two, ...",train,"[(14, 1), (29, 1), (37, 1), (38, 1), (40, 1), ...",4
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,twillisececnpurdueedu thomas e willis subject ...,"{twillisececnpurdueedu, back, time, purdue, us...",train,"[(13, 1), (17, 1), (24, 1), (25, 1), (29, 1), ...",4
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,jgreenamber joe green subject weitek p9000 org...,"{anyone, fill, id, article, jgreenamber, like,...",train,"[(5, 1), (25, 1), (29, 1), (37, 1), (38, 1), (...",1
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,jcmheadcfaharvardedu jonathan mcdowell subject...,"{software, previously, ok, pack, shuttle, clea...",train,"[(29, 1), (38, 1), (43, 1), (50, 1), (57, 1), ...",14
...,...,...,...,...,...,...,...
18841,From: richmond@spiff.Princeton.EDU (Stupendous...,14,richmondspiffprincetonedu stupendous man subje...,"{jr, try, disk, thermal, richmondspiffprinceto...",test,"[(5, 1), (29, 1), (31, 1), (37, 1), (38, 1), (...",14
18842,From: smytonj@murr11.alleg.edu (Jim Smyton)\nS...,4,smytonjmurr11allegedu jim smyton subject monit...,"{columbus, drives, ya, 432101174, nevaimpsohio...",test,"[(12, 1), (14, 1), (25, 1), (38, 1), (50, 1), ...",4
18843,From: hhenderson@vax.clarku.edu\nSubject: RE: ...,9,hhendersonvaxclarkuedu subject game length bra...,"{years, opinion, 1115, oh, nichols, arguments,...",test,"[(5, 1), (29, 1), (38, 1), (50, 1), (54, 1), (...",9
18844,From: b859zam@utarlg.uta.edu \nSubject: INTEL ...,6,b859zamutarlgutaedu subject intel chmos 808680...,"{work, complete, 4, 8, texas, erasable, offer,...",test,"[(5, 1), (25, 1), (29, 1), (37, 1), (38, 1), (...",6


In [62]:

from sklearn.metrics import classification_report
df = df_newsgroups_split
TOPICS=20
def calculate_coherence_score(i, alpha, beta):
    lda_result=LDA(corpus=df[df['div']=='train']['corpus'], id2word=dictionary,
               iterations=i , num_topics=TOPICS, 
               chunksize=2000, random_state=42, gamma_threshold=0.001,
               passes=10, update_every=1,
               alpha=alpha,eta = beta)

    test_corpus_bow = df[df['div']=='test']['corpus'].to_numpy()
    test_res = lda_result[test_corpus_bow]

    pred=[]
    for x in test_res:
        x={k[0]:k[1] for k in x}
        pred.append(max(x,key=x.get) )

    y_true = df[df['div']=='test']['label'] 
    y_pred = pred
    q_metrics(y_true, y_pred)    


    cm_lda = CoherenceModel(model=lda_result,
                          dictionary=dictionary, 
                          corpus=df[(df['div']=='train')]['corpus'], 
                          texts=df[df['div']=='train']['tok'].to_numpy(), 
                          coherence='c_v')

    # get coherence value
    coherence_lda = cm_lda.get_coherence()
        
    print('coherence_lda:', coherence_lda)
    return coherence_lda





#list containing various hyperparameters
no_of_iteration = [10,30]
alpha_list = ['symmetric',0.4,0.7]
beta_list = ['auto',0.4,0.7]


for i in no_of_iteration:
    for alpha in alpha_list:
        for beta in beta_list:
            calculate_coherence_score(i, alpha, beta)  
            print(f"i : {i} ; alpha : {alpha} ; beta : {beta} ")


purity_score: 0.25385023898035053
NMI: 0.3392709871162657
coherence_lda: 0.4884788217879974
i : 10 ; alpha : symmetric ; beta : auto 
purity_score: 0.11590546999468933
NMI: 0.21424102037824996
coherence_lda: 0.4684466747002494
i : 10 ; alpha : symmetric ; beta : 0.4 
purity_score: 0.07448220924057355
NMI: 0.05164487239456725
coherence_lda: 0.5910782406783626
i : 10 ; alpha : symmetric ; beta : 0.7 
purity_score: 0.275092936802974
NMI: 0.3390756767469277
coherence_lda: 0.46311325452506064
i : 10 ; alpha : 0.4 ; beta : auto 
purity_score: 0.1618428040361126
NMI: 0.35186448816828364
coherence_lda: 0.505307452073972
i : 10 ; alpha : 0.4 ; beta : 0.4 
purity_score: 0.09227296866702071
NMI: 0.11127945207276277
coherence_lda: 0.572847260675032
i : 10 ; alpha : 0.4 ; beta : 0.7 
purity_score: 0.30151354221986193
NMI: 0.3696974448692568
coherence_lda: 0.41669291488330346
i : 10 ; alpha : 0.7 ; beta : auto 
purity_score: 0.17100371747211895
NMI: 0.3609512340780412
coherence_lda: 0.51823346407577

In [64]:
# i : 10 ; alpha : 'symmetric' ; beta : 0.4 
lda_result=LDA(corpus=df[(df['div']=='train')]['corpus'], id2word=dictionary,
               iterations=10 , num_topics=TOPICS,
               chunksize=2000, random_state=42, gamma_threshold=0.001,
               passes=10, update_every=1,
               alpha= 0.7 , eta = 'auto')

test_corpus_bow = df[df['div']=='test']['corpus'].to_numpy()
test_res = lda_result[test_corpus_bow]

pred=[]
for x in test_res:
    x={k[0]:k[1] for k in x}
    pred.append(max(x,key=x.get) )

y_true = df[df['div']=='test']['label'] 
y_pred = pred
q_metrics(y_true, y_pred)


# evaluate model using Topic Coherence score
cm_lda = CoherenceModel(model=lda_result,
                          dictionary=dictionary, 
                          corpus=df[(df['div']=='train')]['corpus'], 
                          texts=df[df['div']=='train']['tok'].to_numpy(), 
                          coherence='c_v')

# get coherence value
coherence_lda = cm_lda.get_coherence()
    
print('coherence_lda:', coherence_lda)

purity_score: 0.30151354221986193
NMI: 0.3696974448692568
coherence_lda: 0.41669291488330346


In [66]:
pred_test=[]
for x in test_res:
    x={k[0]:k[1] for k in x}
    pred_test.append(max(x,key=x.get) )
    
temp = pd.DataFrame()
temp['y_true'] = y_true
temp['y_pred'] = pred_test

for i in range(TOPICS):
    print(i,'\t',Counter(temp[temp['y_pred']==i]['y_true']))

0 	 Counter({4: 116, 3: 115, 6: 14, 12: 11, 2: 7, 8: 5, 1: 4, 7: 2, 5: 1, 13: 1})
1 	 Counter({8: 6, 19: 4, 4: 3, 17: 2, 12: 1, 5: 1, 9: 1})
2 	 Counter({14: 18, 11: 4, 19: 2, 0: 2, 8: 2, 3: 1, 17: 1, 13: 1, 16: 1, 5: 1, 12: 1, 2: 1, 6: 1})
3 	 Counter({14: 44, 6: 11, 11: 5, 3: 4, 1: 4, 4: 3, 7: 2, 19: 2, 16: 1, 9: 1, 10: 1, 15: 1, 8: 1, 0: 1})
4 	 Counter({8: 5, 0: 3, 13: 1})
5 	 Counter({10: 133, 9: 36, 6: 7, 5: 2, 14: 1})
6 	 Counter({18: 38, 13: 24, 15: 7, 8: 1, 11: 1})
7 	 Counter({0: 8, 13: 7, 12: 6, 8: 5, 18: 3, 14: 3, 6: 2, 5: 1, 11: 1, 9: 1, 7: 1, 15: 1, 19: 1, 17: 1, 4: 1})
8 	 Counter({11: 196, 14: 92, 13: 66, 1: 49, 12: 41, 18: 25, 5: 19, 16: 15, 2: 12, 4: 9, 0: 7, 3: 7, 6: 6, 8: 5, 19: 4, 7: 4, 15: 3, 17: 3, 10: 1})
9 	 Counter({11: 30, 17: 5, 13: 2})
10 	 Counter({7: 318, 8: 263, 12: 148, 6: 87, 14: 64, 13: 26, 4: 22, 16: 15, 3: 14, 11: 8, 18: 4, 9: 4, 2: 3, 0: 2, 1: 2, 19: 2, 10: 1, 15: 1})
11 	 Counter({16: 50, 18: 18, 6: 10, 13: 9, 7: 8, 8: 8, 9: 5, 4: 5, 5: 3, 12: 2, 

In [67]:
topic_words=[]
for i in range(TOPICS):
    tt = lda_result.get_topic_terms(i,20)
    topic_words.append([dictionary[pair[0]] for pair in tt])
df_topwords=pd.DataFrame(topic_words)
# df_topwords=df_topwords.T
df_topwords

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,drive,lines,subject,organization,one,memory,mac,writes,system,hard,card,speed,apple,board,disk,ram,dont,use,bus,drives
1,apr,1993,message,gmt,93,inreplyto,lines,subject,organization,nntppostinghost,date,robert,19,16,14,15,mon,brian,fri,research
2,nntppostinghost,subject,organization,lines,corp,expressed,opinions,newssoftware,communications,vaxvms,vnews,141,pat,express,usa,keywords,writes,access,lab,article
3,organization,lines,subject,space,university,distribution,nasa,earth,science,engineering,pa,project,orbit,sci,sciences,mellon,carnegie,flight,station,denver
4,ed,microsystems,nc,wisconsin,sun,edward,david,wi,grateful,sphere,russell,egreeneastsuncom,dead,green,former,circle,whoever,nye,milwaukee,clinic
5,team,lines,subject,organization,game,games,university,4,3,1,play,7,6,hockey,5,season,teams,2,players,canada
6,subject,organization,lines,article,writes,univ,science,medical,soon,replyto,computer,gordon,disease,banks,pittsburgh,medicine,surrender,doctor,new,gebcspittedu
7,lines,subject,organization,institute,technology,writes,nntppostinghost,california,keith,article,guest,online,services,jon,michael,408,netcom,georgia,communication,allan
8,system,use,information,public,available,data,using,key,number,may,used,provide,us,two,order,large,also,time,systems,note
9,na,chip,lines,clipper,subject,distribution,organization,writes,encryption,government,white,article,house,secret,va,key,crypto,phone,algorithm,code


In [68]:
topic_name={0:4,1:8,2:14,3:14,4:8,5:10,6:18,7:0,8:11,9:11,10:7,11:16,12:15,13:1,14:14,15:17,17:9,18:14,19:5}

y_true = df[df['div']=='test']['label'].to_list()
y_pred = [*map(topic_name.get, pred)]

q_metrics(y_true, pred)
print(classification_report(y_true,y_pred))

purity_score: 0.30151354221986193
NMI: 0.3696974448692568
              precision    recall  f1-score   support

           0       0.19      0.03      0.04       319
           1       0.25      0.01      0.01       389
           2       0.00      0.00      0.00       394
           3       0.00      0.00      0.00       392
           4       0.42      0.30      0.35       385
           5       0.17      0.91      0.29       395
           6       0.00      0.00      0.00       390
           7       0.32      0.80      0.46       396
           8       0.41      0.03      0.05       398
           9       0.28      0.76      0.41       397
          10       0.74      0.34      0.46       399
          11       0.37      0.57      0.45       396
          12       0.00      0.00      0.00       393
          13       0.00      0.00      0.00       396
          14       0.53      0.18      0.27       394
          15       0.27      0.91      0.42       398
          16       0.38

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
