In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize

import nltk
import re
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
ps = PorterStemmer()
from sklearn.decomposition import LatentDirichletAllocation

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [26]:
categories = ['alt.atheism', 'talk.religion.misc','comp.graphics', 'sci.space', 'sci.med','sci.space','soc.religion.christian']
newsgroups_train = fetch_20newsgroups(subset='train',categories=categories)

In [27]:
newsgroups_train

 'data': ['From: schumach@convex.com (Richard A. Schumacher)\nSubject: Re: DC-X update???\nNntp-Posting-Host: starman.convex.com\nOrganization: CONVEX Computer Corporation, Richardson, Tx., USA\nX-Disclaimer: This message was written by a user at CONVEX Computer\n              Corp. The opinions expressed are those of the user and\n              not necessarily those of CONVEX.\nLines: 32\n\nIn <1993Apr15.234154.23145@iti.org> aws@iti.org (Allen W. Sherzer) writes:\n\n>As for the future, there is at least $5M in next years budget for work\n>on SSRT. They (SDIO) have been looking for more funds and do seem to have\n>some. However, SDIO is not (I repeat, is not) going to fund an orbital\n>prototype. The best we can hope from them is to 1) keep it alive for\n>another year, and 2) fund a suborbital vehicle which MIGHT (with\n>major modifications) just make orbit. There is also some money for a\n>set of prototype tanks and projects to answer a few more open questions.\n\nWould the sub-orbit

In [28]:
print(len(list(newsgroups_train.target_names)))

7


In [29]:
len(newsgroups_train.filenames)

3227

DATA PREPROCESSING:

In [30]:
corpus = []
lem = WordNetLemmatizer()
for i in range(0,len(newsgroups_train.filenames)):
    review = newsgroups_train.data[i].split()
    review = [lem.lemmatize(word) for word in review if word not in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

TRAIN AND TEST THE MODEL

In [31]:
vectorizer = CountVectorizer(max_features=5000)
x_counts = vectorizer.fit_transform(corpus)

In [33]:
transformer = TfidfTransformer()
x_tfidf = transformer.fit_transform(x_counts)

In [34]:
xtfidf_norm = normalize(x_tfidf, norm='l1', axis=1)

In [35]:
#topics
num_topics=7
#NMF model.
model = NMF(n_components=num_topics, init='nndsvd')
#fit the model
model.fit(xtfidf_norm)

NMF(init='nndsvd', n_components=7)

In [36]:
def get_nmf_topics(model, n_top_words):
    
    #the word ids obtained need to be reverse-mapped to the words so we can print the topic names.
    feat_names = vectorizer.get_feature_names()
    
    word_dict = {}
    for i in range(num_topics):
        
        #for each topic, obtain the largest values, and add the words they map to into the dictionary.
        words_ids = model.components_[i].argsort()[:-n_top_words - 1:-1]
        words = [feat_names[key] for key in words_ids]
        word_dict['Topic # ' + '{:02d}'.format(i+1)] = words
    
    return pd.DataFrame(word_dict)

In [37]:
get_nmf_topics(model, 30)



Unnamed: 0,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05,Topic # 06,Topic # 07
0,god,pitt,edu,___,petch,nasa,keith
1,the,geb,university,__,grass,space,caltech
2,edu,gordon,thanks,uni,valley,access,sgi
3,com,banks,subject,de,chuck,gov,livesey
4,it,cs,from,baalke,tek,digex,edu
5,one,edu,file,____,daily,henry,solntze
6,people,pittsburgh,lines,polygon,verse,egalon,wpd
7,jesus,cadre,organization,reduction,ca,pat,schneider
8,in,dsl,posting,jpl,group,toronto,cco
9,would,shameful,nntp,_____,com,alaska,morality


In [38]:
np.sort(model.components_[0])[::-1]

array([0.29128338, 0.18673048, 0.18351616, ..., 0.        , 0.        ,
       0.        ])

In [42]:
#topics
num_topics=7
#LDA 
model = LatentDirichletAllocation(n_components=num_topics)
#fit the model
model.fit(xtfidf_norm)

LatentDirichletAllocation(n_components=7)

In [43]:
def get_lda_topics(model, n_top_words):
    
    #the word ids obtained need to be reverse-mapped to the words so we can print the topic names.
    feat_names = vectorizer.get_feature_names()
    
    word_dict = {}
    for i in range(num_topics):
        
        #for each topic, obtain the largest values, and add the words they map to into the dictionary.
        words_ids = model.components_[i].argsort()[:-n_top_words - 1:-1]
        words = [feat_names[key] for key in words_ids]
        word_dict['Topic # ' + '{:02d}'.format(i+1)] = words
    
    return pd.DataFrame(word_dict)

In [44]:
get_lda_topics(model, 30)



Unnamed: 0,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05,Topic # 06,Topic # 07
0,egalon,op_rows,almanac,centaur,petch,centaur,edu
1,oliveira,op_cols,frog,proton,grass,almanac,com
2,cray,row,245,542,valley,proton,the
3,langley,int,604,706,sister,542,from
4,larc,col,spite,30602,daily,706,subject
5,ring,catalog,centaur,n4tmi,chuck,smokeless,organization
6,stable,noise,64,0358,deeply,unified,lines
7,planned,operator,broken,7415,whoever,restraint,it
8,converted,improvement,island,trumpet,verse,abort,in
9,combined,terms,proton,meter,gold,argumentum,re
