# Modeling Topics for Numerous Articles

In [21]:
#import modules
from pathlib import Path
import numpy as np
import pandas as pd
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
# location of the input data, given as a relative path
DATAPATH = Path("data")

## Exploratory Data Analysis

In [3]:
# read npr.csv from the data directory into a dataframe and preview the first rows
df = pd.read_csv(DATAPATH.joinpath("npr.csv"))
df.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [4]:
# peek at the first 1000 characters of the article stored under index label 0
df["Article"][0][:1000]

'In the Washington of 2016, even when the policy can be bipartisan, the politics cannot. And in that sense, this year shows little sign of ending on Dec. 31. When President Obama moved to sanction Russia over its alleged interference in the U. S. election just concluded, some Republicans who had long called for similar or more severe measures could scarcely bring themselves to approve. House Speaker Paul Ryan called the Obama measures ”appropriate” but also ”overdue” and ”a prime example of this administration’s ineffective foreign policy that has left America weaker in the eyes of the world.” Other GOP leaders sounded much the same theme. ”[We have] been urging President Obama for years to take strong action to deter Russia’s worldwide aggression, including its   operations,” wrote Rep. Devin Nunes,  . chairman of the House Intelligence Committee. ”Now with just a few weeks left in office, the president has suddenly decided that some stronger measures are indeed warranted.” Appearing 

In [5]:
# how many articles does the corpus contain?
len(df["Article"])

11992

## Data Preparation & Modeling

In [6]:
class LemmaTokenizer(object):
    """Callable tokenizer that splits a text into tokens and lemmatizes each one.

    An instance can be handed to a vectorizer as its ``tokenizer``: calling it
    with a string returns the list of lemmatized tokens.
    """

    def __init__(self):
        # one lemmatizer instance is created up front and reused for every call
        self.wnl = WordNetLemmatizer()

    def __call__(self, articles):
        """Tokenize ``articles`` (a single text) and return its lemmatized tokens."""
        tokens = word_tokenize(articles)
        # lemmatize() is called with its default arguments for every token
        return list(map(self.wnl.lemmatize, tokens))

In [8]:
# init the countvectorizer object
cv = CountVectorizer(lowercase=True,
                     stop_words='english',
                     tokenizer=LemmaTokenizer(),
                     # a custom tokenizer replaces the regex-based one, so the default
                     # token_pattern is never used; passing None states that explicitly
                     # and avoids scikit-learn's "token_pattern will not be used" warning
                     token_pattern=None,
                     ngram_range=(1,1),   # single words only
                     max_df=.9,           # ignore terms found in more than 90% of the articles
                     min_df=.05,          # ignore terms found in fewer than 5% of the articles
                     max_features=1000    # cap the vocabulary at the 1000 most frequent terms
                     )

In [9]:
# learn the vocabulary from the articles and build the document-term matrix in one step
dtm = cv.fit_transform(df["Article"])



In [10]:
# report the container type and the dimensions of the document-term matrix
print(f"type: {type(dtm)}")
print(f"shape: {dtm.shape}")

type: <class 'scipy.sparse.csr.csr_matrix'>
shape: (11992, 1000)


In [11]:
# eyeball the raw counts - mostly zeros (high sparsity) expected
# NOTE(review): toarray() materialises the whole matrix as a dense array and the
# truncated repr does not quantify sparsity; a ratio based on dtm.nnz would - consider switching
dtm.toarray()

array([[1, 0, 4, ..., 0, 0, 4],
       [0, 0, 0, ..., 1, 0, 7],
       [0, 0, 0, ..., 0, 0, 4],
       ...,
       [0, 1, 1, ..., 0, 1, 6],
       [0, 0, 0, ..., 0, 1, 2],
       [1, 0, 0, ..., 0, 0, 2]])

In [12]:
# get the last 10 terms of the learned vocabulary
# get_feature_names() is deprecated/removed in newer scikit-learn releases; use
# get_feature_names_out() like the rest of this notebook (list() keeps the list display)
list(cv.get_feature_names_out()[-10:])



['writes',
 'writing',
 'written',
 'wrong',
 'wrote',
 'year',
 'yes',
 'york',
 'young',
 '—']

In [13]:
# set up the topic model: 10 topics, fixed seed for reproducible results,
# n_jobs=-1 to use all available processors
LDA = LatentDirichletAllocation(
    n_components=10,
    n_jobs=-1,
    random_state=123,
)

In [14]:
# learn the topics from the document-term matrix
LDA.fit(X=dtm)

LatentDirichletAllocation(n_jobs=-1, random_state=123)

In [19]:
# inspect the fitted per-topic term weights (first 10 rows)
LDA.components_[0:10]

array([[8.36515743e+01, 1.28269818e+02, 1.83806835e+02, ...,
        4.56414776e+02, 1.00010983e-01, 2.25336935e+03],
       [1.19685913e+03, 6.02875484e+01, 8.92456287e+03, ...,
        8.48952203e+02, 6.89577727e+02, 1.28795068e+04],
       [4.58775135e-01, 5.85194503e+01, 1.09265883e+02, ...,
        2.12916494e+02, 3.58941038e+01, 1.42100334e+03],
       ...,
       [4.77541197e+01, 8.16367884e+00, 2.20746821e+02, ...,
        5.26390041e+01, 4.84360543e+02, 4.53431364e+03],
       [9.81906155e+01, 2.58054938e+02, 5.42337740e+01, ...,
        4.50846143e+02, 2.73468847e+02, 4.15665121e+03],
       [2.12523172e+01, 2.65087426e+02, 3.34368177e+01, ...,
        8.31035959e+01, 6.17717113e+01, 1.20310074e+03]])

In [17]:
# report the container type and the dimensions of the fitted topic matrix
print(f"type:  {type(LDA.components_)}")
print(f"shape:  {LDA.components_.shape}")

type:  <class 'numpy.ndarray'>
shape:  (10, 1000)


In [37]:
# column indices of the top_n highest-weighted terms of one topic
# (argsort sorts ascending, so the strongest term comes last)
topic_num, top_n = 0, 3
np.argsort(LDA.components_[topic_num])[-top_n:]

array([769, 675, 920])

In [55]:
# map each top index back to its vocabulary term
# the feature-name array is fetched once instead of being rebuilt on every iteration
feature_names = cv.get_feature_names_out()
for ind in LDA.components_[topic_num].argsort()[-top_n:]:
    print('index: ', ind)
    print('term: ', feature_names[ind])
    

index:  769
term:  said
index:  675
term:  president
index:  920
term:  trump


In [56]:
# same lookup as a list; the feature-name array is fetched once, not once per index
feature_names = cv.get_feature_names_out()
[feature_names[ind] for ind in LDA.components_[topic_num].argsort()[-top_n:]]

['said', 'president', 'trump']

In [59]:
def get_topic_terms(lda_obj, cv_obj, topic_num=0, top_n=3):
    """Return the ``top_n`` highest-weighted terms of a single topic.

    Parameters
    ----------
    lda_obj
        Fitted latent Dirichlet allocation object; its ``components_`` attribute
        is indexed by ``topic_num`` to obtain that topic's term weights.
    cv_obj
        Fitted count-vectorizer object; ``get_feature_names_out()`` must return
        the terms in the same column order as the weights.
    topic_num : int, default 0
        Row of ``components_`` (i.e. the topic) to inspect.
    top_n : int, default 3
        Number of terms to return; if it exceeds the vocabulary size, all
        terms are returned.

    Returns
    -------
    list
        The selected terms in ascending order of weight (strongest term last).
        Empty when ``top_n`` is 0.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")
    if top_n == 0:
        # guard needed: the slice [-0:] is [0:], which would return every term
        return []
    # fetch the vocabulary once rather than once per selected index
    feature_names = cv_obj.get_feature_names_out()
    # argsort is ascending, so the last top_n positions hold the largest weights
    top_indices = lda_obj.components_[topic_num].argsort()[-top_n:]
    return [feature_names[ind] for ind in top_indices]

In [70]:
# ten highest-weighted terms of topic 2
get_topic_terms(
    lda_obj=LDA,
    cv_obj=cv,
    topic_num=2,
    top_n=10,
)

['federal',
 'ha',
 'justice',
 'state',
 'said',
 'woman',
 'case',
 'wa',
 'law',
 'court']