# Modeling Topics for Numerous Articles

## Check Azure Machine Learning Connection

In [1]:
# Import the azureml-core package and report which SDK version is installed
import azureml.core
print("Ready to use Azure ML", azureml.core.VERSION)

# Connect to the workspace via Workspace.from_config()
from azureml.core import Workspace
ws = Workspace.from_config()
print('Workspace', ws.name, "loaded")

# View Azure compute resources.
# `ws.compute_targets` is read exactly once and iterated as (name, target) pairs;
# the previous version re-read the property on every iteration just to look up
# the key it was already iterating over.
# NOTE(review): the property presumably queries the service on each access - confirm.
print("Compute Resources:")
for compute_name, compute in ws.compute_targets.items():
    print("\t", compute.name, ':', compute.type)

Ready to use Azure ML 1.41.0
Workspace sd_ml loaded
Compute Resources:
	 myCluster : AmlCompute
	 myComputer : ComputeInstance


## Import Modules

In [2]:
# import modules
from pathlib import Path
import numpy as np
import pandas as pd
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pickle

In [3]:
# Relative directories used by the notebook: DATAPATH is where the input CSV is
# read from, MODELPATH is where the fitted models are pickled to.
DATAPATH, MODELPATH = Path('data'), Path('model')

## Exploratory Data Analysis

In [4]:
# Read the article corpus from CSV into a DataFrame and preview the first rows
articles_csv = DATAPATH / "npr.csv"
df = pd.read_csv(articles_csv)
df.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [5]:
# Preview the first 1000 characters of the article stored under index label 0
first_article = df.Article[0]
first_article[:1000]

'In the Washington of 2016, even when the policy can be bipartisan, the politics cannot. And in that sense, this year shows little sign of ending on Dec. 31. When President Obama moved to sanction Russia over its alleged interference in the U. S. election just concluded, some Republicans who had long called for similar or more severe measures could scarcely bring themselves to approve. House Speaker Paul Ryan called the Obama measures ”appropriate” but also ”overdue” and ”a prime example of this administration’s ineffective foreign policy that has left America weaker in the eyes of the world.” Other GOP leaders sounded much the same theme. ”[We have] been urging President Obama for years to take strong action to deter Russia’s worldwide aggression, including its   operations,” wrote Rep. Devin Nunes,  . chairman of the House Intelligence Committee. ”Now with just a few weeks left in office, the president has suddenly decided that some stronger measures are indeed warranted.” Appearing 

In [6]:
# Number of entries in the Article column
len(df["Article"])

11992

## Data Preparation

In [7]:
# Custom tokenizer class; an instance is handed to CountVectorizer as `tokenizer`
class LemmaTokenizer:
    """Callable that tokenizes a text, lemmatizes each token and keeps only
    purely alphabetic lemmas.

    NOTE(review): the topic terms printed further down in this notebook contain
    'wa' and 'ha'. Presumably these are 'was' / 'has' after lemmatization, which
    then no longer match the English stop-word list - confirm and decide whether
    stop words should be removed before lemmatizing.
    """

    def __init__(self):
        # A single lemmatizer instance is reused for every text
        self.wnl = WordNetLemmatizer()

    def __call__(self, articles):
        """Return the alphabetic lemmas of one text.

        Parameters
        ----------
        articles : str
            The text to tokenize; it is passed straight to ``word_tokenize``.

        Returns
        -------
        list
            Lemmas, in token order, for which ``isalpha()`` is true.
        """
        lemmas = (self.wnl.lemmatize(token) for token in word_tokenize(articles))
        return [lemma for lemma in lemmas if lemma.isalpha()]

In [8]:
# Configure the bag-of-words vectorizer; options are collected in one mapping
# so each setting can carry its own explanation.
vectorizer_options = {
    'lowercase': True,              # lowercase the text before tokenizing
    'stop_words': 'english',        # built-in English stop-word list
    'tokenizer': LemmaTokenizer(),  # custom lemmatizing tokenizer
    'ngram_range': (1, 1),          # unigrams only
    'max_df': .9,                   # ignore terms found in more than 90% of documents
    'min_df': .05,                  # ignore terms found in fewer than 5% of documents
    'max_features': 1000,           # upper bound on the vocabulary size
}
cv = CountVectorizer(**vectorizer_options)

In [9]:
# Learn the vocabulary from the articles and build the document-term matrix
dtm = cv.fit_transform(df["Article"])



In [10]:
# Report the container type and the dimensions of the document-term matrix
for label, value in (('type:', type(dtm)), ('shape:', dtm.shape)):
    print(label, value)

type: <class 'scipy.sparse.csr.csr_matrix'>
shape: (11992, 979)


In [11]:
# check sparsity - high sparsity expected
# NOTE(review): toarray() materializes the whole matrix as a dense array only to
# display a truncated preview; it does not quantify sparsity. A scalar measure
# would be 1 - dtm.nnz / (dtm.shape[0] * dtm.shape[1]).
dtm.toarray()

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0]])

In [12]:
# Show the last ten vocabulary terms.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and is what get_topic_terms() in
# this notebook already calls. tolist() keeps the displayed value a plain
# Python list of strings, as before.
cv.get_feature_names_out()[-10:].tolist()



['writer',
 'writes',
 'writing',
 'written',
 'wrong',
 'wrote',
 'year',
 'yes',
 'york',
 'young']

## Modeling

In [13]:
# Configure the LDA topic model; the tunable values are named so they are easy
# to find and change.
N_TOPICS = 10      # number of topics to extract
RANDOM_SEED = 123  # fixed seed so repeated fits give the same topics
LDA = LatentDirichletAllocation(
    n_components=N_TOPICS,
    random_state=RANDOM_SEED,
    n_jobs=-1,  # -1 = use all available CPU cores
)

In [14]:
# fit the LDA object on the document-term matrix `dtm`
LDA.fit(dtm)

LatentDirichletAllocation(n_jobs=-1, random_state=123)

In [15]:
# have a look at the fit result, i.e. the topics: components_ holds one row per
# topic and one column per vocabulary term. The model has 10 topics, so [:10]
# selects every row.
LDA.components_[:10]

array([[1.43195650e+02, 2.34134679e+02, 6.01013378e+02, ...,
        6.82072223e+00, 5.01368642e+02, 1.06639117e-01],
       [1.14387199e+02, 1.98434510e+02, 1.47439641e+02, ...,
        5.62634844e+01, 1.61976624e+02, 1.43655510e+03],
       [2.98953324e+01, 3.09574025e+02, 4.07509224e+02, ...,
        1.75718095e+01, 7.06604170e+01, 1.03160090e+02],
       ...,
       [2.77909753e+02, 6.15394106e+02, 2.30462459e+02, ...,
        6.10713898e+01, 1.86539814e+02, 1.35200342e+02],
       [1.10928781e+02, 1.37682785e+02, 6.74706397e+01, ...,
        8.77417950e+01, 6.71249813e+02, 9.21314930e+00],
       [3.63136927e+01, 7.67916003e+01, 2.43411914e+01, ...,
        6.45430157e+01, 3.26636876e+02, 3.04070900e+02]])

In [16]:
# Report the container type and the dimensions of the fitted topic-term matrix
topic_term_matrix = LDA.components_
print('type: ', type(topic_term_matrix))
print('shape: ', topic_term_matrix.shape)

type:  <class 'numpy.ndarray'>
shape:  (10, 979)


## Model Evaluation

In [17]:
def get_article_topic(lda_obj, article_num, doc_term_matrix=None):
    """Return the index of the highest-scoring topic for one article.

    Parameters
    ----------
    lda_obj : object
        Fitted topic model exposing ``transform``.
    article_num : int
        Row of the document-term matrix that belongs to the article.
    doc_term_matrix : optional
        Matrix the article row is read from. When omitted, the notebook-level
        ``dtm`` is used, which keeps existing two-argument calls working while
        letting callers score rows of any other matrix explicitly.

    Returns
    -------
    The position of the largest value in ``lda_obj.transform(row)``, as
    returned by ``np.argmax``.
    """
    if doc_term_matrix is None:
        # Backward-compatible fallback to the global matrix built in this notebook
        doc_term_matrix = dtm
    topic_scores = lda_obj.transform(doc_term_matrix[article_num])
    return np.argmax(topic_scores)

def get_topic_terms(lda_obj, cv_obj, topic_num=0, top_n=3):
    """Return the ``top_n`` highest-weighted terms of one topic.

    Parameters
    ----------
    lda_obj : object
        Fitted latent Dirichlet allocation object exposing ``components_``.
    cv_obj : object
        Fitted count-vectorizer object exposing ``get_feature_names_out()``.
    topic_num : int, default 0
        Row of ``lda_obj.components_`` to inspect.
    top_n : int, default 3
        Number of terms to return.

    Returns
    -------
    list
        The terms ordered by ascending weight, i.e. the strongest term is last.
        An empty list when ``top_n`` is zero or negative.
    """
    # Guard: a slice of [-0:] would select the *entire* vocabulary, not nothing
    if top_n <= 0:
        return []

    # Fetch the vocabulary once instead of once per selected index
    feature_names = cv_obj.get_feature_names_out()
    top_indices = lda_obj.components_[topic_num].argsort()[-top_n:]
    return [feature_names[ind] for ind in top_indices]

In [21]:
# Look up the dominant topic of one article and list that topic's top terms
article_num = 0  # select article number
top_n = 5  # select top n terms
topic_num = get_article_topic(LDA, article_num)
topic_message = 'The topic for article number {} is {}'
print(topic_message.format(article_num, topic_num))

# NOTE(review): the terms printed below come from the topic's term weights, not
# from the article text itself - the message wording is kept unchanged here.
top_terms = get_topic_terms(LDA, cv, topic_num, top_n)
terms_message = 'The top {} terms in article number {} are: \n {}'
print(terms_message.format(top_n, article_num, top_terms))

The topic for article number 0 is 8
The top 5 terms in article number 0 are: 
 ['clinton', 'wa', 'president', 'said', 'trump']


In [26]:
# First 500 characters of the article stored under index label 0
df["Article"][0][:500]

'In the Washington of 2016, even when the policy can be bipartisan, the politics cannot. And in that sense, this year shows little sign of ending on Dec. 31. When President Obama moved to sanction Russia over its alleged interference in the U. S. election just concluded, some Republicans who had long called for similar or more severe measures could scarcely bring themselves to approve. House Speaker Paul Ryan called the Obama measures ”appropriate” but also ”overdue” and ”a prime example of this '

In [32]:
# For each of the first 10 articles: print a 200-character snippet, the
# article's dominant topic, and the top terms of that topic
top_n = 10  # select top n terms
for article_num in range(10):
    print('Article #{}:'.format(article_num))
    print(df["Article"][article_num][:200])

    topic_num = get_article_topic(LDA, article_num)
    top_terms = get_topic_terms(LDA, cv, topic_num, top_n)

    print('Article number #{} contains topic #{}'.format(article_num, topic_num))
    print('The top {} terms for topic #{} are: \n {}'.format(top_n, topic_num, top_terms))
    print('\n')


Article #0:
In the Washington of 2016, even when the policy can be bipartisan, the politics cannot. And in that sense, this year shows little sign of ending on Dec. 31. When President Obama moved to sanction Russ
Article number #0 contains topic #8
The top 10 terms for topic #8 are: 
 ['republican', 'obama', 'house', 'campaign', 'ha', 'clinton', 'wa', 'president', 'said', 'trump']


Article #1:
  Donald Trump has used Twitter  —   his preferred means of communication  —   to weigh in on a swath of foreign policy issues over the past few weeks. His comments give a glimpse into how his incomin
Article number #1 contains topic #8
The top 10 terms for topic #8 are: 
 ['republican', 'obama', 'house', 'campaign', 'ha', 'clinton', 'wa', 'president', 'said', 'trump']


Article #2:
  Donald Trump is unabashedly praising Russian President Vladimir Putin, a day after outgoing President Obama issued tough sanctions against the country in response to alleged cyberattacks intended to
Article number 

## Save trained models

In [20]:
# Make sure the output directory exists; open(..., 'wb') does not create it
MODELPATH.mkdir(parents=True, exist_ok=True)

# save trained cv model
# The context manager guarantees the file is flushed and closed, even if
# pickling raises; the previous bare open() left the handle to the garbage collector.
# NOTE: pickle stores classes by reference, so LemmaTokenizer must be importable
# or defined under the same name wherever this file is loaded.
with open(MODELPATH / 'cv_model', 'wb') as cv_file:
    pickle.dump(cv, cv_file)

# save trained LDA model
with open(MODELPATH / 'lda_model', 'wb') as lda_file:
    pickle.dump(LDA, lda_file)