# Latent Dirichlet Allocation algorithm

In [1]:
import pandas as pd

In [2]:
# Load the CSV of articles into a DataFrame; the path is relative to the
# notebook's working directory.
npr = pd.read_csv('./UPDATED_NLP_COURSE/05-Topic-Modeling/npr.csv')

In [3]:
# Preview the first rows to confirm the frame holds a single 'Article' text column.
npr.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [4]:
# Read one full article (row label 4000) as a sanity check of the raw text.
npr.loc[4000, 'Article']

'The headline shocked the   world of the surface Navy: Seven sailors aboard the destroyer USS Fitzgerald were killed, and other crew members injured, when the warship collided with a cargo vessel off Japan. As the Navy family grieves, both it and the wider world are asking the same question: How did this happen? The short answer is that no one knows  —   yet. Official inquiries into what led up to the encounter could take months or more. The Navy and the U. S. Coast Guard both likely will eventually issue reports that describe what happened and could make recommendations for preventing another such accident. ”I will not speculate on how long these investigations will last,” said Vice Adm. Joseph Aucoin, commander of the Navy’s 7th Fleet. The Fitzgerald and the other ships of Destroyer Squadron 15, based outside Tokyo, fall under his authority. There are clues, however, that explain how something like the Fitzgerald’s collision could happen, including photographs of the ships involved, 

## Preprocessing

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

Exclude high-frequency terms, very rare terms, and English stop words

In [8]:
# max_df (float or int): drop terms that are too common. A float is a
# proportion of documents (e.g., 0.9 ignores terms found in more than 90% of
# documents); an int is an absolute document count (e.g., 4).

# min_df (float or int): drop terms that are too rare. A float is a proportion
# of documents (e.g., 0.01); an int is an absolute document count (e.g., 2
# keeps only terms found in at least 2 documents).

# stop_words='english': exclude stop words like 'a', 'the', 'it', etc.
cv = CountVectorizer(max_df=0.9,min_df=2,stop_words='english')

In [9]:
# Learn the vocabulary from the articles and build the document-term matrix
# (one row per article, one column per vocabulary term).
dtm = cv.fit_transform(npr['Article'])

In [10]:
# Inspect the sparse matrix summary: shape is (documents, vocabulary terms).
dtm

<11992x54777 sparse matrix of type '<class 'numpy.int64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

## Train LDA model on documents

In [11]:
from sklearn.decomposition import LatentDirichletAllocation

In [12]:
# n_components: number of topics the model will learn.
# random_state: fixed seed so the fitted topics are repeatable across runs.
LDA = LatentDirichletAllocation(n_components=7,random_state=42)

In [14]:
# Fit the 7-topic model to the document-term matrix.
LDA.fit(dtm)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=7, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=42, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

Grab the vocabulary of words

In [15]:
# Vocabulary size; it matches the number of columns in the document-term matrix.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out() — confirm the installed version before re-running.
len(cv.get_feature_names())

54777

In [16]:
# The vocabulary comes back as a plain Python list, so it can be indexed by
# column position.
type(cv.get_feature_names())

list

In [20]:
import random

# Pick a random vocabulary index. randrange() excludes its upper bound, so the
# result is always a valid position; randint(0, 54777) includes 54777, which is
# one past the last index of the 54777-word vocabulary and would raise
# IndexError when used for a lookup.
random_word_id = random.randrange(len(cv.get_feature_names()))

# Look up the word at a fixed position so the displayed result is repeatable;
# swap 10 for random_word_id to sample a random word instead.
cv.get_feature_names()[10]

'01'

Grab the topics

In [21]:
# Number of rows in components_: one per topic.
len(LDA.components_)

7

In [22]:
# components_ is stored as a NumPy array.
type(LDA.components_)

numpy.ndarray

In [23]:
# Shape is (topics, vocabulary terms).
LDA.components_.shape

(7, 54777)

In [24]:
# Per-topic word weights: each row is a topic, each column a vocabulary term.
LDA.components_

array([[8.64332806e+00, 2.38014333e+03, 1.42900522e-01, ...,
        1.43006821e-01, 1.42902042e-01, 1.42861626e-01],
       [2.76191749e+01, 5.36394437e+02, 1.42857148e-01, ...,
        1.42861973e-01, 1.42857147e-01, 1.42906875e-01],
       [7.22783888e+00, 8.24033986e+02, 1.42857148e-01, ...,
        6.14236247e+00, 2.14061364e+00, 1.42923753e-01],
       ...,
       [3.11488651e+00, 3.50409655e+02, 1.42857147e-01, ...,
        1.42859912e-01, 1.42857146e-01, 1.42866614e-01],
       [4.61486388e+01, 5.14408600e+01, 3.14281373e+00, ...,
        1.43107628e-01, 1.43902481e-01, 2.14271779e+00],
       [4.93991422e-01, 4.18841042e+02, 1.42857151e-01, ...,
        1.42857146e-01, 1.43760101e-01, 1.42866201e-01]])

In [25]:
# Take the word-weight row for the first topic (topic #0).
single_topic = LDA.components_[0]

In [26]:
# Vocabulary indices ordered from the lowest weight to the highest for this topic.
single_topic.argsort()

array([ 2475, 18302, 35285, ..., 22673, 42561, 42993], dtype=int64)

In [27]:
import numpy as np

In [28]:
# Small example array used to illustrate how argsort() behaves.
arr = np.array([10,200,1])

In [29]:
# Show the example array.
arr

array([ 10, 200,   1])

In [31]:
# argsort() returns the indices that would sort the array in ascending order:
# value 1 is at index 2, value 10 at index 0, value 200 at index 1.
arr.argsort()

array([2, 0, 1], dtype=int64)

In [34]:
# argsort() returns index positions ordered from the smallest value to the
# largest, so the 10 highest-weighted words of the topic are its last 10
# entries (still in ascending order of weight).
top_ten_words = single_topic.argsort()[-10:]  # grab the last 10 values of argsort()

In [36]:
# Translate the top-10 vocabulary indices back into words. get_feature_names()
# returns the entire vocabulary list on every call, so fetch it once instead of
# once per printed word.
feature_names = cv.get_feature_names()
for index in top_ten_words:
    print(feature_names[index])

new
percent
government
company
million
care
people
health
said
says


In [38]:
# Same idea as the top-10 slice, widened to the 20 highest-weighted words of
# the topic (ascending order of weight).
top_twenty_words = single_topic.argsort()[-20:]  # grab the last 20 values of argsort()

In [39]:
# Translate the top-20 vocabulary indices back into words. get_feature_names()
# returns the entire vocabulary list on every call, so fetch it once instead of
# once per printed word.
feature_names = cv.get_feature_names()
for index in top_twenty_words:
    print(feature_names[index])

president
state
tax
insurance
trump
companies
money
year
federal
000
new
percent
government
company
million
care
people
health
said
says


Grab the highest probability words per topic

In [41]:
# Print the 15 highest-weighted words of every topic (ascending order of weight).
# The vocabulary list is fetched once up front: calling get_feature_names()
# inside the comprehension would rebuild it 15 times per topic.
feature_names = cv.get_feature_names()
for i,topic in enumerate(LDA.components_):
    print(f"The top 15 words for topic #{i}")
    # word_id (not i) so the comprehension does not shadow the topic counter.
    print([feature_names[word_id] for word_id in topic.argsort()[-15:]])
    print('\n')
    print('\n')

The top 15 words for topic #0
['companies', 'money', 'year', 'federal', '000', 'new', 'percent', 'government', 'company', 'million', 'care', 'people', 'health', 'said', 'says']




The top 15 words for topic #1
['military', 'house', 'security', 'russia', 'government', 'npr', 'reports', 'says', 'news', 'people', 'told', 'police', 'president', 'trump', 'said']




The top 15 words for topic #2
['way', 'world', 'family', 'home', 'day', 'time', 'water', 'city', 'new', 'years', 'food', 'just', 'people', 'like', 'says']




The top 15 words for topic #3
['time', 'new', 'don', 'years', 'medical', 'disease', 'patients', 'just', 'children', 'study', 'like', 'women', 'health', 'people', 'says']




The top 15 words for topic #4
['voters', 'vote', 'election', 'party', 'new', 'obama', 'court', 'republican', 'campaign', 'people', 'state', 'president', 'clinton', 'said', 'trump']




The top 15 words for topic #5
['years', 'going', 've', 'life', 'don', 'new', 'way', 'music', 'really', 'time', 'know'

In [42]:
# Re-display the document-term matrix: its row count is the number of articles
# that will each receive a topic.
dtm

<11992x54777 sparse matrix of type '<class 'numpy.int64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [43]:
# Show the article frame before the topic column is added.
npr

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."
5,I did not want to join yoga class. I hated tho...
6,With a who has publicly supported the debunk...
7,"I was standing by the airport exit, debating w..."
8,"If movies were trying to be more realistic, pe..."
9,"Eighteen years ago, on New Year’s Eve, David F..."


## Add a topic column to the pandas DataFrame

In [44]:
# Compute each article's topic mixture: one row per article, one column per topic.
topic_results = LDA.transform(dtm)

In [45]:
# Shape is (articles, topics).
topic_results.shape

(11992, 7)

In [46]:
# Topic weights for the first article.
topic_results[0]

array([1.61040465e-02, 6.83341493e-01, 2.25376318e-04, 2.25369288e-04,
       2.99652737e-01, 2.25479379e-04, 2.25497980e-04])

In [47]:
# Same weights rounded to two decimals so the dominant topics are easy to read.
topic_results[0].round(2)

array([0.02, 0.68, 0.  , 0.  , 0.3 , 0.  , 0.  ])

In [49]:
# Index of the highest-weighted topic for the first article.
topic_results[0].argmax()

1

In [50]:
# Label every article with its highest-weighted topic: argmax along axis 1
# picks the top topic index within each article's row.
npr['Topic'] = topic_results.argmax(axis=1)

In [51]:
# Show the article frame with the new 'Topic' column.
npr

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1
4,"From photography, illustration and video, to d...",2
5,I did not want to join yoga class. I hated tho...,3
6,With a who has publicly supported the debunk...,3
7,"I was standing by the airport exit, debating w...",2
8,"If movies were trying to be more realistic, pe...",3
9,"Eighteen years ago, on New Year’s Eve, David F...",2
