# LDA: Latent Dirichlet Allocation With Python

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Location of the NPR articles CSV (one article per row, in the "Article" column).
DATA_PATH = r"C:\Users\HARDIK\NLP END TO END\NLP_COURSE_HELP\05-Topic-Modeling\npr.csv"

df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [2]:
# Count missing values per column (isna is the canonical spelling of isnull)
df.isna().sum()

Article    0
dtype: int64

In [3]:
# (number of articles, number of columns)
df.shape

(11992, 1)

In [6]:
# Character count of the article stored under row label 0
len(df.loc[0, 'Article'])

7646

### Preprocessing

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_df = 0.95,         # drop words that are too common (appear in more than 95% of the documents)
                     min_df = 2,            # drop words that are too rare (appear in fewer than 2 documents)
                     stop_words='english')  # drop common English stop words
# min_df = 2    : a word must show up in at least 2 documents to be kept in the vocabulary
# max_df = 0.95 : a word that shows up in more than 95% of the documents is discarded

As this is <b>unsupervised learning</b>, we are not doing any train/test split.

In [8]:
# Learn the vocabulary from the articles and build the document-term count matrix
doc_tm = cv.fit_transform(df['Article'])

doc_tm

<11992x54777 sparse matrix of type '<class 'numpy.int64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

### Implementing LDA

In [9]:
from sklearn.decomposition import LatentDirichletAllocation

LDA = LatentDirichletAllocation(n_components=7,            # number of topics to extract
                                random_state=42)           # fixed seed so the fitted topics are reproducible

# Fit the topic model on the document-term matrix
LDA.fit(doc_tm)

LatentDirichletAllocation(n_components=7, random_state=42)

### Important 3 Steps

- (1) Grab the vocab of the words
- (2) Grab the topics
- (3) Grab the highest probability words per topic 

### 1.) Grab the Vocabulary of words

In [10]:
# Vocabulary size (number of distinct words kept by the CountVectorizer).
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# on newer versions call cv.get_feature_names_out() instead.
len(cv.get_feature_names())

54777

In [11]:
# Container type used for the vocabulary
type(cv.get_feature_names())

list

In [12]:
# Word stored at vocabulary index 50000
cv.get_feature_names()[50000]

'transcribe'

In [13]:
# Word stored at vocabulary index 41000
cv.get_feature_names()[41000]

'reproductive'

In [16]:
# Pick one random word from the vocabulary

import random

# Fetch the vocabulary once. get_feature_names() was removed in scikit-learn 1.2,
# so prefer get_feature_names_out() and fall back only on older versions.
feature_names = (cv.get_feature_names_out() if hasattr(cv, "get_feature_names_out")
                 else cv.get_feature_names())

# randrange excludes its upper bound, so the index is always valid.
# random.randint(0, 54777) includes 54777, which is one past the last
# position of a 54777-word vocabulary and would raise IndexError.
random_word_id = random.randrange(len(feature_names))

# str() keeps the displayed value a plain Python string on every version
str(feature_names[random_word_id])

'dispelling'

### 2.) Grab the Topics

In [17]:
# Number of rows in the topic-word matrix (one row per topic)
len(LDA.components_)

7

In [18]:
# Container type of the topic-word matrix
type(LDA.components_)

numpy.ndarray

In [19]:
# (number of topics, vocabulary size)
LDA.components_.shape

(7, 54777)

In [20]:
# Raw topic-word weights: one row per topic, one column per vocabulary word
LDA.components_

array([[8.64332806e+00, 2.38014333e+03, 1.42900522e-01, ...,
        1.43006821e-01, 1.42902042e-01, 1.42861626e-01],
       [2.76191749e+01, 5.36394437e+02, 1.42857148e-01, ...,
        1.42861973e-01, 1.42857147e-01, 1.42906875e-01],
       [7.22783888e+00, 8.24033986e+02, 1.42857148e-01, ...,
        6.14236247e+00, 2.14061364e+00, 1.42923753e-01],
       ...,
       [3.11488651e+00, 3.50409655e+02, 1.42857147e-01, ...,
        1.42859912e-01, 1.42857146e-01, 1.42866614e-01],
       [4.61486388e+01, 5.14408600e+01, 3.14281373e+00, ...,
        1.43107628e-01, 1.43902481e-01, 2.14271779e+00],
       [4.93991422e-01, 4.18841042e+02, 1.42857151e-01, ...,
        1.42857146e-01, 1.43760101e-01, 1.42866201e-01]])

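Now we have to combine two things:

- `LDA.components_` — the weight of every word in every topic
- `cv.get_feature_names()` — the vocabulary, which maps a word id (column index) back to the actual word

i.e. the column indices of `LDA.components_` are looked up in the vocabulary in order to show you

<b>the highest-probability words per topic</b>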

In [21]:
# Word weights of the first topic (topic 0)
single_topic = LDA.components_[0]

In [22]:
single_topic.argsort()      # indices that would sort the weights from smallest to largest

array([ 2475, 18302, 35285, ..., 22673, 42561, 42993], dtype=int64)

In [23]:
# Tiny example array used to illustrate what argsort returns
arr = np.asarray([10, 200, 1])

arr

array([ 10, 200,   1])

In [24]:
# Indices that sort arr ascending: 1 (index 2) < 10 (index 0) < 200 (index 1)
arr.argsort()

array([2, 0, 1], dtype=int64)

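`argsort` returns the index positions that would sort the array in ascending order: index 2 (value 1), then index 0 (value 10), then index 1 (value 200).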

In [25]:
# argsort returns index positions ordered from the smallest to the largest value,
# so the last 10 positions are the 10 largest weights (the largest comes last)

single_topic.argsort()[-10:]  # word ids of the 10 greatest values

array([33390, 36310, 21228, 10425, 31464,  8149, 36283, 22673, 42561,
       42993], dtype=int64)

In [30]:
# Word ids (not the words themselves) of the 20 largest weights, in ascending order of weight
top_20_words = single_topic.argsort()[-20:]

In [31]:
# Fetch the vocabulary once instead of requesting the whole list again for every word.
# get_feature_names() was removed in scikit-learn 1.2, so prefer get_feature_names_out()
# and fall back only on older versions.
feature_names = (cv.get_feature_names_out() if hasattr(cv, "get_feature_names_out")
                 else cv.get_feature_names())

# top_20_words is in ascending order of weight, so the strongest word is printed last
for word_id in top_20_words:
    print(feature_names[word_id])

president
state
tax
insurance
trump
companies
money
year
federal
000
new
percent
government
company
million
care
people
health
said
says


#### loop for top 15 words for each of the 7 topics

### 3.) Grab the highest probability words per topic 

In [33]:
# Fetch the vocabulary once instead of requesting the whole list again for every printed word.
# get_feature_names() was removed in scikit-learn 1.2, so prefer get_feature_names_out()
# and fall back only on older versions.
feature_names = (cv.get_feature_names_out() if hasattr(cv, "get_feature_names_out")
                 else cv.get_feature_names())

for topic_id, topic in enumerate(LDA.components_):
    # argsort is ascending, so the last 15 ids are the 15 heaviest words (strongest last)
    top_word_ids = topic.argsort()[-15:]
    print(f"THE TOP 15 WORDS FOR TOPIC #{topic_id}")
    # str() keeps the printed list as plain quoted words on every NumPy version
    print([str(feature_names[word_id]) for word_id in top_word_ids])
    print('\n \n')

THE TOP 15 WORDS FOR TOPIC #0
['companies', 'money', 'year', 'federal', '000', 'new', 'percent', 'government', 'company', 'million', 'care', 'people', 'health', 'said', 'says']

 

THE TOP 15 WORDS FOR TOPIC #1
['military', 'house', 'security', 'russia', 'government', 'npr', 'reports', 'says', 'news', 'people', 'told', 'police', 'president', 'trump', 'said']

 

THE TOP 15 WORDS FOR TOPIC #2
['way', 'world', 'family', 'home', 'day', 'time', 'water', 'city', 'new', 'years', 'food', 'just', 'people', 'like', 'says']

 

THE TOP 15 WORDS FOR TOPIC #3
['time', 'new', 'don', 'years', 'medical', 'disease', 'patients', 'just', 'children', 'study', 'like', 'women', 'health', 'people', 'says']

 

THE TOP 15 WORDS FOR TOPIC #4
['voters', 'vote', 'election', 'party', 'new', 'obama', 'court', 'republican', 'campaign', 'people', 'state', 'president', 'clinton', 'said', 'trump']

 

THE TOP 15 WORDS FOR TOPIC #5
['years', 'going', 've', 'life', 'don', 'new', 'way', 'music', 'really', 'time', 'know', 'think', 'people', 'just', 'like']

***These are the highest-probability words for each topic. Based on them, the topics can be labelled as:***

- 0 : Health care
- 1 : Military
- 2 : Infrastructure
- 3 : General Health
- 4 : Election
- 5 : Life style
- 6 : Education

### Attach these topic numbers to the original articles:

In [34]:
# Document-term matrix produced by cv.fit_transform (rows = articles, columns = vocabulary words)
doc_tm

<11992x54777 sparse matrix of type '<class 'numpy.int64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [35]:
# Original articles, before any topic label is attached
df

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."
...,...
11987,The number of law enforcement officers shot an...
11988,"Trump is busy these days with victory tours,..."
11989,It’s always interesting for the Goats and Soda...
11990,The election of Donald Trump was a surprise to...


In [36]:
# Topic distribution of every article: one row per article, one column per topic
topic_results = LDA.transform(doc_tm)

topic_results.shape

(11992, 7)

In [37]:
# Topic distribution of the first article (one value per topic)
topic_results[0]

array([1.61040465e-02, 6.83341493e-01, 2.25376318e-04, 2.25369288e-04,
       2.99652737e-01, 2.25479379e-04, 2.25497980e-04])

***Now we are looking at the probability of a document belonging to each topic***

In [38]:
# Same distribution rounded to 2 decimals for readability
topic_results[0].round(2)

array([0.02, 0.68, 0.  , 0.  , 0.3 , 0.  , 0.  ])

For the very first article (index position 0), the highest probability is around 68%, and it belongs to topic number 1.

In [40]:
# df['Article'][1]

In [41]:
# argmax returns the index position of the highest probability, i.e. the most likely topic number

topic_results[0].argmax()

1

In [42]:
# Label every article with its most probable topic (argmax across the topic columns of each row)
df['Topic'] = topic_results.argmax(axis=1)

In [43]:
# Articles together with their assigned topic number
df

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1
4,"From photography, illustration and video, to d...",2
...,...,...
11987,The number of law enforcement officers shot an...,1
11988,"Trump is busy these days with victory tours,...",4
11989,It’s always interesting for the Goats and Soda...,3
11990,The election of Donald Trump was a surprise to...,4


In [45]:
# To see what topic 5 represents, list its highest-probability words.

single_topic = LDA.components_[5]

# Word ids of the 20 largest weights; argsort is ascending, so the strongest word is last
top_words = single_topic.argsort()[-20:]

# Fetch the vocabulary once instead of requesting the whole list again for every word.
# get_feature_names() was removed in scikit-learn 1.2, so prefer get_feature_names_out()
# and fall back only on older versions.
feature_names = (cv.get_feature_names_out() if hasattr(cv, "get_feature_names_out")
                 else cv.get_feature_names())

for word_id in top_words:
    print(feature_names[word_id])

book
world
says
things
story
years
going
ve
life
don
new
way
music
really
time
know
think
people
just
like
