# "Topic Modeling: NPR news with Latent Dirichlet Allocation"


- title: "Topic Modeling: NPR news with Latent Dirichlet Allocation"
- toc: true
- badges: False
- comments: true
- author: Sam Treacy
- categories: [topic_modeling,  latent_dirichlet_allocation, python]

In [1]:
import numpy as np
import pandas as pd

In [109]:
npr = pd.read_csv('DATA/npr.csv')
npr.head(10)

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."
5,I did not want to join yoga class. I hated tho...
6,With a who has publicly supported the debunk...
7,"I was standing by the airport exit, debating w..."
8,"If movies were trying to be more realistic, pe..."
9,"Eighteen years ago, on New Year’s Eve, David F..."


In [4]:
npr.isnull().sum()

Article    0
dtype: int64

## Preprocess

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer. max_df=0.90 ignores terms that occur in more than
# 90% of the articles, min_df=2 ignores terms that occur in fewer than 2
# articles, and stop_words='english' drops scikit-learn's built-in English
# stop-word list.
cv = CountVectorizer(max_df=0.90, min_df=2, stop_words='english')


# Create Document Term Matrix (dtm): a sparse count matrix with one row per
# article and one column per vocabulary term.
dtm = cv.fit_transform(npr['Article'])

## Latent Dirichlet Allocation

In [70]:
from sklearn.decomposition import LatentDirichletAllocation

# Model with 7 topics; random_state is pinned so a re-run reproduces the same fit.
LDA = LatentDirichletAllocation(n_components=7, random_state=42)

In [71]:
LDA.fit(dtm)

LatentDirichletAllocation(n_components=7, random_state=42)

## Explore Stored Words

In [72]:
# Size of the vocabulary learned by the vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported replacement (requires >= 1.0).
len(cv.get_feature_names_out())

54777

In [73]:
len(LDA.components_)

7

In [74]:
LDA.components_.shape

(7, 54777)

In [75]:
LDA.components_.argmax(axis=0)

array([5, 0, 5, ..., 2, 2, 5])

In [76]:
single_topic = LDA.components_[0]

In [77]:
# Returns the indices that would sort this array.
single_topic.argsort()

array([ 2475, 18302, 35285, ..., 22673, 42561, 42993])

In [78]:
single_topic.argmin()

2475

In [79]:
# Weight of one of the least representative words for this topic. Note that
# index 18302 is the second entry of the argsort output above; the actual
# minimum sits at index 2475 (the argmin() result).
single_topic[18302]

0.14285714309286987

In [80]:
single_topic.argmax()

42993

In [81]:
# Weight of the word most representative of this topic (index 42993 is the
# argmax() result above).
single_topic[42993]

6247.245510521084

In [82]:
# Vocabulary indices of the top 10 words for this topic, in ascending order of
# weight (argsort is ascending, so the last index is the strongest word).
single_topic.argsort()[-10:]

array([33390, 36310, 21228, 10425, 31464,  8149, 36283, 22673, 42561,
       42993])

In [83]:
top_word_index = single_topic.argsort()[-10:]

In [84]:
# Build the vocabulary array once instead of on every loop iteration.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is the supported replacement (requires >= 1.0).
feature_names = cv.get_feature_names_out()

# Words are printed in ascending order of weight, so the last one is the
# strongest word for the topic.
for index in top_word_index:
    print(feature_names[index])

new
percent
government
company
million
care
people
health
said
says


In [85]:
# Build the vocabulary array once; the original rebuilt the full list for
# every word of every topic. get_feature_names() was removed in scikit-learn
# 1.2; get_feature_names_out() is the supported replacement (requires >= 1.0).
feature_names = cv.get_feature_names_out()

for i, topic in enumerate(LDA.components_):
    # argsort is ascending, so the last 15 indices are the highest-weighted
    # words; .tolist() yields plain Python strings so the printed list looks
    # the same as before.
    top_words = feature_names[topic.argsort()[-15:]].tolist()
    print(f'The top 15 words for topic {i}')
    print(top_words)
    print('\n')

The top 15 words for topic 0
['companies', 'money', 'year', 'federal', '000', 'new', 'percent', 'government', 'company', 'million', 'care', 'people', 'health', 'said', 'says']


The top 15 words for topic 1
['military', 'house', 'security', 'russia', 'government', 'npr', 'reports', 'says', 'news', 'people', 'told', 'police', 'president', 'trump', 'said']


The top 15 words for topic 2
['way', 'world', 'family', 'home', 'day', 'time', 'water', 'city', 'new', 'years', 'food', 'just', 'people', 'like', 'says']


The top 15 words for topic 3
['time', 'new', 'don', 'years', 'medical', 'disease', 'patients', 'just', 'children', 'study', 'like', 'women', 'health', 'people', 'says']


The top 15 words for topic 4
['voters', 'vote', 'election', 'party', 'new', 'obama', 'court', 'republican', 'campaign', 'people', 'state', 'president', 'clinton', 'said', 'trump']


The top 15 words for topic 5
['years', 'going', 've', 'life', 'don', 'new', 'way', 'music', 'really', 'time', 'know', 'think', 'peop

In [87]:
# Look up a single vocabulary term by its column index.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is the supported replacement (requires >= 1.0).
cv.get_feature_names_out()[10421]

'companies'

### Attaching Discovered Topic Labels to Original Articles

In [95]:
dtm

<11992x54777 sparse matrix of type '<class 'numpy.int64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [94]:
dtm.shape

(11992, 54777)

In [92]:
len(npr)

11992

In [96]:
# Per-article topic distribution: one row per article, one column per topic.
topic_results = LDA.transform(dtm)

In [97]:
topic_results.shape

(11992, 7)

In [103]:
topic_results[0]

array([1.61040465e-02, 6.83341493e-01, 2.25376318e-04, 2.25369288e-04,
       2.99652737e-01, 2.25479379e-04, 2.25497980e-04])

In [101]:
topic_results.argmax(axis=1)

array([1, 1, 1, ..., 3, 4, 0])

In [105]:
topic_results[7].argmax()

2

In [115]:
# Label each article with the index of its highest-scoring topic.
npr['Topic'] = topic_results.argmax(axis=1)

In [116]:
npr.head(10)

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1
4,"From photography, illustration and video, to d...",2
5,I did not want to join yoga class. I hated tho...,3
6,With a who has publicly supported the debunk...,3
7,"I was standing by the airport exit, debating w...",2
8,"If movies were trying to be more realistic, pe...",3
9,"Eighteen years ago, on New Year’s Eve, David F...",2


In [118]:
# Hand-written labels for the 7 topic indices, presumably chosen by reading
# each topic's top words. NOTE(review): these labels are tied to this
# particular fit; re-check them if the vectorizer or model settings change.
topics_mapping = {0:'Business', 1:'Security', 2:'Home', 3:'Health', 4:'Election', 5:'Culture', 6:'Education'}

# Translate each article's numeric topic index into its label.
npr['Subject'] = npr['Topic'].map(topics_mapping)

In [120]:
npr.tail(20)

Unnamed: 0,Article,Topic,Subject
11972,In a disappointment to Alzheimer’s patients an...,3,Health
11973,"In my early 20s, smitten by the mythic underpi...",5,Culture
11974,It’s been a lively year for social media maven...,1,Security
11975,This is not a review. It started out as one: I...,5,Culture
11976,"On a summer’s day in December, a warehouse in ...",2,Home
11977,"Elections aren’t exactly cozy, even in the bes...",5,Culture
11978,"Although her oldest child, Ben, is 10 years ol...",3,Health
11979,"When a political scandal explodes in France, t...",1,Security
11980,The darkest moment for American police this ye...,4,Election
11981,Russia was ordered to vacate two compounds it ...,2,Home


## Test LDA fit on unseen text

In [181]:
text = ['hospitals are recieving more patients ']

In [182]:
processed = cv.transform(text)

In [183]:
# argmax() without an axis works on the flattened array, so this returns the
# topic index only because a single document was transformed; use
# argmax(axis=1) when scoring several texts at once.
LDA.transform(processed).argmax()

3

In [174]:
topics_mapping

{0: 'Business',
 1: 'Security',
 2: 'Home',
 3: 'Health',
 4: 'Election',
 5: 'Culture',
 6: 'Education'}