## Looking at the National Public Radio Dataset

In [1]:
import pandas as pd
# Read the NPR articles from the CSV file in the working directory.
npr = pd.read_csv('npr.csv')
# Preview the first five rows (head() defaults to n=5).
npr.head(n=5)


Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [6]:
npr.shape[0]  # Number of rows (articles) in the DataFrame

11992

## Accessing a sample article (row 550) to see the type of articles present in this dataset

In [8]:
# Full text of the article stored at row label 550.
npr.loc[550, 'Article']

'The Trump administration is pushing forward with plans for two major oil pipelines in the U. S. projects that sparked nationwide demonstrations and legal fights under President Barack Obama. President Trump signed documents inviting the company behind the Keystone XL pipeline to resubmit a proposal for the project, which the Obama administration rejected in 2015, and instructing the Army to expedite the review and approval process for the section of the Dakota Access Pipeline that hasn’t been built. ”We’re going to renegotiate some of the terms, and if they’d like, we’ll see if they can get the pipeline built,” Trump said of the Keystone XL pipeline. ”This not a done deal,” Bill McKibben of the group 350. org, which has lobbied against pipelines for years, said in a statement. He called the pipelines ”unwise and immoral” because they contribute to climate change. Trump also signed a document requesting a federal plan to incentivize the use of U. S.  pipes for pipeline projects. The lo

## Since our task is to group articles by some topic, we will now create clusters of articles for grouping

## Preprocessing the Data

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer with vocabulary pruning.
cv = CountVectorizer(
    max_df=0.95,           # drop terms appearing in more than 95% of documents
    min_df=2,              # drop terms appearing in fewer than 2 documents
    stop_words='english',  # remove scikit-learn's built-in English stop words
)

In [12]:
# Learn the vocabulary from the 'Article' column and count term occurrences
# per article in a single pass.
dtm = cv.fit_transform(npr['Article'])
# dtm is a Document-Term Matrix: one row per article, one column per
# vocabulary term, stored as a sparse matrix.

In [13]:
# Show the sparse-matrix summary (dtype, stored elements, shape) rather than its contents.
dtm

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 3033388 stored elements and shape (11992, 54777)>

## Performing Latent Dirichlet Allocation

In [14]:
from sklearn.decomposition import LatentDirichletAllocation
# Configure a 7-topic LDA model with a fixed seed so the fit is repeatable.
LDA = LatentDirichletAllocation(
    n_components=7,
    random_state=45,
)
# Fit the topic model on the document-term matrix.
LDA.fit(dtm)


## Showing Stored Words

In [16]:
# Size of the vocabulary learned by the vectorizer.
cv.get_feature_names_out().shape[0]

54777

## For qualitative inspection of tokenization and filtering, 10 random words were sampled from the vocabulary learned by CountVectorizer

In [17]:
import random

# Fetch the vocabulary once; calling get_feature_names_out() inside the loop
# would re-create the whole array on every iteration.
feature_names = cv.get_feature_names_out()

# Print 10 randomly chosen vocabulary words (sampled with replacement).
for _ in range(10):
    # Derive the index range from the vocabulary itself rather than a
    # hard-coded 54776, so the cell stays correct if the vocabulary size
    # changes (different corpus, min_df/max_df, or stop-word settings).
    random_word_id = random.randrange(len(feature_names))
    print(feature_names[random_word_id])

superior
denigrate
repulsive
pollock
mehldau
downscale
replicated
starchy
somewhat
dumbest


## We made 7 clusters (topics) from our corpus

In [18]:
# Number of topics = number of rows in the topic-word matrix.
LDA.components_.shape[0]

7

In [20]:
LDA.components_ # This is a matrix of shape (n_components, n_features)
# Each row corresponds to a topic, and each column corresponds to a word in the vocabulary.
# The entries are unnormalized topic-word weights (pseudo-counts, per the
# scikit-learn documentation), not probabilities: many displayed values are far above 1.

array([[1.43179543e-01, 2.48978752e+02, 1.42962108e-01, ...,
        1.42926884e-01, 1.42857152e-01, 2.11855427e+00],
       [6.28176999e+00, 1.80659394e+03, 1.42924166e-01, ...,
        2.49393533e-01, 2.14085638e+00, 1.43207782e-01],
       [3.97261592e-01, 8.63267228e+02, 1.42857151e-01, ...,
        6.03479239e+00, 1.43328259e-01, 1.42857152e-01],
       ...,
       [4.80093362e+01, 1.00631441e+02, 3.14268512e+00, ...,
        1.43854658e-01, 1.44383243e-01, 1.65616665e-01],
       [1.25227246e+01, 1.47621004e+02, 1.42857151e-01, ...,
        1.42857147e-01, 1.42857147e-01, 1.43125339e-01],
       [1.72170349e+00, 9.27027130e+02, 1.42857152e-01, ...,
        1.42857148e-01, 1.42857148e-01, 1.43074802e-01]])

In [21]:
LDA.components_.shape[1]  # Number of words in the vocabulary

54777

In [22]:
# Word weights for topic #0 (the first row of the topic-word matrix).
single_topic = LDA.components_[0]

In [23]:
single_topic.argsort() 
# The result is an array of vocabulary indices ordered by ascending weight in
# this topic: the first index is the lowest-weight word, the last is the highest.

array([18302, 44967, 35285, ..., 42749, 52786,  9767])

In [30]:
# Weight of the word least representative of the topic.
# argsort() is ascending, so position 0 holds the index of the lowest-weight
# word. Looking it up here avoids a hard-coded index copied from an earlier
# output, which would silently go stale if the model or vocabulary changed.
single_topic[single_topic.argsort()[0]]

np.float64(0.14285714344849482)

In [29]:
# Weight of the word most representative of this topic.
# argsort() is ascending, so the last position holds the index of the
# highest-weight word. Looking it up here avoids a hard-coded index copied
# from an earlier output, which would silently go stale if the model or
# vocabulary changed.
single_topic[single_topic.argsort()[-1]]

np.float64(4637.2147441296)

In [31]:
# Vocabulary indices of the top 10 words in this topic.
# The slice keeps ascending order, so the most representative word is last.
single_topic.argsort()[-10:]

array([39320, 13431, 35843, 52782, 42561, 36310, 46581, 42749, 52786,
        9767])

In [32]:
# Keep the indices of the 10 highest-weight words (ascending by weight) for lookup below.
top_word_indices = single_topic.argsort()[-10:]

In [33]:
# Resolve the vocabulary once instead of re-creating it on every iteration.
feature_names = cv.get_feature_names_out()

# Print the topic's top words, least to most representative.
for index in top_word_indices:
    print(feature_names[index])

race
democratic
party
vote
said
percent
state
sanders
voters
clinton


### These words look like they belong to a politics topic

### Prior to assigning topic labels via .transform() on the vectorized document matrix, we first examine the full set of 7 extracted topics.

In [34]:
# Resolve the vocabulary once; the original looked it up again for every
# printed word (7 topics x 15 words), re-creating the full array each time.
feature_names = cv.get_feature_names_out()

# For each topic, list its 15 highest-weight words.
for index, topic in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    # argsort() is ascending, so the last 15 indices are the highest-weight
    # words, with the most representative word printed last.
    print([feature_names[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['won', 'just', 'new', 'win', 'election', 'race', 'democratic', 'party', 'vote', 'said', 'percent', 'state', 'sanders', 'voters', 'clinton']


THE TOP 15 WORDS FOR TOPIC #1
['time', 'million', 'make', 'said', 'years', '000', 'year', 'companies', 'just', 'like', 'new', 'percent', 'company', 'people', 'says']


THE TOP 15 WORDS FOR TOPIC #2
['researchers', 'time', 'research', 'university', 'disease', 'years', 'just', 'new', 'study', 'water', 'health', 'like', 'food', 'people', 'says']


THE TOP 15 WORDS FOR TOPIC #3
['attack', 'killed', 'military', 'according', 'years', 'told', 'country', 'government', 'war', 'reports', 'city', 'police', 'people', 'said', 'says']


THE TOP 15 WORDS FOR TOPIC #4
['ve', 'music', 'years', 'don', 'life', 'way', 'new', 'really', 'know', 'think', 'time', 'people', 'says', 'just', 'like']


THE TOP 15 WORDS FOR TOPIC #5
['election', 'political', 'told', 'donald', 'news', 'new', 'people', 'clinton', 'white', 'obama', 'house', 'campa

### Attaching Discovered Topic Labels to Original Articles

In [35]:
# Compute the per-document topic distribution for every article in the corpus.
topic_results = LDA.transform(dtm)
# One row per article, one column per topic.
topic_results.shape

(11992, 7)

In [36]:
# Topic distribution for the first article (row 0, all topic columns).
topic_results[0, :]

array([2.25456392e-04, 2.26062589e-04, 2.25414124e-04, 1.09634591e-02,
       2.25418845e-04, 9.87908704e-01, 2.25485322e-04])

In [37]:
# Same distribution rounded to two decimals for readability.
topic_results[0].round(decimals=2)

array([0.  , 0.  , 0.  , 0.01, 0.  , 0.99, 0.  ])

In [38]:
# Index of the topic with the highest probability for the first article;
# this is the value used as that article's topic label.
topic_results[0].argmax() # This gives the index of the topic with the highest probability for the first article

np.int64(5)

### This means that the first article (row index 0) belongs to topic 5

In [39]:
# Re-display the first five rows before the topic labels are attached.
npr.head(n=5)

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


### Now we will combine topic labels with our original data

In [40]:
# For every article (row), take the column index of its highest-probability topic.
topic_results.argmax(axis=1)

array([5, 5, 5, ..., 2, 0, 1])

In [41]:
# Label each article with its most probable topic.
# This adds (or overwrites) the 'Topic' column on npr in place.
npr['Topic'] = topic_results.argmax(axis=1)

In [44]:
# Show the first 15 articles together with their assigned topic labels.
npr.head(n=15)

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",5
1,Donald Trump has used Twitter — his prefe...,5
2,Donald Trump is unabashedly praising Russian...,5
3,"Updated at 2:50 p. m. ET, Russian President Vl...",5
4,"From photography, illustration and video, to d...",4
5,I did not want to join yoga class. I hated tho...,4
6,With a who has publicly supported the debunk...,2
7,"I was standing by the airport exit, debating w...",4
8,"If movies were trying to be more realistic, pe...",2
9,"Eighteen years ago, on New Year’s Eve, David F...",4
