I applied the (amazing) work from Ahmed Besbes to this dataset:

http://ahmedbesbes.com/how-to-mine-newsfeed-data-and-extract-interactive-insights-in-python.html

In [3]:
# import packages
import requests
import pandas as pd
from datetime import datetime
from tqdm import tqdm
from matplotlib import pyplot as plt
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
# list of stopwords like articles, preposition
from string import punctuation
from collections import Counter
import re
stop = set(stopwords.words('english'))

In [4]:
data = pd.read_csv("C:/Users/Admin/Desktop/Machine Learning/News-aggregator.csv",nrows=10000)

In [5]:
data.shape

(10000, 8)

In [6]:
print('data shape:', data.shape)

data shape: (10000, 8)


In [7]:
data.CATEGORY.value_counts().plot(kind='bar', grid=True, figsize=(16, 9))

<matplotlib.axes._subplots.AxesSubplot at 0xd56d3b0>

In [8]:
# remove rows with empty Titles
data = data[~data['TITLE'].isnull()]

In [9]:
data['len'] = data['TITLE'].map(len)

Tokenization

In [10]:
def tokenizer(text):
    tokens_ = [word_tokenize(sent) for sent in sent_tokenize(text)]

    tokens = []
    for token_by_sent in tokens_:
        tokens += token_by_sent

    tokens = list(filter(lambda t: t.lower() not in stop, tokens))
    tokens = list(filter(lambda t: t not in punctuation, tokens))
    tokens = list(filter(lambda t: t not in [u"'s", u"n't", u"...", u"''", u'``', 
                                        u'\u2014', u'\u2026', u'\u2013'], tokens))
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)

    filtered_tokens = list(map(lambda token: token.lower(), filtered_tokens))

    return filtered_tokens


In [11]:
data['tokens'] = data['TITLE'].map(tokenizer)

In [12]:
for title, tokens in zip(data['TITLE'].head(5), data['tokens'].head(5)):
    print('title:', title)
    print('tokens:', tokens)
    print() 

title: Fed official says weak data caused by weather, should not slow taper
tokens: ['fed', 'official', 'says', 'weak', 'data', 'caused', 'weather', 'slow', 'taper']

title: Fed's Charles Plosser sees high bar for change in pace of tapering
tokens: ['fed', 'charles', 'plosser', 'sees', 'high', 'bar', 'change', 'pace', 'tapering']

title: US open: Stocks fall after Fed official hints at accelerated tapering
tokens: ['us', 'open', 'stocks', 'fall', 'fed', 'official', 'hints', 'accelerated', 'tapering']

title: Fed risks falling 'behind the curve', Charles Plosser says
tokens: ['fed', 'risks', 'falling', "'behind", 'curve', 'charles', 'plosser', 'says']

title: Fed's Plosser: Nasty Weather Has Curbed Job Growth
tokens: ['fed', 'plosser', 'nasty', 'weather', 'curbed', 'job', 'growth']



In [13]:
def keywords(category):
    tokens = data[data['CATEGORY'] == category]['tokens']
    alltokens = []
    for token_list in tokens:
        alltokens += token_list
    counter = Counter(alltokens)
    return counter.most_common(10)

In [14]:
for category in set(data['CATEGORY']):
    print('category :', category)
    print('top 10 keywords:', keywords(category))
    print('---')

category : m
top 10 keywords: [('test', 177), ('alzheimer', 170), ('blood', 156), ('drug', 98), ('predict', 80), ('study', 73), ('company', 64), ('cancer', 64), ('health', 60), ('may', 59)]
---
category : e
top 10 keywords: [('bieber', 421), ('bachelor', 402), ('justin', 396), ('juan', 389), ('pablo', 387), ("'the", 264), ('selena', 256), ('gomez', 247), ('finale', 233), ('season', 227)]
---
category : b
top 10 keywords: [('china', 351), ('us', 259), ('gold', 215), ('stocks', 214), ('bank', 212), ('ukraine', 193), ('mortgage', 141), ('herbalife', 140), ('crush', 139), ('data', 135)]
---
category : t
top 10 keywords: [('titanfall', 347), ('xbox', 254), ('google', 224), ('one', 221), ('snowden', 211), ('gm', 193), ('new', 149), ('recall', 147), ('ios', 140), ('apple', 138)]
---


TFIDF

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# min_df is minimum number of documents that contain a term t
# max_features is maximum number of unique tokens (across documents) that we'd consider
# TfidfVectorizer preprocesses the descriptions using the tokenizer we defined above

vectorizer = TfidfVectorizer(min_df=10, max_features=10000, tokenizer=tokenizer, ngram_range=(1, 2))
vz = vectorizer.fit_transform(list(data['TITLE']))

In [16]:
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
tfidf = pd.DataFrame(columns=['tfidf']).from_dict(dict(tfidf), orient='index')
tfidf.columns = ['tfidf']

In [17]:
tfidf.tfidf.hist(bins=50, figsize=(15,7))

<matplotlib.axes._subplots.AxesSubplot at 0xd56d3b0>

In [18]:
tfidf.sort_values(by=['tfidf'], ascending=True).head(30)

Unnamed: 0,tfidf
new,4.028355
bieber,4.182162
us,4.182162
bachelor,4.241733
justin,4.244294
juan,4.246861
pablo,4.252016
juan pablo,4.254603
justin bieber,4.316038
china,4.352507


In [19]:
tfidf.sort_values(by=['tfidf'], ascending=False).head(30)

Unnamed: 0,tfidf
pot sales,7.812545
deadly,7.812545
ban gujarat,7.812545
stap,7.812545
deposition videos,7.812545
lohan docu-series,7.812545
speed reading,7.812545
deletes,7.812545
ltd.,7.812545
bank mortgage,7.812545


In [18]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=50, random_state=0)
svd_tfidf = svd.fit_transform(vz)

In [19]:
from sklearn.manifold import TSNE

tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
tsne_tfidf = tsne_model.fit_transform(svd_tfidf)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 10000 samples in 0.090s...
[t-SNE] Computed neighbors for 10000 samples in 14.761s...
[t-SNE] Computed conditional probabilities for sample 1000 / 10000
[t-SNE] Computed conditional probabilities for sample 2000 / 10000
[t-SNE] Computed conditional probabilities for sample 3000 / 10000
[t-SNE] Computed conditional probabilities for sample 4000 / 10000
[t-SNE] Computed conditional probabilities for sample 5000 / 10000
[t-SNE] Computed conditional probabilities for sample 6000 / 10000
[t-SNE] Computed conditional probabilities for sample 7000 / 10000
[t-SNE] Computed conditional probabilities for sample 8000 / 10000
[t-SNE] Computed conditional probabilities for sample 9000 / 10000
[t-SNE] Computed conditional probabilities for sample 10000 / 10000
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 67.277939
[t-SNE] Error after 1000 iterations: 0.627873


In [22]:
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

In [23]:
output_notebook()
plot_tfidf = bp.figure(plot_width=700, plot_height=600, title="tf-idf clustering of the news",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

In [24]:
tfidf_df = pd.DataFrame(tsne_tfidf, columns=['x', 'y'])
tfidf_df['title'] = data['TITLE']
tfidf_df['category'] = data['CATEGORY']

In [25]:
plot_tfidf.scatter(x='x', y='y', source=tfidf_df)
hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips={"title": "@title", "category":"@category"}
show(plot_tfidf)

In [26]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

from sklearn.cluster import MiniBatchKMeans

num_clusters = 30
kmeans_model = MiniBatchKMeans(n_clusters=num_clusters, init='k-means++', n_init=1, 
                         init_size=1000, batch_size=1000, verbose=False, max_iter=1000)
kmeans = kmeans_model.fit(vz)
kmeans_clusters = kmeans.predict(vz)
kmeans_distances = kmeans.transform(vz)

In [27]:
for (i, desc),category in zip(enumerate(data.TITLE),data['CATEGORY']):
    if(i < 5):
        print("Cluster " + str(kmeans_clusters[i]) + ": " + desc + 
              "(distance: " + str(kmeans_distances[i][kmeans_clusters[i]]) + ")")
        print('category: ',category)
        print('---')

Cluster 7: Fed official says weak data caused by weather, should not slow taper(distance: 0.995780836149)
category:  b
---
Cluster 7: Fed's Charles Plosser sees high bar for change in pace of tapering(distance: 0.998700015808)
category:  b
---
Cluster 7: US open: Stocks fall after Fed official hints at accelerated tapering(distance: 0.994984544484)
category:  b
---
Cluster 7: Fed risks falling 'behind the curve', Charles Plosser says(distance: 0.998529365093)
category:  b
---
Cluster 7: Fed's Plosser: Nasty Weather Has Curbed Job Growth(distance: 0.998704222589)
category:  b
---


In [28]:
sorted_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(num_clusters):
    print("Cluster %d:" % i)
    aux = ''
    for j in sorted_centroids[i, :10]:
        aux += terms[j] + ' | '
    print(aux)
    print() 

Cluster 0:
admits | shocking | star | deposition | justin bieber | justin | bieber | 48gb | install | pc | 

Cluster 1:
dunham | lena | lena dunham | snl | naked | 'girls | 'snl | molestation | night live | night | 

Cluster 2:
chiquita brands | brands | international | chiquita | fyffes | new york | york | housewives new | merger | inc. | 

Cluster 3:
titanfall xbox | resolution | ready | xbox one | xbox | one | titanfall | release | update | 792p | 

Cluster 4:
happy | ecb | euro | update | man | allow | truck | food | rise | data | 

Cluster 5:
wearhouse | jos | men | men wearhouse | a. | jos a. | bank | a. bank | deal | billion | 

Cluster 6:
bmo | fixed | better | interest rates | interest | deal | rates | may | dirty | dirty dancing | 

Cluster 7:
new | china | us | gm | titanfall | recall | google | gold | stocks | ukraine | 

Cluster 8:
young | neil young | neil | music | player | kickstarter | pono | ponomusic | music player | launch | 

Cluster 9:
sick | medication | boy | gi

In [29]:
tsne_kmeans = tsne_model.fit_transform(kmeans_distances)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 10000 samples in 0.282s...
[t-SNE] Computed neighbors for 10000 samples in 8.962s...
[t-SNE] Computed conditional probabilities for sample 1000 / 10000
[t-SNE] Computed conditional probabilities for sample 2000 / 10000
[t-SNE] Computed conditional probabilities for sample 3000 / 10000
[t-SNE] Computed conditional probabilities for sample 4000 / 10000
[t-SNE] Computed conditional probabilities for sample 5000 / 10000
[t-SNE] Computed conditional probabilities for sample 6000 / 10000
[t-SNE] Computed conditional probabilities for sample 7000 / 10000
[t-SNE] Computed conditional probabilities for sample 8000 / 10000
[t-SNE] Computed conditional probabilities for sample 9000 / 10000
[t-SNE] Computed conditional probabilities for sample 10000 / 10000
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 70.457863
[t-SNE] Error after 1000 iterations: 0.705513


In [30]:
import numpy as np

colormap = np.array(["#6d8dca", "#69de53", "#723bca", "#c3e14c", "#c84dc9", "#68af4e", "#6e6cd5",
"#e3be38", "#4e2d7c", "#5fdfa8", "#d34690", "#3f6d31", "#d44427", "#7fcdd8", "#cb4053", "#5e9981",
"#803a62", "#9b9e39", "#c88cca", "#e1c37b", "#34223b", "#bdd8a3", "#6e3326", "#cfbdce", "#d07d3c",
"#52697d", "#7d6d33", "#d27c88", "#36422b", "#b68f79"])

plot_kmeans = bp.figure(plot_width=700, plot_height=600, title="KMeans clustering of the news",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

In [31]:
kmeans_df = pd.DataFrame(tsne_kmeans, columns=['x', 'y'])
kmeans_df['cluster'] = kmeans_clusters
kmeans_df['title'] = data['TITLE']
kmeans_df['category'] = data['CATEGORY']

In [96]:
data_china=data[(data['TITLE'].str.contains('China')) & data['CATEGORY'].str.contains('b')]

In [93]:
for category in set(data['CATEGORY']):
    for keyword in keywords(category):
        print(keyword)

<class 'tuple'>
('china', 351)
<class 'tuple'>
('us', 259)
<class 'tuple'>
('gold', 215)
<class 'tuple'>
('stocks', 214)
<class 'tuple'>
('bank', 212)
<class 'tuple'>
('ukraine', 193)
<class 'tuple'>
('mortgage', 141)
<class 'tuple'>
('herbalife', 140)
<class 'tuple'>
('crush', 139)
<class 'tuple'>
('data', 135)
<class 'tuple'>
('titanfall', 347)
<class 'tuple'>
('xbox', 254)
<class 'tuple'>
('google', 224)
<class 'tuple'>
('one', 221)
<class 'tuple'>
('snowden', 211)
<class 'tuple'>
('gm', 193)
<class 'tuple'>
('new', 149)
<class 'tuple'>
('recall', 147)
<class 'tuple'>
('ios', 140)
<class 'tuple'>
('apple', 138)
<class 'tuple'>
('test', 177)
<class 'tuple'>
('alzheimer', 170)
<class 'tuple'>
('blood', 156)
<class 'tuple'>
('drug', 98)
<class 'tuple'>
('predict', 80)
<class 'tuple'>
('study', 73)
<class 'tuple'>
('company', 64)
<class 'tuple'>
('cancer', 64)
<class 'tuple'>
('health', 60)
<class 'tuple'>
('may', 59)
<class 'tuple'>
('bieber', 421)
<class 'tuple'>
('bachelor', 402)
<cl

In [83]:
d["string6"]

KeyError: 'string6'