In [530]:
import json_utils as ju
import datetime
import pandas as pd
import numpy as np
import nltk.cluster
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.corpus import stopwords
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from IPython.display import clear_output

In [437]:
# Config
# Number of k-means clusters to fit over the document vectors.
cluster_num = 10

# Scraped article dumps to load, in order.
jsonfiles = [
    'chicagotribune0.json',
    'chicagotribune1.json',
    'chicagotribune2.json',
    'chicagotribune3.json',
]

# English stop words, held in a set for fast membership tests during tokenizing.
stops = set(stopwords.words('english'))

In [374]:
# Function defs
def article_tokenize(article):
    """Turn a parsed article into a list of lowercase, stop-word-free tokens.

    The second element of ``article`` (minus its last item) is stringified and
    split into runs of word characters.
    NOTE(review): the last item appears to carry the date, judging by
    extract_article_date -- confirm against ju.get_article_text.

    Parameters
    ----------
    article : sequence
        Parsed article with at least two elements.

    Returns
    -------
    list of str
        Lowercased tokens with English stop words removed.
    """
    article_clean = str(article[1:][0][:-1])
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(article_clean)
    # Lowercase BEFORE filtering: the stop word list is lowercase, so checking
    # the raw token would let capitalised stop words ("The", "And") through.
    lowered = (tok.lower() for tok in tokens)
    return [tok for tok in lowered if tok not in stops]

def extract_article_date(article):
    """Return index 1 of the last item in the article's second element.

    NOTE(review): presumably this is the publication date string emitted by
    the article parser -- confirm against ju.get_article_text.
    """
    body = article[1]
    last_item = body[-1]
    return last_item[1]

In [376]:
# Load articles from JSON
# `jsons` is a one-shot iterator that the cleaning loop consumes. Do not call
# next() on it here: that pulls the first article off the iterator and it is
# then silently missing from the corpus.
jsons = ju.get_jsons(jsonfiles,'')

In [377]:
# Clean articles
# Builds two parallel lists: one token list and one date entry per article
# that parsed successfully.
article_tokens, parsed_dates = [], []
for article in jsons:
    article_temp = ju.get_article_text(article['html'])

    # Parse failures always have length 1; 2+ otherwise -- skip the failures.
    if len(article_temp) <= 1:
        continue

    tokens = article_tokenize(article_temp)
    article_tokens.append(tokens)

    # Use an empty placeholder when no date is available so that
    # parsed_dates stays index-aligned with article_tokens.
    article_date = extract_article_date(article_temp) if len(article_temp[1]) > 1 else ''
    parsed_dates.append(article_date)

print("Tribune article parsing complete.")
parsed_dates = pd.to_datetime(parsed_dates)



  soup = BeautifulSoup(html)


Tribune article parsing complete.




In [412]:
# Join tokens to create documents
# One space-separated string per article, in the same order as article_tokens.
text = list(map(' '.join, article_tokens))

In [435]:
# Doc2Vec modeling
# Tag every document with its position in `text` so that trained vectors can
# be mapped back to articles.
tagged_documents = [TaggedDocument(words=word_tokenize(_d), tags=[str(i)])
                    for i, _d in enumerate(text)]

max_epochs = 50
vector_size = 50
alpha = 0.025

model = Doc2Vec(dm=1,
                vector_size=vector_size,
                alpha=alpha,
                min_alpha=0.0025,
                min_count=2)

model.build_vocab(tagged_documents)

# Train with a single call. train() already makes `epochs` passes over the
# corpus and decays the learning rate linearly from alpha to min_alpha.
# Calling it in a manual loop with epochs=model.epochs and hand-adjusted alpha
# multiplies the number of passes and defeats the built-in decay schedule.
print('Training for ' + str(max_epochs) + ' epochs...')
model.train(tagged_documents,
            total_examples=model.corpus_count,
            epochs=max_epochs)

model.save("news.model")

Training iteration 50 of 50...


In [486]:
# Infer a Doc2Vec vector for every document
# infer_vector expects a list of word tokens. Passing the joined string makes
# it iterate over single characters (and newer gensim releases reject a str
# outright), so tokenize exactly as the training corpus was tokenized.
document_vectors = [model.infer_vector(word_tokenize(doc)) for doc in text]

In [None]:
# Generate clusters
# K-means over the document vectors using cosine distance; the clusterer is
# run with 20 repeats and asked to avoid empty clusters.
kmeans = nltk.cluster.KMeansClusterer(
    cluster_num,
    distance=nltk.cluster.util.cosine_distance,
    repeats=20,
    avoid_empty_clusters=True,
)

# assign_clusters=True returns one cluster label per input vector.
clusters = kmeans.cluster(document_vectors, assign_clusters=True)

# Number of articles assigned to each cluster label.
cluster_counts = pd.Series(clusters).value_counts()

In [586]:
# Describe clusters by most frequent words
# top_term_list[i] holds the 50 most frequent tokens of cluster i.
cluster_labels = np.array(clusters)
top_term_list = []
# Iterate over every configured cluster id rather than len(cluster_counts):
# if a cluster ends up empty, the count table is shorter than cluster_num and
# positions would no longer line up with cluster ids.
for i in range(cluster_num):
    docs = np.where(cluster_labels == i)[0]
    # Index the Python list directly: the token lists have different lengths,
    # and building a NumPy array from ragged lists fails on recent NumPy.
    terms = [article_tokens[j] for j in docs]
    # dtype=object keeps an empty cluster from triggering dtype inference issues.
    term_counts = pd.Series([item for sublist in terms for item in sublist],
                            dtype=object).value_counts()
    # value_counts() is already sorted by descending frequency.
    top_terms = term_counts.head(50)
    top_term_list.append(top_terms)
    

In [521]:
# Cluster time series
# One row per article, indexed by publication time and sorted chronologically.
# Built as a single chain (no inplace mutation) so re-running the cell is safe.
clusterDF = (
    pd.DataFrame({'Cluster': clusters, 'Time': parsed_dates,
                  'Month': parsed_dates.to_period('M'),
                  'Week': parsed_dates.to_period('W'),
                  'Article Count': 1})
    .sort_values(by='Time')
    .set_index('Time')
)

# Weekly article count per cluster. Select 'Article Count' explicitly:
# a bare .sum() also tries to aggregate the Period-typed 'Month' column,
# which raises on recent pandas instead of being silently dropped.
cluster_time_series = clusterDF.groupby(['Week', 'Cluster'])[['Article Count']].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,Article Count
Week,Cluster,Unnamed: 2_level_1
2013-12-16/2013-12-22,9,1
2014-10-06/2014-10-12,7,1
2015-02-16/2015-02-22,7,1
2015-04-13/2015-04-19,6,1
2015-04-13/2015-04-19,8,2
2015-06-15/2015-06-21,7,2
2015-07-06/2015-07-12,7,1
2015-08-31/2015-09-06,5,1
2015-10-05/2015-10-11,7,1
2015-10-19/2015-10-25,7,1
