# LDA Analysis

Use LDA to take a dataset of documents and sort them into buckets.

Use K Means to group similar buckets into clusters.

Plot the fraction of documents in each cluster over time.

In [None]:
import pickle
import datetime
from dateutil.parser import parse

import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF

import numpy as np


import nltk
from nltk.tokenize import sent_tokenize
#from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# gensim
from gensim import corpora, models, similarities, matutils
# sklearn
from sklearn import datasets
#from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import matplotlib.pyplot as plt

#nltk.download()

In [None]:
# Load the article DataFrame that was saved with pickle.
# NOTE(review): unpickling can execute arbitrary code stored in the file, so
# this must only ever be pointed at a trusted, locally produced file.
with open('data/all_data_desk_nohole.pkl', mode='rb') as pickle_file:
    df = pickle.load(pickle_file)

# notebook output: (rows, columns) of the loaded frame
df.shape

In [None]:
# Rank the news desks by their 'date' count (largest first) to help choose
# which desks to include; only positions 61-119 of the ranking are displayed.
foo = (
    df.groupby('news_desk')
      .count()
      .sort_values(by=['date'], ascending=False)
)
foo.iloc[61:120]

In [None]:
# There is some bad data in the snippets: drop every row that contains a NaN.
df.dropna(inplace=True)

# News desks to include in the analysis.
news_desk_use = ['Business',
                 'Foreign',
                 'NewsDesk',
                 'National',
                 'Politics',
                 'U.S.',
                 'U.S. / Politics',
                 'U.S. / Election 2016',
                 'Washington',
                 'World / Europe',
                 'World / Middle East',
                 'World / Asia Pacific',
                 'World / Africa',
                 'World / Americas']

# Select the rows desk by desk (so the result stays grouped in the order of
# news_desk_use) and concatenate once.  Growing the frame with pd.concat inside
# a loop would re-copy all accumulated rows on every iteration.
df2 = pd.concat([df[df['news_desk'] == desk] for desk in news_desk_use])

# pull a fraction of the dataset for testing
#df2 = df2.sample(200)

# Combine headline and snippet into one text column (more words per document).
df2['head_snip'] = df2['headline'] + ' ' + df2['snippet']

# Plain lists, both in df2 row order, so position k in `dates` and position k
# in `documents` describe the same article.
dates = list(df2['date'])
documents = list(df2['head_snip'])

df2.shape

# write data to pickle file
#with open('data/df2.pkl', 'wb') as fp:
#    pickle.dump(df2, fp)
    

In [None]:
# eyeball ten randomly chosen rows of the filtered frame
df2.sample(10)

In [None]:
# Export the two columns needed downstream as CSV files for transfer to AWS.
for column, csv_path in (('date', 'data/date.csv'),
                         ('head_snip', 'data/documents.csv')):
    df2[column].to_csv(csv_path)

In [None]:
# Read the exported columns back from CSV (on AWS).
# The values are taken from the second column of each file; the first column
# is presumably the index that Series.to_csv wrote alongside them.
date_df = pd.read_csv('data/date.csv')
documents_df = pd.read_csv('data/documents.csv')
date = date_df.iloc[:, 1].tolist()
# The rest of the notebook reads `dates` and calls .year / .month /
# .isocalendar() on its elements, which the raw CSV strings do not offer, so
# parse them back into timestamps and bind them to that name as well.
dates = pd.to_datetime(date_df.iloc[:, 1]).tolist()
documents = documents_df.iloc[:, 1].tolist()

In [None]:
# per-column non-null counts for each news desk kept in df2
df2.groupby('news_desk').count()

## functions

In [None]:
def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    """Print the strongest words and documents of every topic.

    H is walked row by row (one row per topic, indexed by feature) and W is
    read column-wise as W[:, topic] (indexed by document).  For each topic the
    names of the ``no_top_words`` largest entries of its H row are printed on
    one line, followed by the ``no_top_documents`` documents that have the
    largest values in that topic's column of W.  Nothing is returned.
    """
    for topic_idx, weights in enumerate(H):
        print(f"Topic {topic_idx}")

        # feature indices ordered from largest to smallest weight
        strongest_features = weights.argsort()[::-1][:no_top_words]
        print(" ".join(feature_names[i] for i in strongest_features))

        # document indices ordered from largest to smallest weight
        document_ranking = np.argsort(W[:, topic_idx])[::-1]
        for doc_index in document_ranking[0:no_top_documents]:
            print('document with index', doc_index)
            print(documents[doc_index])
        print(' ')

In [None]:
def _lda_topic_share_table(topic_ids, n_topics, dates, timeframe):
    """Tabulate, per time period, the share of documents assigned to each topic.

    Private helper of plot_topic_trend_lda.

    Parameters
    ----------
    topic_ids : sequence of int
        Topic index of every document.  Indices outside 0..n_topics-1 are
        skipped and reported in a printed summary.
    n_topics : int
        Number of topics to tabulate.
    dates : sequence
        One date-like object per document.  'month' needs ``.year`` and
        ``.month``; 'week' needs ``.isocalendar()`` and ``.toordinal()``.
    timeframe : {'month', 'week'}
        Width of one time period.

    Returns
    -------
    pandas.DataFrame
        Columns 'topic', 'date' (period number) and 'num_docs' (fraction of
        the period's documents that fall into the topic), one row for every
        (topic, period) pair, labelled ``span * topic + period``.

    Raises
    ------
    ValueError
        If ``timeframe`` is not 'week' or 'month', if ``dates`` is shorter
        than ``topic_ids``, or (from min/max) if ``dates`` is empty.
    """
    if timeframe not in ('week', 'month'):
        raise ValueError(f"timeframe must be 'week' or 'month', not {timeframe!r}")
    if len(dates) < len(topic_ids):
        raise ValueError('dates must provide one entry per document')

    first_date, last_date = min(dates), max(dates)

    if timeframe == 'month':
        # Month 1 is January of the earliest year and later years keep
        # counting (13 is January of the following year, and so on).
        def period_of(day):
            return (day.year - first_date.year) * 12 + day.month
    else:
        # Weeks run Monday to Sunday like ISO weeks.  Ordinal day 1 is a
        # Monday, so (ordinal - 1) // 7 is a gap-free running week counter
        # that stays correct across ISO years with 53 weeks.  It is anchored
        # so that the earliest date keeps its ISO week number.
        first_week = first_date.isocalendar()[1]
        first_running_week = (first_date.toordinal() - 1) // 7

        def period_of(day):
            return first_week + (day.toordinal() - 1) // 7 - first_running_week

    start = period_of(first_date)
    stop = period_of(last_date) + 1
    span = stop - start
    print(f'{timeframe}s in dataframe', start, stop, span)

    # counts[topic, period - start] = number of documents
    counts = np.zeros((n_topics, span))
    skipped = 0
    for topic, day in zip(topic_ids, dates):
        topic = int(topic)
        if 0 <= topic < n_topics:
            counts[topic, period_of(day) - start] += 1
        else:
            skipped += 1
    if skipped:
        print(skipped, 'documents skipped: topic index out of range')

    # Normalise to the fraction of each period's documents.  A period without
    # documents has only zero counts, so dividing by 1 leaves them at zero.
    totals = counts.sum(axis=0)
    totals[totals == 0] = 1
    shares = counts / totals

    topic_col = np.repeat(np.arange(n_topics), span)
    period_col = np.tile(np.arange(start, stop), n_topics)
    return pd.DataFrame(
        {'topic': topic_col, 'date': period_col, 'num_docs': shares.ravel()},
        index=span * topic_col + period_col,
    )


def plot_topic_trend_lda(H, W, documents, dates, timeframe='month'):
    """Plot, for every LDA topic, its share of the documents over time.

    Every document is counted towards the topic with the largest value in its
    row of W.  The counts are grouped per month or per week, converted to the
    fraction of that period's documents, and drawn as one figure per topic.

    Parameters
    ----------
    H : sequence
        Only ``len(H)`` is used, as the number of topics.
    W : array
        Indexed as ``W[doc]``; ``W.shape[0]`` is the number of documents.
    documents : sequence
        Unused; kept so existing calls keep working.
    dates : sequence
        One date-like object per document, in the same order as the rows of W.
    timeframe : {'month', 'week'}, optional
        Width of one time period (default 'month').

    Returns
    -------
    None
    """
    n_topics = len(H)
    topic_ids = [W[doc_idx].argmax() for doc_idx in range(W.shape[0])]
    table = _lda_topic_share_table(topic_ids, n_topics, dates, timeframe)

    for topic in range(n_topics):
        rows = table[table['topic'] == topic]
        plt.figure()
        plt.plot(rows['date'], rows['num_docs'], label=f'Topic {topic}')
        plt.legend(loc='upper left')

In [None]:
# write plot data to file for import into Tableau
# NOTE(review): plot_data_all is not assigned at module level anywhere in the
# visible notebook; presumably it has to hold the DataFrame returned by
# plot_topic_trend_ldaKM -- confirm and capture that result before running this.
plot_data_all.to_csv('data/plot_data_all_month.csv')

In [None]:
def _km_cluster_share_table(cluster_ids, n_clusters, dates, timeframe):
    """Tabulate, per time period, the share of documents in each cluster.

    Private helper of plot_topic_trend_ldaKM.

    Parameters
    ----------
    cluster_ids : sequence of int
        Cluster number of every document.  Numbers outside 0..n_clusters-1
        are skipped and reported in a printed summary.
    n_clusters : int
        Number of clusters to tabulate.
    dates : sequence
        One date-like object per document.  'month' needs ``.year`` and
        ``.month``; 'week' needs ``.isocalendar()`` and ``.toordinal()``.
    timeframe : {'month', 'week'}
        Width of one time period.

    Returns
    -------
    pandas.DataFrame
        Columns 'topic' (cluster number), 'date' (period number) and
        'num_docs' (fraction of the period's documents that fall into the
        cluster), one row for every (cluster, period) pair, labelled
        ``span * cluster + period``.

    Raises
    ------
    ValueError
        If ``timeframe`` is not 'week' or 'month', if ``dates`` is shorter
        than ``cluster_ids``, or (from min/max) if ``dates`` is empty.
    """
    if timeframe not in ('week', 'month'):
        raise ValueError(f"timeframe must be 'week' or 'month', not {timeframe!r}")
    if len(dates) < len(cluster_ids):
        raise ValueError('dates must provide one entry per document')

    first_date, last_date = min(dates), max(dates)

    if timeframe == 'month':
        # Month 1 is January of the earliest year and later years keep
        # counting (13 is January of the following year, and so on).
        def period_of(day):
            return (day.year - first_date.year) * 12 + day.month
    else:
        # Weeks run Monday to Sunday like ISO weeks.  Ordinal day 1 is a
        # Monday, so (ordinal - 1) // 7 is a gap-free running week counter
        # that stays correct across ISO years with 53 weeks.  It is anchored
        # so that the earliest date keeps its ISO week number.
        first_week = first_date.isocalendar()[1]
        first_running_week = (first_date.toordinal() - 1) // 7

        def period_of(day):
            return first_week + (day.toordinal() - 1) // 7 - first_running_week

    start = period_of(first_date)
    stop = period_of(last_date) + 1
    span = stop - start
    print(f'{timeframe}s in dataframe', start, stop, span)

    # counts[cluster, period - start] = number of documents
    counts = np.zeros((n_clusters, span))
    skipped = 0
    for cluster, day in zip(cluster_ids, dates):
        cluster = int(cluster)
        if 0 <= cluster < n_clusters:
            counts[cluster, period_of(day) - start] += 1
        else:
            skipped += 1
    if skipped:
        print(skipped, 'documents skipped: cluster number out of range')

    # Normalise to the fraction of each period's documents.  A period without
    # documents has only zero counts, so dividing by 1 leaves them at zero.
    totals = counts.sum(axis=0)
    totals[totals == 0] = 1
    shares = counts / totals

    cluster_col = np.repeat(np.arange(n_clusters), span)
    period_col = np.tile(np.arange(start, stop), n_clusters)
    return pd.DataFrame(
        {'topic': cluster_col, 'date': period_col, 'num_docs': shares.ravel()},
        index=span * cluster_col + period_col,
    )


def plot_topic_trend_ldaKM(groups, predict, dates, timeframe='month'):
    """Plot, for every K-Means cluster, its share of the documents over time.

    The documents are grouped per month or per week, the count of each
    cluster is converted to the fraction of that period's documents, and one
    figure is drawn per cluster.

    Parameters
    ----------
    groups : int
        Number of groups (clusters).
    predict : sequence of int
        Group number of every document.
    dates : sequence
        Date-like objects matching ``predict`` element by element.
    timeframe : {'month', 'week'}, optional
        Width of one time period (default 'month').

    Returns
    -------
    pandas.DataFrame
        The plotted table: columns 'topic', 'date' and 'num_docs', one row
        per (cluster, period) pair.
    """
    plot_data_all = _km_cluster_share_table(list(predict), groups, dates, timeframe)

    for topic in range(groups):
        rows = plot_data_all[plot_data_all['topic'] == topic]
        plt.figure()
        plt.plot(rows['date'], rows['num_docs'], label=f'Topic {topic}')
        plt.legend(loc='upper left')

    return plot_data_all



## LDA

In [None]:
# LDA is a probabilistic graphical model over word counts, so it is fed raw
# term counts (CountVectorizer) rather than tf-idf weights.
tf_vectorizer = CountVectorizer(
            max_df = 0.95, # leave out terms that occur in more than 95% of docs
            min_df = 2,    # leave out terms that occur in fewer than 2 docs
            ngram_range=(1,3),
            stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
# Newer scikit-learn releases dropped get_feature_names() in favour of
# get_feature_names_out(); prefer the new method and fall back to the old one
# so the cell runs on either side of that change.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = tf_vectorizer.get_feature_names()

In [None]:
#print(len(tf_feature_names))

In [None]:
# LatentDirichletAllocation is used below, so import it in this cell.
from sklearn.decomposition import LatentDirichletAllocation

no_topics = 20

# Run LDA (online learning) on the term counts.
lda_model = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=15,
    n_jobs=-1,
    learning_method='online',
    learning_offset=10.,
    random_state=0,
).fit(tf)
# per-document topic weights, used below for distances and clustering
lda_W = lda_model.transform(tf)
# fitted topic components of the model
lda_H = lda_model.components_

## distances in LDA output

In [None]:
from sklearn.metrics.pairwise import cosine_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

# Compare document 0 with every document in LDA topic space.
mat = lda_W

print(0, documents[0])
print(' ')

print('TESTING WITH COSINE SIMILARITY:')
dist = cosine_similarity(mat[0:1], mat)
# Entry 0 is document 0 compared with itself, so search from position 1 on;
# the +1 turns the position inside the slice back into a document index.
max_sim_index = int(np.argmax(dist[0][1:])) + 1
print('cosine similarity = ', dist[0][max_sim_index])
print(max_sim_index, documents[max_sim_index])
print(' ')
# every document that is almost identical in direction (includes document 0)
for i, similarity in enumerate(dist[0]):
    if similarity >= 0.99:
        print(similarity, i)
        print(documents[i])
        print(' ')

print('TESTING WITH EUCLIDEAN DISTANCE:')
dist = euclidean_distances(mat[0:1], mat)
# same slice-offset correction as above
min_sim_index = int(np.argmin(dist[0][1:])) + 1
print('euclidean distance = ', dist[0][min_sim_index])
print(min_sim_index, documents[min_sim_index])
# every document within 0.1 of document 0 (includes document 0)
for i, distance in enumerate(dist[0]):
    if distance <= 0.1:
        print(distance, i)
        print(documents[i])
        print(' ')


## K Means

In [None]:
# Elbow scan: fit K-Means on the LDA document weights for 1..50 clusters and
# record the inertia of every fit, keyed by the number of clusters.
output = {}
for c in range(1, 51):
    print(f'trying {c} clusters')
    km = KMeans(n_clusters=c, random_state=10, n_init=1).fit(lda_W)
    output[c] = km.inertia_

# inertia against number of clusters
plt.plot(output.keys(), output.values())


In [None]:
# Final clustering with the chosen number of clusters.
c = 15
km = KMeans(n_clusters=c, random_state=10, n_init=1).fit(lda_W)
# cluster number of every document
predict = km.predict(lda_W)

In [None]:
# sanity check: number of centroids held by the fitted model
len(km.cluster_centers_)

In [None]:
# Plot the trend over time for each cluster.  The returned table is kept in
# plot_data_all because the CSV-export cell reads it from that module-level name.
plot_data_all = plot_topic_trend_ldaKM(c, predict, dates)
plot_data_all

In [None]:
# Print, for every K-Means cluster, up to 8 of its articles that lie closest
# to the cluster centroid.

from sklearn.metrics.pairwise import euclidean_distances

centers = np.array(km.cluster_centers_)

mat = lda_W

# Distance from every centroid (rows) to every document (columns), computed
# once.  Passing the centroids as they are works for any number of LDA topics.
centroid_dist = euclidean_distances(centers, mat)
cluster_labels = np.asarray(predict)

for i in range(c):
    print(f'K Means Cluster {i}')
    dist = centroid_dist[i]
    # smallest distance of any document to this centroid
    min_dist = dist.min()

    # members of this cluster that are within 30% of that smallest distance
    close_docs = np.flatnonzero((cluster_labels == i) & (dist < 1.3 * min_dist))
    # closest first; the stable sort keeps document order among equal distances
    close_docs = close_docs[np.argsort(dist[close_docs], kind='stable')]

    for pt in close_docs[:8]:
        print(dist[pt], str(dates[pt].month)+'/'+str(dates[pt].year), documents[pt])
    if len(close_docs) < 8:
        print('no more in list')

    print(" ")
