In [7]:
import pickle
import datetime
from dateutil.parser import parse

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [8]:
# Load the pickled dataset from disk into `df`.
# NOTE: unpickling can execute arbitrary code -- only open trusted files.
with open('all_data_desk_nohole.pkl', 'rb') as source_file:
    df = pickle.load(source_file)

# Show (rows, columns) as the cell output.
df.shape

(274606, 4)

In [9]:
# Sanity check: which months of 2018 actually appear in the data?
article_dates = df['date']
is_2018 = article_dates.apply(lambda d: d.year) == 2018
article_dates[is_2018].apply(lambda d: d.month).unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [10]:
# Discard every row that has a missing value in any column.  The result is
# rebound to `df` instead of using inplace=True, so the object that was loaded
# earlier is not mutated behind the reader's back.
df = df.dropna()

In [11]:
# There's some bad data in the snippets that shows up as NaN.  Drop every row
# with a missing value (rebinding instead of mutating in place).
df = df.dropna()

# News desks to keep for the analysis.
news_desk_use = ['Business',
                 'Foreign',
                 'NewsDesk',
                 'National',
                 'Politics',
                 'U.S.',
                 'U.S. / Politics',
                 'U.S. / Election 2016',
                 'Washington',
                 'World / Europe',
                 'World / Middle East',
                 'World / Asia Pacific',
                 'World / Africa',
                 'World / Americas']

# Gather the rows for each desk (in the order the desks are listed above) and
# concatenate once at the end.  Calling pd.concat inside the loop re-copies all
# rows accumulated so far on every iteration and starts from an empty frame,
# which recent pandas versions warn about.
desk_frames = []
rows_so_far = 0
for desk in news_desk_use:
    desk_mask = df['news_desk'] == desk
    desk_rows = df[desk_mask]
    desk_frames.append(desk_rows)
    rows_so_far += len(desk_rows)
    print(rows_so_far)  # running total of selected rows, one line per desk
df2 = pd.concat(desk_frames)

# One text field per article: headline followed by snippet.
df2['head_snip'] = df2['headline'] + ' ' + df2['snippet']

# Plain Python lists, aligned row-for-row with df2.
dates = list(df2['date'])
documents = list(df2['head_snip'])

df2.shape

# write data to pickle file
#with open('df2.pkl', 'wb') as fp:
#    pickle.dump(df2, fp)
    

26344
52680
53028
71798
75208
77468
78714
78983
82566
83233
83732
84202
84329
84582


(84582, 5)

In [12]:
# Export the date column and the combined headline/snippet column to CSV,
# one file per column.
csv_exports = (
    ('date', 'date.csv'),
    ('head_snip', 'documents.csv'),
)
for column_name, csv_path in csv_exports:
    df2[column_name].to_csv(csv_path)

In [None]:
# Reload previously saved dates and documents from disk.
# Each file is loaded into its own variable: assigning both loads to `df2`
# would let the second overwrite the first and would also clobber the
# DataFrame of the same name.
# NOTE(review): nothing in this notebook writes dates.pkl / documents.pkl;
# presumably they hold the same lists as `dates` and `documents` -- confirm.
# NOTE: unpickling can execute arbitrary code -- only open trusted files.
with open('dates.pkl', 'rb') as fp:
    dates = pickle.load(fp)
with open('documents.pkl', 'rb') as fp:
    documents = pickle.load(fp)

## LDA

In [13]:
# LDA is a probabilistic graphical model, so it is fit on raw term counts
# (CountVectorizer) rather than on re-weighted features.
tf_vectorizer = CountVectorizer(
            max_df = 0.95, # ignore terms that appear in more than 95% of documents
            min_df = 2,    # ignore terms that appear in fewer than 2 documents
            ngram_range=(1,3),  # unigrams, bigrams and trigrams
            stop_words='english')
tf = tf_vectorizer.fit_transform(documents)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back on older
# releases; either way tf_feature_names ends up as a plain list of strings.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = tf_vectorizer.get_feature_names_out().tolist()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()

In [14]:
# Configure the LDA estimator; it is only constructed here, not fitted.
no_topics = 20
lda_model = LatentDirichletAllocation(
    n_components=no_topics,
    learning_method='online',
    learning_offset=10.0,
    max_iter=15,
    random_state=0,
    n_jobs=-1,
)

In [None]:
# Save the LDA estimator to lda_model.pkl, or restore it from that file.
#   write = True   -> dump the current `lda_model` to disk
#   write = False  -> replace `lda_model` with the pickled copy
write = True

if write:
    with open('lda_model.pkl', 'wb') as model_file:
        pickle.dump(lda_model, model_file)
else:
    # Only unpickle files that come from a trusted source.
    with open('lda_model.pkl', 'rb') as model_file:
        lda_model = pickle.load(model_file)


In [15]:
# Fit the LDA estimator on the term-count matrix `tf`.
# NOTE(review): the output recorded for this cell is a KeyboardInterrupt, so
# the saved run was stopped before fitting finished.
# NOTE(review): scikit-learn's fit() presumably returns the estimator itself,
# which would make lda_model_fit an alias of lda_model -- confirm.
lda_model_fit = lda_model.fit(tf)



KeyboardInterrupt: 

In [None]:
# Save the fitted LDA estimator to lda_model_fit.pkl, or restore it from there.
#   write = True   -> dump the current `lda_model_fit` to disk
#   write = False  -> replace `lda_model_fit` with the pickled copy
write = True

if write:
    with open('lda_model_fit.pkl', 'wb') as fitted_file:
        pickle.dump(lda_model_fit, fitted_file)
else:
    # Only unpickle files that come from a trusted source.
    with open('lda_model_fit.pkl', 'rb') as fitted_file:
        lda_model_fit = pickle.load(fitted_file)


In [None]:
# Use the fitted estimator.  When the model is restored from
# lda_model_fit.pkl, only `lda_model_fit` is fitted; `lda_model` may still be
# the unfitted estimator, and transform()/components_ would fail on it.
lda_W = lda_model_fit.transform(tf)   # per-document topic weights
lda_H = lda_model_fit.components_     # per-topic term weights