In [1]:
import pandas as pd
import numpy as np
import pickle
import re
import guidedlda
import nltk
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['marijuana','weed','pot','cannabis'])

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from gensim import corpora, models, similarities, matutils
from pprint import pprint

# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
get_ipython().config.get('IPKernelApp', {})['parent_appname'] = ""

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

### Read in source data, clean and filter it

In [2]:
#read in scraped data
NYT_data = []

for i in range(1,9):
    filename = 'NYT_scrape_'+str(i)+'.pickle'
    with open(filename,'rb') as read_file:
        curr_data = pickle.load(read_file)
    for entry in curr_data:
        NYT_data.append(entry)

In [3]:
#define a function that will clean up the date formatting in the text
def fix_date(str):
    if len(str) >= 12:
        return str
    elif str.find('ago')>=0:
        return 'Feb. 18, 2019'
    else:
        return (str+', 2019')

In [4]:
df_NYT = pd.DataFrame(NYT_data, columns=['Section','Headline','Description','Date'])

df_NYT['Date'].replace({'March':'Mar.', 'April':'Apr.', 'May':'May.', 'June':'Jun.','July':'Jul.','Sept.':'Sep.'},
                      regex=True, inplace=True)

df_NYT['Date'] = df_NYT['Date'].map(lambda x: fix_date(x)) 

df_NYT['Date'] = pd.to_datetime(df_NYT.Date, format='%b. %d, %Y', errors='raise')
df_NYT.sort_values(by=['Date','Section','Headline'],inplace=True)
df_NYT.reset_index(inplace=True, drop=True)

In [5]:
df_NYT.describe()

Unnamed: 0,Section,Headline,Description,Date
count,21803.0,21803,21803.0,21803
unique,350.0,20384,20576.0,11362
top,,NEWS SUMMARY,,2007-06-23 00:00:00
freq,5342.0,167,405.0,16
first,,,,1926-11-20 00:00:00
last,,,,2019-02-18 00:00:00


In [6]:
#export data to JSON to visually examine using Tableau
#df_NYT.to_json(orient='table',path_or_buf='data.json')

In [7]:
df_NYT.head()

Unnamed: 0,Section,Headline,Description,Date
0,,MARIJUANA SMOKING IS REPORTED SAFE; Hemp Leave...,"PANAMA, Nov.",1926-11-20
1,,Government Will Ask States To Ban Growing of M...,,1931-09-15
2,,Dope Ring Specialized In Mexican Marijuana,Dope ring specializing in marijuana arrested,1933-12-02
3,,USE OF MARIJUANA SPREADING IN WEST; Poisonous ...,"DENVER, Sept. 13. -- Although as appalling in ...",1934-09-15
4,,RHODE ISLAND TO END WEED AS DRUG SOURCE; State...,"PROVIDENCE, R.I., Jan. 19. -Rhode Island autho...",1935-01-19


In [8]:
df_NYT['Full_Text'] = df_NYT['Headline']+' '+df_NYT['Description']
df_NYT.drop(columns=['Headline','Description'],inplace=True)

In [9]:
df_NYT['Work_Text'] = df_NYT['Full_Text'].str.lower()
df_NYT.head()

Unnamed: 0,Section,Date,Full_Text,Work_Text
0,,1926-11-20,MARIJUANA SMOKING IS REPORTED SAFE; Hemp Leave...,marijuana smoking is reported safe; hemp leave...
1,,1931-09-15,Government Will Ask States To Ban Growing of M...,government will ask states to ban growing of m...
2,,1933-12-02,Dope Ring Specialized In Mexican Marijuana Dop...,dope ring specialized in mexican marijuana dop...
3,,1934-09-15,USE OF MARIJUANA SPREADING IN WEST; Poisonous ...,use of marijuana spreading in west; poisonous ...
4,,1935-01-19,RHODE ISLAND TO END WEED AS DRUG SOURCE; State...,rhode island to end weed as drug source; state...


In [10]:
#keep only the articles with direct references to marijuana in the headline or description
filt1 = df_NYT['Work_Text'].str.contains('marijuana') | \
        df_NYT['Work_Text'].str.contains('weed') | \
        df_NYT['Work_Text'].str.contains('cannabis') | \
        df_NYT['Work_Text'].str.contains('pot.')

In [11]:
df_NYT = df_NYT[filt1]

In [12]:
filt2 = (df_NYT['Date'] >= '1966-01-01') & (df_NYT['Section'] != 'BRIEFING')
df_NYT = df_NYT[filt2]

In [13]:
df_NYT.reset_index(inplace=True, drop=True)
df_NYT.describe()

Unnamed: 0,Section,Date,Full_Text,Work_Text
count,4646.0,4646,4646,4646
unique,166.0,3664,4598,4597
top,,2007-06-23 00:00:00,A Critical Need For Medical Marijuana To the E...,a critical need for medical marijuana to the e...
freq,1584.0,15,9,9
first,,1966-01-09 00:00:00,,
last,,2019-02-18 00:00:00,,


In [14]:
df_NYT.head()

Unnamed: 0,Section,Date,Full_Text,Work_Text
0,,1966-01-09,Oklahoma Students Jailed; Raiders Seize Mariju...,oklahoma students jailed; raiders seize mariju...
1,,1966-01-18,Marijuana Inquiry Leads to Ousters From N.Y.U....,marijuana inquiry leads to ousters from n.y.u....
2,,1966-01-23,30 Persons Seized In Marijuana Raid On Penthou...,30 persons seized in marijuana raid on penthou...
3,,1966-02-09,9 STUDENTS JAILED IN NARCOTICS RAID; Marijuana...,9 students jailed in narcotics raid; marijuana...
4,,1966-02-10,2 Darien Youths Are Sentenced To 4 Months in M...,2 darien youths are sentenced to 4 months in m...


In [15]:
my_data = df_NYT['Work_Text'].tolist()

In [16]:
my_data[0]

'oklahoma students jailed; raiders seize marijuana norman, okla., jan. 9 (ap) fourteen persons, including 11 university of oklahoma students, were jailed tonight after a said in which the police confiscated marijuana and material opposing the vietnam war effort.'

In [17]:
#remove sentences that turn into empty strings after stopword removal: 261, 455, and 3642
my_data.pop(261)
my_data.pop(454)
my_data.pop(3640)

'marijuana disqualification 8:49 p.'

In [18]:
#my_data[2309]

### Pre-process the text data

In [19]:
#source: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

In [20]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(my_data))

print(data_words[:1])

[['oklahoma', 'students', 'jailed', 'raiders', 'seize', 'marijuana', 'norman', 'okla', 'jan', 'ap', 'fourteen', 'persons', 'including', 'university', 'of', 'oklahoma', 'students', 'were', 'jailed', 'tonight', 'after', 'said', 'in', 'which', 'the', 'police', 'confiscated', 'marijuana', 'and', 'material', 'opposing', 'the', 'vietnam', 'war', 'effort']]


In [21]:
#version 1
# Build the bigram model
# bigram = gensim.models.Phrases(data_words, min_count=5)

# bigram_mod = gensim.models.phrases.Phraser(bigram)

In [22]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in gensim.utils.simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    bigram = gensim.models.Phrases(texts, min_count=5)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return [bigram_mod[doc] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [23]:
#version 1
# Remove Stop Words
# data_words_nostops = remove_stopwords(data_words)

# # Form Bigrams
# data_words_bigrams = make_bigrams(data_words_nostops)

# # Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# # python3 -m spacy download en
# nlp = spacy.load('en', disable=['parser', 'ner'])

# # Do lemmatization keeping only noun, adj, vb, adv
# data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [24]:
# Remove stop words
data_words_nostops = remove_stopwords(data_words)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Remove stop words again (after lemmatization)
data_lem_nostops = remove_stopwords(data_lemmatized)

# Form Bigrams
data_words_bigrams = make_bigrams(data_lem_nostops)

In [25]:
print(data_words_bigrams[:1])

[['oklahoma', 'student', 'jail', 'raider', 'seize', 'norman', 'okla', 'jan', 'person', 'include', 'university', 'oklahoma', 'student', 'jail', 'tonight', 'say', 'police', 'confiscate', 'material', 'oppose', 'vietnam', 'war', 'effort']]


In [26]:
# Create Dictionary
id2word = corpora.Dictionary(data_words_bigrams)

# Create Corpus
texts = data_words_bigrams

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [27]:
# View
print(corpus[:2])
print(id2word[10])
print(len(id2word))

[[(0, 1), (1, 1), (2, 1), (3, 2), (4, 1), (5, 1), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 2), (16, 1), (17, 1), (18, 1), (19, 1)], [(11, 1), (15, 1), (17, 2), (20, 1), (21, 1), (22, 1), (23, 2), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1)]]
person
11419


In [28]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[261:262]]
len(corpus[261:262][0])

17

In [29]:
#find sentences with no words

for i, sentence in enumerate(corpus):
    if len(sentence) == 0:
        print(i)

### LDA model

In [30]:
# Build LDA model

lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=3, 
                                            update_every=1,
                                            random_state=25,
                                            chunksize=1000,
                                            passes=30,
                                            alpha='auto',
                                            per_word_topics=True)

In [31]:
# Print the keywords in the topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.017*"state" + 0.009*"medical" + 0.008*"law" + 0.007*"new" + '
  '0.007*"california" + 0.007*"legalize" + 0.007*"legal" + 0.006*"use" + '
  '0.006*"say" + 0.005*"colorado"'),
 (1,
  '0.022*"arrest" + 0.015*"drug" + 0.014*"charge" + 0.013*"police" + '
  '0.007*"man" + 0.007*"seize" + 0.007*"today" + 0.006*"yesterday" + '
  '0.006*"find" + 0.006*"possession"'),
 (2,
  '0.021*"drug" + 0.019*"use" + 0.008*"medical" + 0.008*"say" + 0.006*"smoke" '
  '+ 0.005*"new" + 0.005*"report" + 0.004*"study" + 0.004*"editor" + '
  '0.004*"may"')]


In [32]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words_bigrams, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.1645511402798

Coherence Score:  0.2962718116551411


In [33]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis_lda = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis_lda

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [34]:
#identify dominant topic for each article 

def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=df_NYT['Full_Text'].tolist()):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row[0], key=lambda x: x[1], reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=df_NYT['Full_Text'].tolist())

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Article_Num', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

Unnamed: 0,Article_Num,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,1.0,0.6655,"arrest, drug, charge, police, man, seize, toda...",Oklahoma Students Jailed; Raiders Seize Mariju...
1,1,1.0,0.7052,"arrest, drug, charge, police, man, seize, toda...",Marijuana Inquiry Leads to Ousters From N.Y.U....
2,2,1.0,0.9827,"arrest, drug, charge, police, man, seize, toda...",30 Persons Seized In Marijuana Raid On Penthou...
3,3,1.0,0.8237,"arrest, drug, charge, police, man, seize, toda...",9 STUDENTS JAILED IN NARCOTICS RAID; Marijuana...
4,4,1.0,0.9263,"arrest, drug, charge, police, man, seize, toda...",2 Darien Youths Are Sentenced To 4 Months in M...
5,5,1.0,0.9852,"arrest, drug, charge, police, man, seize, toda...",3 HELD ON COAST WITH MARIJUANA; Seized Shipmen...
6,6,1.0,0.971,"arrest, drug, charge, police, man, seize, toda...",Marijuana Seized in Baltimore customs seizes 4...
7,7,2.0,0.7363,"drug, use, medical, say, smoke, new, report, s...",MARIJUANA SEEN AS MEDICAL BOON; Hopes Stirred ...
8,8,1.0,0.8909,"arrest, drug, charge, police, man, seize, toda...",Former Harvard Teacher Sent To Prison on Marij...
9,9,1.0,0.8456,"arrest, drug, charge, police, man, seize, toda...",Leary Lawyer Gives Plans For Marijuana Appeals...


In [35]:
df_dominant_topic.groupby(['Dominant_Topic']).Article_Num.count()

Dominant_Topic
0.0    1463
1.0    1484
2.0    1696
Name: Article_Num, dtype: int64

### Guided LDA

In [36]:
#define seeds for Crime, Business, and Health topics
seed_topic_list = [['arrest', 'charge', 'police', 'sentence', 'crime','underground','smuggle','seize','prosecute'],
                   ['investment', 'company', 'industry', 'tax', 'market', 'startup', 'entrepreneur', 'recreational','business'],
                   ['medicine', 'pain', 'medical', 'research', 'scientist', 'prescribe','health','cancer']]

In [62]:
GLDA_model = guidedlda.GuidedLDA(n_topics=4, n_iter=100, random_state=25, refresh=20)

seed_topics = {}
for t_id, st in enumerate(seed_topic_list):
    for word in st:
        seed_topics[list(id2word.keys())[list(id2word.values()).index(word)]] = t_id

vocab = []
for i in range(len(id2word)):
    vocab.append(id2word[i])       

strings = [' '.join(text) for text in texts]

In [63]:
vectorizer = CountVectorizer(vocabulary=vocab)
vectorizer.fit(strings)
X = vectorizer.transform(strings)

In [64]:
X.shape

(4643, 11419)

In [65]:
GLDA_model.fit(X, seed_topics=seed_topics, seed_confidence=0.5)

n_top_words = 15
topic_word = GLDA_model.topic_word_
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

Topic 0: arrest drug charge police seize today yesterday man find possession case say hold year_old raid
Topic 1: smoke new today grow say high drug find city legal get business make spot california
Topic 2: use drug medical state law say new legalize legal editor may federal would study make
Topic 3: drug use law say state today medical new smoke report arrest possession penalty call city


In [41]:
GLDA_output = []
for doc in GLDA_model.doc_topic_:
    GLDA_output.append(np.argmax(doc))

df_GLDA = pd.Series(GLDA_output).to_frame('Guided LDA')

### LSA

In [42]:
# Build LSA model

LSA_model = gensim.models.lsimodel.LsiModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=10
                                           )

In [43]:
# Print the keywords in the topics
pprint(LSA_model.print_topics())
doc_LSA = LSA_model[corpus]
labels = [e[:30]+"..." for e in df_NYT['Full_Text'].tolist()]

[(0,
  '0.767*"drug" + 0.188*"use" + 0.156*"treatment" + 0.152*"alcohol" + '
  '0.124*"year" + 0.121*"welfare" + 0.098*"state" + 0.096*"child" + '
  '0.084*"care" + 0.077*"arrest"'),
 (1,
  '0.458*"use" + 0.296*"arrest" + 0.267*"state" + 0.228*"say" + '
  '0.188*"medical" + -0.188*"drug" + 0.173*"charge" + 0.173*"police" + '
  '0.165*"today" + -0.141*"treatment"'),
 (2,
  '-0.557*"arrest" + 0.541*"use" + -0.293*"charge" + -0.258*"police" + '
  '0.220*"medical" + 0.119*"state" + -0.104*"man" + -0.104*"yesterday" + '
  '-0.103*"seize" + 0.093*"law"'),
 (3,
  '-0.604*"state" + 0.508*"use" + -0.269*"law" + -0.247*"medical" + '
  '0.180*"drug" + -0.113*"new" + -0.108*"legalize" + 0.084*"arrest" + '
  '-0.083*"california" + 0.077*"charge"'),
 (4,
  '0.401*"medical" + -0.398*"say" + 0.390*"arrest" + -0.294*"drug" + '
  '0.240*"use" + -0.161*"today" + 0.151*"treatment" + 0.127*"alcohol" + '
  '-0.124*"state" + 0.119*"welfare"'),
 (5,
  '-0.526*"say" + 0.364*"drug" + 0.269*"medical" + 0.192*"la

In [44]:
num_topics = len(LSA_model.print_topics())
labels = np.asarray(['PC '+str(i) for i in range(num_topics)])

In [45]:
df_LSA = pd.DataFrame(columns=labels)

for i,row in enumerate(doc_LSA):
    fixed_row = []
    for k in range(num_topics):
        try:
            fixed_row.append(doc_LSA[i][k][1])
        except: print('i=',i,' row=',row)
    df_row_to_add = pd.DataFrame([fixed_row], columns=labels)
    df_LSA = df_LSA.append(df_row_to_add, ignore_index=True) 

In [46]:
df_LSA[df_LSA['PC 0']>50]

Unnamed: 0,PC 0,PC 1,PC 2,PC 3,PC 4,PC 5,PC 6,PC 7,PC 8,PC 9
2309,67.092986,-19.511069,0.727931,-3.748688,7.324954,-5.861136,-2.405971,1.830421,0.329802,1.065923


In [47]:
#sns.pairplot(df_LSA)

In [48]:
km = KMeans(n_clusters=5,random_state=10)

In [49]:
km.fit(df_LSA)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=5, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=10, tol=0.0001, verbose=0)

In [50]:
km.cluster_centers_

array([[ 2.06957547e-01,  3.10135472e-01, -1.90711202e-02,
        -2.46936755e-01, -4.37953201e-02, -1.46506335e-01,
         9.65963130e-02,  1.24729267e-01, -8.81192761e-02,
        -2.97878267e-02],
       [ 6.70929856e+01, -1.95110685e+01,  7.27931290e-01,
        -3.74868753e+00,  7.32495393e+00, -5.86113593e+00,
        -2.40597125e+00,  1.83042118e+00,  3.29802323e-01,
         1.06592341e+00],
       [ 7.62448611e-01,  9.53984710e-01,  7.60308044e-01,
         4.06540341e-01,  1.95422000e-01, -4.91249226e-02,
        -1.23824355e-01,  3.59687947e-02,  1.66832121e-02,
         2.81392509e-02],
       [ 5.45845600e-01,  8.18229193e-01, -1.11269382e+00,
         9.11582334e-02,  4.10566131e-01,  2.00480579e-02,
        -1.02716129e-01, -1.81230223e-01,  1.84153819e-01,
        -6.13459620e-02],
       [ 1.36941812e+00,  2.23958867e-01, -1.53461843e-01,
         4.59802917e-02, -4.92802947e-01,  3.52045288e-01,
         2.09659574e-01, -5.38309074e-02, -6.16975085e-02,
        -2.

In [51]:
len(km.labels_)

4643

In [52]:
df_LSA = pd.Series(km.labels_).to_frame('LSA+KMeans')

### Transfer all outputs to Tableau for visualization

In [66]:
#LDA
df_output = df_NYT.join(df_dominant_topic[['Dominant_Topic','Topic_Perc_Contrib']])
df_output.rename(columns={'Dominant_Topic':'LDA'}, inplace=True)

In [67]:
#Guided LDA
df_output = df_output.join(df_GLDA)

In [68]:
#LSA + KMeans Clustering
df_output = df_output.join(df_LSA)

In [69]:
df_output.drop(columns=['Work_Text'],inplace=True)
df_output.head()

Unnamed: 0,Section,Date,Full_Text,LDA,Topic_Perc_Contrib,Guided LDA,LSA+KMeans
0,,1966-01-09,Oklahoma Students Jailed; Raiders Seize Mariju...,1.0,0.6655,3.0,0.0
1,,1966-01-18,Marijuana Inquiry Leads to Ousters From N.Y.U....,1.0,0.7052,0.0,2.0
2,,1966-01-23,30 Persons Seized In Marijuana Raid On Penthou...,1.0,0.9827,0.0,0.0
3,,1966-02-09,9 STUDENTS JAILED IN NARCOTICS RAID; Marijuana...,1.0,0.8237,0.0,0.0
4,,1966-02-10,2 Darien Youths Are Sentenced To 4 Months in M...,1.0,0.9263,0.0,3.0


In [70]:
df_output.to_csv(path_or_buf ='df_output.csv', index=False)