In [1]:
import datetime as dt
import os
import sys

import numpy as np
import pandas as pd
from scipy import interp
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, auc, confusion_matrix, roc_curve, average_precision_score, precision_recall_curve
from sklearn.model_selection import StratifiedKFold
import sqlalchemy as sa
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
%matplotlib inline

sys.path.append('../')
from utilities import sql_utils as su
from utilities import model_eval_utils as meu

# Connection string for the data warehouse is read from the MIMIC_DWH environment variable.
DWH = os.environ.get('MIMIC_DWH')
engine = create_engine(DWH)

# Raise the pandas display limits and render floats with three decimals.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

  """)


LDA Implementation Tutorial: https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

LDA Intuition Guide: https://towardsdatascience.com/light-on-math-machine-learning-intuitive-guide-to-latent-dirichlet-allocation-437c81220158

LDA Description: http://blog.echen.me/2011/08/22/introduction-to-latent-dirichlet-allocation/

In [7]:
# Location of the ABC News headlines CSV used throughout this notebook.
headlines_csv_path = '~/git/GeorgiaTech/cse6250/bigbox/project/data/tutorial/abcnews-date-text.csv'
try:
    # pandas >= 1.3: drop malformed rows and emit a warning for each one. This mirrors
    # `error_bad_lines=False` (with its default warning behaviour), which was removed in pandas 2.0.
    df = pd.read_csv(headlines_csv_path, on_bad_lines='warn')
except TypeError:
    # Older pandas does not accept `on_bad_lines`; fall back to the legacy keyword.
    df = pd.read_csv(headlines_csv_path, error_bad_lines=False)

In [8]:
# Preview the first five rows of the raw headlines frame.
df.head(5)

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [11]:
# Take an explicit copy of the headline column so the column added below is written
# to an independent frame rather than to a selection of `df` (avoids pandas'
# SettingWithCopyWarning).
data_text = df[['headline_text']].copy()
# Keep each row's index label as a regular column so documents can be looked up by it later.
data_text['index'] = data_text.index

In [12]:
# Preview the headline text together with the new `index` column.
data_text.head(5)

Unnamed: 0,headline_text,index
0,aba decides against community broadcasting lic...,0
1,act fire witnesses must be aware of defamation,1
2,a g calls for infrastructure protection summit,2
3,air nz staff in aust strike for pay rise,3
4,air nz strike to affect australian travellers,4


In [13]:
# `documents` is a second name for the same DataFrame object as `data_text` (no copy is made).
documents = data_text

In [14]:
# Preview the first five documents.
documents.head(5)

Unnamed: 0,headline_text,index
0,aba decides against community broadcasting lic...,0
1,act fire witnesses must be aware of defamation,1
2,a g calls for infrastructure protection summit,2
3,air nz staff in aust strike for pay rise,3
4,air nz strike to affect australian travellers,4


In [15]:
# Report the number of documents, then show the first five rows as plain text.
print(documents.shape[0])
print(documents.head(5))

1103665
                                       headline_text  index
0  aba decides against community broadcasting lic...      0
1     act fire witnesses must be aware of defamation      1
2     a g calls for infrastructure protection summit      2
3           air nz staff in aust strike for pay rise      3
4      air nz strike to affect australian travellers      4


## Data Preprocessing

1. Tokenization: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
2. Words with 3 or fewer characters are removed.
3. All stopwords are removed.
4. Words are lemmatized — words in third person are changed to first person and verbs in past and future tenses are changed into present.
5. Words are stemmed — words are reduced to their root form.

### Loading gensim and nltk libraries

In [39]:
# Text-processing dependencies: gensim for tokenization/stopwords, nltk for lemmatizing and stemming.
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
# NOTE(review): star import brings every public name of nltk.stem.porter into the namespace;
# nothing visible in this notebook appears to need it — confirm before relying on it.
from nltk.stem.porter import *
import numpy as np
# Seed NumPy's global random number generator.
np.random.seed(2018)
import nltk
# Fetch the WordNet data used by WordNetLemmatizer (reports "already up-to-date" when present).
nltk.download('wordnet')

# Shared English Snowball stemmer used by `lemmatize_stemming`.
stemmer = SnowballStemmer('english')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/VincentLa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [40]:
def lemmatize_stemming(text):
    """
    Reduce a single token to its stemmed lemma.

    The token is first lemmatized as a verb with WordNet (``pos='v'``), and the
    resulting lemma is then passed through the module-level ``stemmer``.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

In [41]:
def preprocess(text):
    """
    Turn raw text into a list of normalized tokens.

    The text is tokenized with gensim's ``simple_preprocess``. Tokens that are
    in gensim's ``STOPWORDS`` or that have 3 or fewer characters are dropped,
    and each remaining token is passed through ``lemmatize_stemming``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(candidate)
        for candidate in gensim.utils.simple_preprocess(text)
        if len(candidate) > 3 and candidate not in stopwords
    ]

In [42]:
# Select the headline by column name instead of by position in `.values`, so the
# lookup does not depend on the order of the frame's columns.
doc_sample = documents.loc[documents['index'] == 4310, 'headline_text'].iloc[0]
print('original document: ')

# Plain split on single spaces, shown for comparison with the preprocessed tokens below.
words = doc_sample.split(' ')
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['rain', 'helps', 'dampen', 'bushfires']


 tokenized and lemmatized document: 
['rain', 'help', 'dampen', 'bushfir']


Preprocess the headline text, saving the results as 'processed_docs'

In [43]:
# Run `preprocess` over every headline, then preview the first ten token lists.
headline_series = documents['headline_text']
processed_docs = headline_series.map(preprocess)
processed_docs.head(10)

0            [decid, communiti, broadcast, licenc]
1                               [wit, awar, defam]
2           [call, infrastructur, protect, summit]
3                      [staff, aust, strike, rise]
4             [strike, affect, australian, travel]
5               [ambiti, olsson, win, tripl, jump]
6           [antic, delight, record, break, barca]
7    [aussi, qualifi, stosur, wast, memphi, match]
8            [aust, address, secur, council, iraq]
9                         [australia, lock, timet]
Name: headline_text, dtype: object

## Bag of Words on the Data set
Create a dictionary from ‘processed_docs’ containing the number of times a word appears in the training set.

In [44]:
dictionary = gensim.corpora.Dictionary(processed_docs)
# Print the first eleven (token id, token) entries as a sanity check.
for position, (token_id, token) in enumerate(dictionary.iteritems()):
    if position > 10:
        break
    print(token_id, token)

0 broadcast
1 communiti
2 decid
3 licenc
4 awar
5 defam
6 wit
7 call
8 infrastructur
9 protect
10 summit


### Gensim filter_extremes
Filter out tokens that appear in

1. fewer than 15 documents (absolute number), or
2. more than 50% of documents (a fraction of the total corpus size, not an absolute number).
3. After the above two steps, keep only the 100000 most frequent tokens.

In [46]:
# Prune the vocabulary in place using the document-count, document-fraction and size limits below.
dictionary.filter_extremes(
    no_below=15,
    no_above=0.5,
    keep_n=100000,
)

## Gensim doc2bow (Bag of Words)
For each document we create a bag-of-words representation: a list of (token id, count) pairs reporting which
words appear and how many times each appears. Save this to ‘bow_corpus’, then check the document we selected earlier.

In [47]:
# Convert every preprocessed document into its bag-of-words vector.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))
bow_corpus[4310]

[(76, 1), (112, 1), (483, 1), (4014, 1)]

In [48]:
bow_doc_4310 = bow_corpus[4310]
# Each entry is a (token id, count) pair; the id is resolved to its token via `dictionary`.
for token_id, token_count in bow_doc_4310:
    print("Word {} (\"{}\") appears {} time.".format(token_id, dictionary[token_id], token_count))

Word 76 ("bushfir") appears 1 time.
Word 112 ("help") appears 1 time.
Word 483 ("rain") appears 1 time.
Word 4014 ("dampen") appears 1 time.


## TF IDF 
Create tf-idf model object using models.TfidfModel on ‘bow_corpus’ and save it to ‘tfidf’, then apply transformation to the entire corpus and call it ‘corpus_tfidf’. Finally we preview TF-IDF scores for our first document.

In [49]:
from pprint import pprint
from gensim import corpora, models

# Fit a TF-IDF model on the bag-of-words corpus and wrap the corpus with it.
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# Show the weights of the first document only.
first_tfidf_doc = next(iter(corpus_tfidf), None)
if first_tfidf_doc is not None:
    pprint(first_tfidf_doc)

[(0, 0.5892908644709983),
 (1, 0.38929657403503015),
 (2, 0.4964985198530063),
 (3, 0.5046520328695662)]


## Running LDA using Bag of Words
Train our lda model using gensim.models.LdaMulticore and save it to ‘lda_model’

In [51]:
# Settings for the LDA model trained on the bag-of-words corpus.
lda_bow_settings = dict(
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=2,
)
lda_model = gensim.models.LdaMulticore(bow_corpus, **lda_bow_settings)

In [58]:
# NOTE(review): exploratory type check; its execution count (In [58]) is out of sequence
# with the neighbouring cells — consider removing it from the final notebook.
type(lda_model)

gensim.models.ldamulticore.LdaMulticore

For each topic, we will explore the words occurring in that topic and their relative weights.

In [52]:
# List every topic of the bag-of-words model with its weighted terms.
bow_topics = lda_model.print_topics(-1)
for topic_id, topic_terms in bow_topics:
    print('Topic: {} \nWords: {}'.format(topic_id, topic_terms))

Topic: 0 
Words: 0.035*"charg" + 0.033*"court" + 0.025*"murder" + 0.021*"face" + 0.017*"accus" + 0.016*"tasmania" + 0.015*"alleg" + 0.014*"claim" + 0.014*"trial" + 0.012*"drug"
Topic: 1 
Words: 0.026*"elect" + 0.022*"australia" + 0.022*"south" + 0.012*"break" + 0.012*"deal" + 0.012*"leagu" + 0.011*"take" + 0.010*"win" + 0.010*"west" + 0.010*"vote"
Topic: 2 
Words: 0.023*"govern" + 0.023*"canberra" + 0.022*"test" + 0.016*"water" + 0.015*"price" + 0.012*"guilti" + 0.011*"find" + 0.010*"drum" + 0.009*"river" + 0.009*"shark"
Topic: 3 
Words: 0.021*"chang" + 0.020*"open" + 0.019*"market" + 0.016*"australian" + 0.015*"school" + 0.013*"peopl" + 0.013*"share" + 0.012*"final" + 0.011*"flood" + 0.011*"bank"
Topic: 4 
Words: 0.021*"north" + 0.019*"coast" + 0.019*"warn" + 0.016*"live" + 0.015*"rural" + 0.013*"health" + 0.012*"gold" + 0.012*"concern" + 0.011*"govern" + 0.011*"worker"
Topic: 5 
Words: 0.036*"year" + 0.018*"adelaid" + 0.017*"famili" + 0.015*"child" + 0.014*"turnbul" + 0.012*"sentenc"

## Running LDA using TF-IDF


In [53]:
# Train a second LDA model, this time on the TF-IDF weighted corpus.
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=4,
)
# List every topic of the TF-IDF model with its weighted terms.
for topic_id, topic_terms in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(topic_id, topic_terms))

Topic: 0 Word: 0.014*"interview" + 0.009*"leagu" + 0.008*"final" + 0.007*"world" + 0.007*"australia" + 0.006*"john" + 0.005*"beat" + 0.005*"domest" + 0.005*"coach" + 0.005*"cricket"
Topic: 1 Word: 0.015*"crash" + 0.009*"die" + 0.008*"miss" + 0.008*"search" + 0.008*"polic" + 0.008*"dead" + 0.006*"kill" + 0.006*"truck" + 0.006*"peter" + 0.006*"malcolm"
Topic: 2 Word: 0.020*"charg" + 0.016*"murder" + 0.015*"polic" + 0.013*"court" + 0.011*"alleg" + 0.011*"jail" + 0.010*"woman" + 0.009*"arrest" + 0.009*"death" + 0.009*"assault"
Topic: 3 Word: 0.022*"countri" + 0.020*"trump" + 0.020*"hour" + 0.010*"donald" + 0.010*"podcast" + 0.009*"sport" + 0.007*"ash" + 0.006*"sexual" + 0.005*"univers" + 0.004*"farm"
Topic: 4 Word: 0.007*"dollar" + 0.007*"michael" + 0.006*"stori" + 0.006*"decemb" + 0.005*"export" + 0.005*"andrew" + 0.005*"australian" + 0.005*"drive" + 0.005*"drought" + 0.004*"coal"
Topic: 5 Word: 0.013*"coast" + 0.010*"queensland" + 0.008*"gold" + 0.007*"tasmania" + 0.006*"juli" + 0.006*"w

## Performance evaluation by classifying sample document using LDA Bag of Words model

In [54]:
# Preprocessed tokens of the sample document (same 4310 key used for the sample above).
processed_docs[4310]

['rain', 'help', 'dampen', 'bushfir']

In [55]:
# Topic scores for the sample document, highest score first.
sample_bow_topics = sorted(lda_model[bow_corpus[4310]], key=lambda pair: pair[1], reverse=True)
for topic_id, topic_score in sample_bow_topics:
    print("\nScore: {}\t \nTopic: {}".format(topic_score, lda_model.print_topic(topic_id, 10)))


Score: 0.6192572712898254	 
Topic: 0.023*"govern" + 0.023*"canberra" + 0.022*"test" + 0.016*"water" + 0.015*"price" + 0.012*"guilti" + 0.011*"find" + 0.010*"drum" + 0.009*"river" + 0.009*"shark"

Score: 0.2207210510969162	 
Topic: 0.021*"chang" + 0.020*"open" + 0.019*"market" + 0.016*"australian" + 0.015*"school" + 0.013*"peopl" + 0.013*"share" + 0.012*"final" + 0.011*"flood" + 0.011*"bank"

Score: 0.02000955119729042	 
Topic: 0.036*"year" + 0.018*"adelaid" + 0.017*"famili" + 0.015*"child" + 0.014*"turnbul" + 0.012*"sentenc" + 0.011*"week" + 0.010*"prison" + 0.010*"john" + 0.010*"say"

Score: 0.02000592276453972	 
Topic: 0.026*"elect" + 0.022*"australia" + 0.022*"south" + 0.012*"break" + 0.012*"deal" + 0.012*"leagu" + 0.011*"take" + 0.010*"win" + 0.010*"west" + 0.010*"vote"

Score: 0.020003166049718857	 
Topic: 0.020*"plan" + 0.015*"council" + 0.015*"countri" + 0.015*"state" + 0.013*"perth" + 0.012*"hour" + 0.012*"indigen" + 0.010*"communiti" + 0.010*"fund" + 0.010*"busi"

Score: 0.02

Our test document has the highest probability to be part of the topic that our model assigned, which is the accurate classification.

## Performance evaluation by classifying sample document using LDA TF-IDF model.

In [56]:
# Topic scores for the sample document under the TF-IDF model, highest score first.
# NOTE(review): `lda_model_tfidf` was trained on `corpus_tfidf` but is queried here with the
# raw bag-of-words vector — confirm whether `corpus_tfidf[4310]` was intended.
sample_tfidf_topics = sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda pair: pair[1], reverse=True)
for topic_id, topic_score in sample_tfidf_topics:
    print("\nScore: {}\t \nTopic: {}".format(topic_score, lda_model_tfidf.print_topic(topic_id, 10)))


Score: 0.6069387197494507	 
Topic: 0.011*"weather" + 0.008*"flood" + 0.007*"friday" + 0.007*"australia" + 0.006*"outback" + 0.006*"zealand" + 0.005*"wrap" + 0.005*"mark" + 0.005*"rain" + 0.005*"march"

Score: 0.23303067684173584	 
Topic: 0.014*"interview" + 0.009*"leagu" + 0.008*"final" + 0.007*"world" + 0.007*"australia" + 0.006*"john" + 0.005*"beat" + 0.005*"domest" + 0.005*"coach" + 0.005*"cricket"

Score: 0.02000514045357704	 
Topic: 0.022*"countri" + 0.020*"trump" + 0.020*"hour" + 0.010*"donald" + 0.010*"podcast" + 0.009*"sport" + 0.007*"ash" + 0.006*"sexual" + 0.005*"univers" + 0.004*"farm"

Score: 0.020004868507385254	 
Topic: 0.015*"crash" + 0.009*"die" + 0.008*"miss" + 0.008*"search" + 0.008*"polic" + 0.008*"dead" + 0.006*"kill" + 0.006*"truck" + 0.006*"peter" + 0.006*"malcolm"

Score: 0.020004410296678543	 
Topic: 0.007*"dollar" + 0.007*"michael" + 0.006*"stori" + 0.006*"decemb" + 0.005*"export" + 0.005*"andrew" + 0.005*"australian" + 0.005*"drive" + 0.005*"drought" + 0.004*

## Testing model on unseen document


In [57]:
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
# Preprocess the new headline the same way as the corpus, then map it onto the existing vocabulary.
unseen_tokens = preprocess(unseen_document)
bow_vector = dictionary.doc2bow(unseen_tokens)
# Topic scores for the unseen headline, highest score first.
unseen_topics = sorted(lda_model[bow_vector], key=lambda pair: pair[1], reverse=True)
for topic_id, topic_score in unseen_topics:
    print("Score: {}\t Topic: {}".format(topic_score, lda_model.print_topic(topic_id, 5)))

Score: 0.3500000536441803	 Topic: 0.026*"elect" + 0.022*"australia" + 0.022*"south" + 0.012*"break" + 0.012*"deal"
Score: 0.1833333820104599	 Topic: 0.021*"north" + 0.019*"coast" + 0.019*"warn" + 0.016*"live" + 0.015*"rural"
Score: 0.1833333522081375	 Topic: 0.021*"chang" + 0.020*"open" + 0.019*"market" + 0.016*"australian" + 0.015*"school"
Score: 0.18333320319652557	 Topic: 0.023*"kill" + 0.022*"nation" + 0.022*"australian" + 0.019*"attack" + 0.015*"dead"
Score: 0.01666666753590107	 Topic: 0.035*"charg" + 0.033*"court" + 0.025*"murder" + 0.021*"face" + 0.017*"accus"
Score: 0.01666666753590107	 Topic: 0.023*"govern" + 0.023*"canberra" + 0.022*"test" + 0.016*"water" + 0.015*"price"
Score: 0.01666666753590107	 Topic: 0.036*"year" + 0.018*"adelaid" + 0.017*"famili" + 0.015*"child" + 0.014*"turnbul"
Score: 0.01666666753590107	 Topic: 0.020*"plan" + 0.015*"council" + 0.015*"countri" + 0.015*"state" + 0.013*"perth"
Score: 0.01666666753590107	 Topic: 0.036*"trump" + 0.028*"queensland" + 0.021