Implementation of a topic model for NLP using Latent Dirichlet Allocation (LDA),
following the tutorial by Susan Li at https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [26]:
# Install gensim with the interpreter that runs this kernel: `sys.executable`
# is that interpreter's path, so `-m pip` is invoked through it rather than
# through whichever `pip` happens to be first on PATH.
# NOTE(review): the install is unpinned; the recorded output of this cell shows
# gensim 3.7.3 being downloaded — consider pinning that version.
import sys
!{sys.executable} -m pip install gensim

Collecting gensim
  Downloading https://files.pythonhosted.org/packages/b4/fb/c0cefcecf82b445ff2a714935db5b475a25202d6b63241c7e95ca004136a/gensim-3.7.3-cp36-cp36m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (24.7MB)
[K    100% |████████████████████████████████| 24.7MB 24kB/s  eta 0:00:01    75% |████████████████████████        | 18.5MB 13.1MB/s eta 0:00:01
Collecting smart-open>=1.7.0 (from gensim)
  Downloading https://files.pythonhosted.org/packages/bf/ba/7eaf3c0dbe601c43d88e449dcd7b61d385fe07c0167163f63f58ece7c1b5/smart_open-1.8.3.tar.gz (60kB)
[K    100% |████████████████████████████████| 61kB 2.3MB/s ta 0:00:011
Collecting boto3 (from smart-open>=1.7.0->gensim)
  Downloading https://files.pythonhosted.org/packages/20/9c/7111470f07700a6b06305943fc7521e49d9669dbda0c1862c4658130f235/boto3-1.9.146-py2.py3-none-any.whl (128kB)
[K    100% |████████████████████████████████| 133kB 3.7MB/s eta 0:00:01
[?25hCollecting jmespath<1.0.0,

In [50]:
#importing all needed libraries
import gensim
import nltk
import numpy as np
import pandas as pd
#from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS as STOPWORDS
from nltk.stem import WordNetLemmatizer as wnLemmatizer
#from nltk.stem.porter import *
np.random.seed(2018)
#nltk.download('wordnet')

In [7]:
# If you are running this notebook with the already-shortened data file
# ('abcnews-short.csv'), this cell is not needed.
#data = pd.read_csv('abcnews-date-text.csv',error_bad_lines=False)
#print(data.shape)
#data = data[:500000]
#data.to_csv('abcnews-short.csv')

(1103663, 2)


In [62]:
# Read the shortened CSV and keep only the headline text, exposed under the
# shorter column name 'headline'.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 in favour of
# `on_bad_lines` — confirm which pandas version this notebook targets.
data = pd.read_csv('abcnews-short.csv', error_bad_lines=False)
documents = data[['headline_text']].rename(columns={'headline_text': 'headline'})
print(len(documents))
print(documents.head(5))

500000
                                            headline
0  aba decides against community broadcasting lic...
1     act fire witnesses must be aware of defamation
2     a g calls for infrastructure protection summit
3           air nz staff in aust strike for pay rise
4      air nz strike to affect australian travellers


In [63]:
def lem_stem(text):
    """Return the WordNet lemma of a single token, treating it as a verb.

    Despite the name, no stemming is applied here: the token is only
    lemmatized with ``pos='v'``.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return lemmatizer.lemmatize(text, pos='v')


def preprocess(text):
    """Turn one headline into a list of lemmatized, stopword-free tokens.

    The text is tokenized with ``gensim.utils.simple_preprocess`` using
    ``min_len=4``, tokens found in gensim's ``STOPWORDS`` are dropped, and
    every remaining token is passed through ``lem_stem``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lem_stem(token)
        for token in gensim.utils.simple_preprocess(text, min_len=4)
        if token not in stopwords
    ]

In [64]:
# Spot-check the preprocessing on one headline (row 4310, first column).
doc_sample = documents.iloc[4310, 0]
print('original document: ', doc_sample)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document:  rain helps dampen bushfires


 tokenized and lemmatized document: 
['rain', 'help', 'dampen', 'bushfires']


In [65]:
# Apply `preprocess` to every headline; each entry becomes a list of tokens.
processed_docs = documents['headline'].map(preprocess)
processed_docs.head(10)

0              [decide, community, broadcast, licence]
1                         [witness, aware, defamation]
2           [call, infrastructure, protection, summit]
3                          [staff, aust, strike, rise]
4             [strike, affect, australian, travellers]
5               [ambitious, olsson, win, triple, jump]
6               [antic, delight, record, break, barca]
7    [aussie, qualifier, stosur, waste, memphis, ma...
8             [aust, address, security, council, iraq]
9                         [australia, lock, timetable]
Name: headline, dtype: object

In [66]:
# Build the id -> token dictionary from the tokenized headlines and print
# its first 11 entries as a sanity check.
dictionary = gensim.corpora.Dictionary(processed_docs)
for count, (k, v) in enumerate(dictionary.iteritems(), start=1):
    print(k, v)
    if count > 10:
        break

0 broadcast
1 community
2 decide
3 licence
4 aware
5 defamation
6 witness
7 call
8 infrastructure
9 protection
10 summit


In [68]:
# Inspect the class of the object built from `gensim.corpora.Dictionary`.
type(dictionary)

gensim.corpora.dictionary.Dictionary

In [69]:
# Prune the vocabulary in place: drop tokens that appear in fewer than 15
# documents or in more than 50% of all documents, then keep at most the
# 100000 most frequent of the remaining tokens.
# Note: this mutates `dictionary`, so token ids printed by earlier cells no
# longer apply after this cell has run.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [70]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
print(bow_corpus[4310])
#and translate back to word with the dictionary
for i in range(len(bow_corpus[4310])):
    print(dictionary[bow_corpus[4310][i][0]])

[(110, 1), (480, 1), (946, 1), (4132, 1)]
help
rain
bushfires
dampen


In [58]:
#writing my own function to preprocess text
def preProcess(originalString,lowerCase=True,minLength=4):
    """Split a text into words, filter them, and lemmatize what is left.

    Parameters
    ----------
    originalString : str
        The raw text to process.
    lowerCase : bool, default True
        If true, the text is lower-cased before it is split, so the stopword
        check is done on lower-case words.
    minLength : int, default 4
        Words with fewer characters than this are discarded.

    Returns
    -------
    list of str
        The remaining words, each lemmatized as a verb (``pos='v'``), in their
        original order.
    """
    if lowerCase:
        originalString = originalString.lower()
    # Build the lemmatizer once per call instead of once per word.
    lemmatizer = wnLemmatizer()
    # split() without a separator splits on any run of whitespace, so tabs,
    # newlines and repeated spaces separate words too and never yield empty
    # strings (split(' ') only recognised a single space character).
    return [
        lemmatizer.lemmatize(word, pos='v')
        for word in originalString.split()
        if len(word) >= minLength and word not in STOPWORDS
    ]

print('testing function:\n',preProcess(doc_sample))

testing function:
 ['rain', 'help', 'dampen', 'bushfires']


In [59]:
# `documents` has a single column named 'headline' (it was renamed from
# 'headline_text' when the frame was built), so index it by that name;
# 'headline_text' raises a KeyError when the notebook is run top to bottom.
tokenized_docs = documents['headline'].map(preProcess)
tokenized_docs[:10]

0              [decide, community, broadcast, licence]
1                         [witness, aware, defamation]
2           [call, infrastructure, protection, summit]
3                          [staff, aust, strike, rise]
4             [strike, affect, australian, travellers]
5               [ambitious, olsson, win, triple, jump]
6               [antic, delight, record, break, barca]
7    [aussie, qualifier, stosur, waste, memphis, ma...
8             [aust, address, security, council, iraq]
9                         [australia, lock, timetable]
Name: headline_text, dtype: object

In [None]:
#function to create a dictionary