# Topic Modeling using LDA

In [25]:
# libraries

import pandas as pd
from wordcloud import WordCloud
import re
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2019)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/siddharth/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [26]:
# Load the headline corpus in one chain: read the CSV, discard the publish
# date, and copy the row index into an explicit `index` column.
news = (
    pd.read_csv("abcnews-date-text.csv")
    .drop(columns="publish_date")
    .assign(index=lambda frame: frame.index)
)
news[100:120]

Unnamed: 0,headline_text,index
100,more women urged to become councillors,100
101,most highly educated live in nsw wa,101
102,mp raises hospital concerns in parliament,102
103,mp rejects ambulance levy claims,103
104,mugabe to touch down in paris for summit,104
105,national gallery gets all clear after,105
106,nato gives green light to defend turkey,106
107,nca defends aboriginal tent embassy raid,107
108,new zealand imposes visa entry for zimbabwe,108
109,no side effects for new whooping cough vaccine,109


In [77]:
# Shared NLP helpers, built once. Constructing a WordNetLemmatizer for every
# token is loop-invariant work when this is mapped over the whole corpus.
stemmer = SnowballStemmer("english")
_lemmatizer = WordNetLemmatizer()

def lemmatize_stemming(text, stem=False):
    """Lemmatize a single token as a verb, optionally stemming the lemma.

    Parameters
    ----------
    text : str
        One token to normalise.
    stem : bool, default False
        When False, return the WordNet verb lemma unchanged; this matches the
        behaviour of the one-argument call used by `preprocess`.
        When True, additionally run the module-level Snowball `stemmer` over
        the lemma (lemmatize first, then stem).

    Returns
    -------
    str
        The lemma, or the stemmed lemma when `stem` is True.
    """
    lemma = _lemmatizer.lemmatize(text, pos='v')
    return stemmer.stem(lemma) if stem else lemma

def preprocess(text):
    """Convert one raw text string into a list of normalised tokens.

    The text is tokenised with gensim's `simple_preprocess`. Tokens that are
    gensim stop words, or that are three characters or shorter, are dropped;
    each surviving token is passed through `lemmatize_stemming`.

    Returns a list of strings in their original order.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    tokens = gensim.utils.simple_preprocess(text)
    return [
        lemmatize_stemming(word)
        for word in tokens
        if len(word) > 3 and word not in stop_words
    ]

In [78]:
# Apply `preprocess` to every headline; the result holds one token list per row.
processed_news = news["headline_text"].map(preprocess)

In [79]:
processed_news[100:120]

100                        [women, urge, councillors]
101                           [highly, educate, live]
102            [raise, hospital, concern, parliament]
103                  [reject, ambulance, levy, claim]
104                    [mugabe, touch, paris, summit]
105                   [national, gallery, get, clear]
106        [nato, give, green, light, defend, turkey]
107         [defend, aboriginal, tent, embassy, raid]
108          [zealand, impose, visa, entry, zimbabwe]
109                   [effect, whoop, cough, vaccine]
110                          [govt, hold, vegetation]
111              [defend, claim, run, race, campaign]
112                         [pledge, drought, relief]
113    [govt, boost, nurse, number, overseas, intake]
114        [koreans, seek, asylum, japanese, embassy]
115                          [nurse, student, intake]
116             [brother, time, say, ganguly, senior]
117                          [omodei, stay, politics]
118           [onesteel, inv

In [61]:
processed_news[100:120]

100                       [women, urg, councillor]
101                             [high, educ, live]
102            [rais, hospit, concern, parliament]
103                   [reject, ambul, levi, claim]
104                   [mugab, touch, pari, summit]
105                  [nation, galleri, get, clear]
106     [nato, give, green, light, defend, turkey]
107        [defend, aborigin, tent, embassi, raid]
108         [zealand, impos, visa, entri, zimbabw]
109                 [effect, whoop, cough, vaccin]
110                            [govt, hold, veget]
111           [defend, claim, run, race, campaign]
112                       [pledg, drought, relief]
113    [govt, boost, nurs, number, oversea, intak]
114       [korean, seek, asylum, japanes, embassi]
115                         [nurs, student, intak]
116          [brother, time, say, ganguli, senior]
117                          [omodei, stay, polit]
118         [onesteel, invest, whyalla, steelwork]
119        [opposit, urg, help,

In [71]:
# Compare three normalisations on sample words: stemming only, verb
# lemmatization only, and lemmatization followed by stemming.
original_words = [
    'caresses', 'flies', 'dies', 'mules', 'denied', 'died', 'agreed', 'owned',
    'humbled', 'sized', 'meeting', 'stating', 'siezing', 'itemization',
    'sensational', 'traditional', 'reference', 'colonizer', 'plotted',
]
lemmatizer = WordNetLemmatizer()  # one instance shared by every lookup below
stems = list(map(stemmer.stem, original_words))
lemmas = [lemmatizer.lemmatize(word, pos='v') for word in original_words]
# Stemming the lemmas computed above gives the same result as lemmatizing again.
lemma_stem = [stemmer.stem(lemma) for lemma in lemmas]
pd.DataFrame(
    data={
        'original word': original_words,
        'stemmed': stems,
        'lemma': lemmas,
        'apt output': lemma_stem,
    }
)

Unnamed: 0,original word,stemmed,lemma,apt output
0,caresses,caress,caress,caress
1,flies,fli,fly,fli
2,dies,die,die,die
3,mules,mule,mules,mule
4,denied,deni,deny,deni
5,died,die,die,die
6,agreed,agre,agree,agre
7,owned,own,own,own
8,humbled,humbl,humble,humbl
9,sized,size,size,size
