In [29]:
import numpy as np
import pandas as pd

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_lg

from tqdm import tqdm_notebook as tqdm
from pprint import pprint

import pyLDAvis.gensim
from collections import defaultdict



In [3]:
# read water and diarrhea articles
df=pd.read_csv('articles_diarrhea.csv')

In [6]:
news_df=df.loc[~df['text'].isna()]
news_df.reset_index(inplace=True,drop=True)

In [7]:
# removing everything except alphabets`
news_df['clean_doc'] = news_df['text'].str.replace("[^a-zA-Z#]", " ")

# removing short words
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news_df['clean_doc'] = news_df['text'].str.replace("[^a-zA-Z#]", " ")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))


In [41]:
nlp= spacy.load("en")

# My list of stop words.
stop_list = ["Mrs.","Ms.","say","'s","Mr.",'Prof.','\n']

# Updates spaCy's default stop words list with my additional words. 
nlp.Defaults.stop_words.update(stop_list)

# Iterates over the words in the stop words list and resets the "is_stop" flag.
for word in STOP_WORDS:
    lexeme = nlp.vocab[word]
    lexeme.is_stop = True

In [42]:
def lemmatizer(doc):
   # take only certain position of sentences
    doc2=[]
    for token in doc:
        if ((token.pos_ != 'PRON') and (token.pos_!= 'SCONJ') and (token.is_currency!=True)  and (token.like_num!=True) and (token.ent_type_ in ['','LAW','PRODUCT','EVENT'] ) and not token.is_digit and (token.pos_ != 'CCONJ') and (token.pos_ != 'DET') and (token.pos_ != 'ADP') and (token.pos_ != 'PART') and (token.pos_ != 'ADV') and (token.pos_ != 'AUX') ):
            doc2.append(token.lemma_.lower())
    doc2 = u' '.join(doc2)
    return nlp.make_doc(doc2)
    
def remove_stopwords(doc):
    # This will remove stopwords and punctuation.
    # Use token.text to return strings, which we'll need for Gensim.
    doc = [token.text for token in doc if token.is_stop != True and token.is_punct != True]
    return doc

# The add_pipe function appends our functions to the default pipeline.
nlp.add_pipe(lemmatizer,name='lemmatizer',after='ner')
nlp.add_pipe(remove_stopwords, name="stopwords", last=True)

In [43]:
doc_list = []
# Iterates through each article in the corpus.
for doc in tqdm(newsdoc):
    # Passes that article through the pipeline and adds to a new list.
    pr = nlp(doc)
    doc_list.append(pr)   

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for doc in tqdm(newsdoc):


HBox(children=(FloatProgress(value=0.0, max=10940.0), HTML(value='')))




In [45]:
frequency = defaultdict(int)
for text in doc_list:
    for token in text:
        frequency[token] += 1

doc_list = [
    [token for token in text if (frequency[token] > 1) and not('\n' in token) and not ('Prof.' in token)]
    for text in doc_list
]

pprint(doc_list[0])

['listen',
 'article',
 'professor',
 'director',
 'health',
 'development',
 'geographer',
 'research',
 'interest',
 'underscore',
 'sanitation',
 'issue',
 'cost',
 'population',
 'defecate',
 'equate',
 '5.4million',
 'population',
 'estimate',
 'accord',
 'prof',
 'mariwah',
 'open',
 'defecation',
 'situation',
 'people',
 'choose',
 'ease',
 'beach',
 'gutter',
 'house',
 'consistent',
 'habit',
 'ask',
 'description',
 'decide',
 'ease',
 'nearby',
 's',
 'develop',
 'stomach',
 'upset',
 'travel',
 'open',
 'defecation',
 'count',
 'calculate',
 'people',
 'practice',
 'open',
 'defecation',
 'consistent',
 'habit',
 'person',
 'happen',
 'individual',
 'alternative',
 'moment',
 'explain',
 'cause',
 'open',
 'defecation',
 'cultural',
 'social',
 'spiritual',
 'economic',
 'dimension',
 'system',
 'toilet',
 'build',
 'outskirt',
 'town',
 'people',
 'build',
 'house',
 'incorporate',
 'toilet',
 'facility',
 'add',
 'jurisdiction',
 'people',
 'defecate',
 'provide',
 'toil

In [46]:
# Creates, which is a mapping of word IDs to words.
words = corpora.Dictionary(doc_list)

# Turns each document into a bag of words.
corpus = [words.doc2bow(doc) for doc in doc_list]

In [47]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=words,
                                           num_topics=5, 
                                           random_state=2,
                                           update_every=1,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [48]:
pprint(lda_model.print_topics(num_words=20))

[(0,
  '0.020*"people" + 0.014*"water" + 0.014*"cholera" + 0.012*"child" + '
  '0.009*"health" + 0.009*"case" + 0.008*"camp" + 0.007*"disease" + '
  '0.007*"need" + 0.007*"area" + 0.006*"outbreak" + 0.006*"food" + '
  '0.006*"country" + 0.006*"refugee" + 0.006*"aid" + 0.005*"humanitarian" + '
  '0.005*"include" + 0.005*"diarrhoea" + 0.005*"government" + 0.005*"provide"'),
 (1,
  '0.054*"death" + 0.041*"disease" + 0.025*"age" + 0.022*"cancer" + '
  '0.019*"cause" + 0.014*"professor" + 0.014*"rate" + 0.013*"man" + '
  '0.012*"die" + 0.011*"suicide" + 0.011*"people" + 0.010*"killer" + '
  '0.010*"circulatory" + 0.010*"life" + 0.010*"infectious" + 0.009*"group" + '
  '0.008*"young" + 0.008*"new" + 0.008*"injury" + 0.008*"account"'),
 (2,
  '0.018*"good" + 0.012*"beach" + 0.009*"use" + 0.006*"poor" + 0.005*"include" '
  '+ 0.005*"site" + 0.005*"health" + 0.005*"ship" + 0.005*"time" + '
  '0.005*"water" + 0.004*"study" + 0.004*"swimming" + 0.004*"food" + '
  '0.004*"datum" + 0.003*"report" +

In [49]:
pyLDAvis.enable_notebook()

In [50]:
pyLDAvis.gensim.prepare(lda_model, corpus, words)