In [45]:
import numpy as np
import pandas as pd

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.test.utils import datapath
import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_lg

from tqdm import tqdm_notebook as tqdm
from pprint import pprint

import pyLDAvis.gensim
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD


In [2]:
df=pd.read_csv('all_adi_text.csv')

In [3]:
news_df=df.loc[~df['text'].isna()]
news_df.reset_index(inplace=True,drop=True)

In [4]:
# removing everything except alphabets`
news_df['clean_doc'] = news_df['text'].str.replace("[^a-zA-Z#]", " ")

# removing short words
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news_df['clean_doc'] = news_df['text'].str.replace("[^a-zA-Z#]", " ")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))


In [26]:
nlp= spacy.load("en")

# My list of stop words.
stop_list = ["Mrs.","Ms.","say","'s","Mr.","Prof.",'\n','\n\n','country','region','people','area','water','person']

# Updates spaCy's default stop words list with my additional words. 
nlp.Defaults.stop_words.update(stop_list)

# Iterates over the words in the stop words list and resets the "is_stop" flag.
for word in STOP_WORDS:
    lexeme = nlp.vocab[word]
    lexeme.is_stop = True

In [27]:
def lemmatizer(doc):
    # take only certain position of sentences
    doc2=[]
    for token in doc:
        if ((token.pos_ != 'PRON') and (token.pos_!= 'SCONJ') and (token.is_currency!=True)  and (token.like_num!=True) and (token.like_url!=True) and (token.ent_type_ in ['','LAW','PRODUCT','EVENT'] ) and not token.is_digit and (token.pos_ != 'CCONJ') and (token.pos_ != 'DET') and (token.pos_ != 'ADP') and (token.pos_ != 'PART') and (token.pos_ != 'ADV') and (token.pos_ != 'AUX') ):
            doc2.append(token.lemma_.lower())
    doc2 = u' '.join(doc2)
    return nlp.make_doc(doc2)
    
def remove_stopwords(doc):
    # This will remove stopwords and punctuation.
    # Use token.text to return strings, which we'll need for Gensim.
    doc = [token.text for token in doc if token.is_stop != True and token.is_punct != True]
    return doc

# The add_pipe function appends our functions to the default pipeline.
nlp.add_pipe(lemmatizer,name='lemmatizer',after='ner')
nlp.add_pipe(remove_stopwords, name="stopwords", last=True)

In [28]:
newsdoc=news_df['clean_doc'].unique().tolist()

In [29]:
doc_list = []
# Iterates through each article in the corpus.
for doc in tqdm(newsdoc):
    # Passes that article through the pipeline and adds to a new list.
    pr = nlp(doc)
    doc_list.append(pr)   

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for doc in tqdm(newsdoc):


HBox(children=(FloatProgress(value=0.0, max=630.0), HTML(value='')))




In [30]:
# remove words that only appears once
frequency = defaultdict(int)
for text in doc_list:
    for token in text:
        frequency[token] += 1

doc_list = [
    [token for token in text if (frequency[token] > 1) and not('\n' in token)]
    for text in doc_list
]

pprint(doc_list[0])

['writer',
 'democratic',
 'republic',
 'world',
 'conflict',
 'nation',
 'reason',
 'break',
 'include',
 'conflict',
 'mineral',
 'food',
 'resource',
 'collapse',
 'infrastructure',
 'fighting',
 'wet',
 'nation',
 'majority',
 'rural',
 'access',
 'sanitary',
 'lack',
 'infrastructure',
 'fact',
 'study',
 'carry',
 'find',
 'die',
 'violence',
 'malnutrition',
 'problem',
 'associate',
 'lack',
 'state',
 'utility',
 'ability',
 'improve',
 'pumping',
 'system',
 'lack',
 'fund',
 'undertake',
 'project',
 'continue',
 'pump',
 'needy',
 'rusty',
 'decay',
 'pipe',
 'accord',
 'irin',
 'percent',
 'urban',
 'receive',
 'state',
 'utility',
 'mean',
 'significant',
 'citizen',
 'receive',
 'town',
 'village',
 'result',
 'local',
 'find',
 'option',
 'satisfy',
 'need',
 'town',
 'northwestern',
 'district',
 'local',
 'able',
 'local',
 'stream',
 'pond',
 'pipe',
 'come',
 'state',
 'utility',
 'local',
 'stream',
 'use',
 'drinking',
 'spring',
 'natural',
 'associate',
 'chemic

In [31]:
# Creates, which is a mapping of word IDs to words, a dictionary
words = corpora.Dictionary(doc_list)

# Turns each document into a bag of words.
corpus = [words.doc2bow(doc) for doc in doc_list]

In [41]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=words,
                                           num_topics=7, 
                                           random_state=2,
                                           update_every=1,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [42]:
pprint(lda_model.print_topics(num_words=10))

[(0,
  '0.015*"project" + 0.009*"supply" + 0.008*"high" + 0.008*"drinking" + '
  '0.008*"irrigation" + 0.008*"plant" + 0.007*"resource" + 0.007*"use" + '
  '0.006*"wastewater" + 0.006*"treatment"'),
 (1,
  '0.009*"drought" + 0.008*"community" + 0.008*"government" + 0.007*"flood" + '
  '0.007*"access" + 0.007*"clean" + 0.006*"rain" + 0.006*"affect" + '
  '0.005*"need" + 0.005*"supply"'),
 (2,
  '0.021*"food" + 0.012*"climate" + 0.011*"child" + 0.010*"disaster" + '
  '0.010*"change" + 0.010*"drought" + 0.007*"need" + 0.006*"risk" + '
  '0.006*"global" + 0.005*"conflict"'),
 (3,
  '0.018*"sanitation" + 0.013*"access" + 0.010*"supply" + 0.009*"service" + '
  '0.009*"government" + 0.007*"provide" + 0.007*"project" + 0.007*"improve" + '
  '0.007*"support" + 0.007*"include"'),
 (4,
  '0.025*"project" + 0.020*"drinking" + 0.019*"household" + 0.016*"source" + '
  '0.014*"community" + 0.013*"improve" + 0.013*"access" + 0.008*"study" + '
  '0.007*"include" + 0.007*"supply"'),
 (5,
  '0.016*"sampl

In [47]:
# Save model to disk.
temp_file = datapath("model")
lda_model.save(temp_file)

In [21]:
pyLDAvis.enable_notebook()

In [43]:
pyLDAvis.gensim.prepare(lda_model, corpus, words)

In [48]:
# Try topic modeling with lsa
doc_list_lsa=[]
for doc in doc_list :
    doc2=' '.join(doc)
    doc_list_lsa.append(doc2)
len(doc_list_lsa)

630

In [50]:
wordcounter=TfidfVectorizer(stop_words='english', 
                             use_idf=True, 
                             smooth_idf=True)
matrix=wordcounter.fit_transform(doc_list_lsa)
print(matrix.toarray())

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.22688739]]


In [51]:
dictionary=wordcounter.get_feature_names()

In [52]:
# categorizing words into topics
svd=TruncatedSVD(n_components=10,n_iter=50000,algorithm='randomized')
# grouping text into topics
lsa=svd.fit_transform(matrix)
print(lsa)

[[ 0.2666566  -0.00763495 -0.01564843 ... -0.02073691 -0.06364939
   0.0121469 ]
 [ 0.34647994 -0.1032875  -0.28997223 ...  0.03845197  0.03919895
   0.16808692]
 [ 0.26992308 -0.02726375 -0.08542608 ...  0.05495939  0.02768117
   0.0339079 ]
 ...
 [ 0.18501164  0.28272507  0.02629801 ... -0.01715923 -0.06787498
   0.00091049]
 [ 0.3378486  -0.08600501 -0.20766439 ...  0.05053817 -0.13967204
  -0.06488943]
 [ 0.1627536   0.03705403 -0.07171516 ...  0.03307541 -0.1149492
  -0.02545245]]


In [53]:
for i, comp in enumerate(svd.components_):
    terms_comp = zip(dictionary, comp)
    sorted_terms = sorted(terms_comp, key= lambda x:x[1], reverse=True)[:15]
    print("Topic "+str(i)+": ")
    for t in sorted_terms:
        print(t[0])

Topic 0: 
sanitation
project
access
supply
community
drought
drinking
child
government
clean
food
improve
population
need
provide
Topic 1: 
flood
food
rain
drought
affect
heavy
displace
flooding
landslide
destroy
damage
house
disaster
crop
emergency
Topic 2: 
project
irrigation
plant
supply
construction
city
dam
minister
agricultural
reservoir
phase
complete
drought
farmer
desalination
Topic 3: 
food
drought
resource
climate
precipitation
renewable
irrigation
source
farmer
agricultural
change
year
agriculture
groundwater
pollution
Topic 4: 
precipitation
renewable
source
drinking
resources
safe
year
city
quality
river
depth
borehole
pollution
volume
use
Topic 5: 
precipitation
flood
renewable
resource
sanitation
management
improve
resources
sector
development
rain
service
program
total
irrigation
Topic 6: 
food
gastrointestinal
project
hand
destination
practice
follow
safety
occur
coli
quality
drinking
infection
traveller
risk
Topic 7: 
precipitation
project
renewable
food
resources
ye