In [1]:
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
import numpy as np
import string
from gensim import corpora
import gensim
from nltk.corpus import words
from gensim.models import LdaModel, LsiModel, Word2Vec, CoherenceModel, TfidfModel
from gensim.corpora import Dictionary
import pyLDAvis.gensim
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.decomposition import PCA
from collections import defaultdict
from spacy.lang.en import STOP_WORDS
from pprint import pprint
import gensim.sklearn_api.ldamodel as gensimsklearn
from sklearn.manifold import TSNE
from stemming.porter2 import stem
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Topic coherence and its graph, top topics

In [3]:
# configure number of worker for multicore
import multiprocessing
cpu_count = multiprocessing.cpu_count()
worker = cpu_count - 1
print "No of worker to be used = %s"%worker

No of worker to be used = 7


In [4]:
# stop words
stop_wrd = []
with open('stopword.txt') as f:
    for c in f:
        c = c.replace(",","").replace("\n","").replace('"',"").strip()
        stop_wrd.append(c)
stop = list(STOP_WORDS)
print(len(stop))

305


# Creating Functions

In [5]:
# read document using filename
def read_document(file_name):
    print "Reading Document %s"%file_name
    temp = []
    with open(file_name) as f:
        for l in f:
            temp.append(l)
    print "Document Finished Reading"
    temp = clean_document(temp)
    return temp

In [6]:
#pls check this
def clean_document(doc_complete):
    print "Document Cleaning Started"
    doc_clean = []
    exclude = set(string.punctuation) 
    lemma = WordNetLemmatizer()
    def clean(doc):
        stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
        punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
        stop_free = " ".join([i for i in punc_free.split() if i not in stop_wrd])
        stop_free = ' '.join(i for i in stop_free.split() if not i.startswith('u0'))
        stop_free = " ".join([i for i in stop_free.lower().split() if not unicode(i, 'utf-8').isnumeric()])
        stemm = ' '.join(stem(i) for i in stop_free.split())
        normalized = " ".join(lemma.lemmatize(word) for word in stemm.split())
        return normalized
    doc_clean = [clean(doc).split() for doc in doc_complete]
    print "Document Cleaning Ended"
    return doc_clean

In [7]:
# return token frequency , clean document (using frequency of token)
def frequency_calculate(doc_clean):
    frequency = defaultdict(int)
    for text in doc_clean:
        for token in text:
            frequency[token] += 1        
    # make new text data by removing out the token that have occurence less then 2
    minimum_freq = 2
    len_of_token = 2
    # doc_clean = [[token for token in text if frequency[token] > minimum_freq]
    #           for text in doc_clean]
    doc_clean = [[token for token in text if len(token) > len_of_token and frequency[token] > minimum_freq]
              for text in doc_clean]
    print "Total Number of distinct keywords in document after cleaning is %s"%len(frequency.keys())
    return frequency, doc_clean

# LDA (Latent Dirichlet Allocation) Model -- Based on Probabilistic Graphical Models 

<div class="alert alert-block alert-info"> <font color='Blue'> Interpreting pyLDAvis output:
<ul> <li>Bubbles on the left-hand side of plot represents a topic. Larger the bubble, the more prevalent is that topic. </li>
<li>Good topic model will have fairly big, non-overlapping bubbles scattered throughout the chart instead of being clustered in one quadrant. </li>
<li>Model with too many topics, will have many overlaps, small sized bubbles are clustered in one region of the chart.</li>
<li>The words on the right-hand side are the salient keywords that form the selected topic.</li>
<li>The red bars represent the frequency of a term in a given topic, and the blue bars represent term's frequency across the entire corpus.</li> </ul></font> </div>

In [8]:
def prepare_lda_prerequisite(doc_clean):
    dictionary = corpora.Dictionary(doc_clean)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    return dictionary, doc_term_matrix

In [9]:
def find_lda_optimzal_number_topics(dictionary, corpus, texts, limit=101):
    c_v = [0]*limit
    lm_list = []
    for num_topics in range(10, limit):
        lm = gensim.models.ldamulticore.LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dictionary)
        lm_list.append(lm)
        cm = CoherenceModel(model=lm, texts=texts, dictionary=dictionary, coherence='c_v')
        c_v[num_topics] = cm.get_coherence()
        print "Coherence of model with number of topic = %s is %s"%(num_topics,cm.get_coherence())
        
    best_topic = c_v.index(max(c_v))
    print "Optimal Number of Topics in LDA model is %s with cohernce value %s"%(best_topic,max(c_v))
    return best_topic

In [10]:
def run_lda(dictionary,doc_term_matrix,num_topics=100):
    Lda = gensim.models.ldamulticore.LdaMulticore
    ldamodel = Lda(doc_term_matrix, num_topics=num_topics, id2word = dictionary, passes=20, workers=worker)
    return ldamodel

# <div class="alert alert-block alert-info"><font color='red'> Non-Anonymous User (Negative Comments) <font> </div> #



In [11]:
# read the document
doc_cleaned = read_document('verified_negativecomment.txt')

Reading Document verified_negativecomment.txt
Document Finished Reading
Document Cleaning Started
Document Cleaning Ended


In [12]:
frequency_doc, doc_cleaned = frequency_calculate(doc_cleaned)

Total Number of distinct keywords in document after cleaning is 84309


In [13]:
dictionary, doc_term_matrix = prepare_lda_prerequisite(doc_cleaned)

In [14]:
num_lda_topics =  find_lda_optimzal_number_topics(dictionary,doc_term_matrix,doc_cleaned)

Coherence of model with number of topic = 10 is 0.4650348124466592
Coherence of model with number of topic = 11 is 0.46869489612356696
Coherence of model with number of topic = 12 is 0.47594738496942496
Coherence of model with number of topic = 13 is 0.4477310305931159
Coherence of model with number of topic = 14 is 0.4659642754021219
Coherence of model with number of topic = 15 is 0.45776083872758433
Coherence of model with number of topic = 16 is 0.47029336237378283
Coherence of model with number of topic = 17 is 0.44736484328168596
Coherence of model with number of topic = 18 is 0.4406028931391488
Coherence of model with number of topic = 19 is 0.46072878462509476
Coherence of model with number of topic = 20 is 0.46818213721080326
Coherence of model with number of topic = 21 is 0.4673397787108058
Coherence of model with number of topic = 22 is 0.4381743147480405
Coherence of model with number of topic = 23 is 0.44177450133348395
Coherence of model with number of topic = 24 is 0.4596

In [15]:
ldamodel = run_lda(dictionary,doc_term_matrix,num_lda_topics)

In [16]:
%time
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 4.05 µs


In [None]:
# Topic 1: Politics(corruption, scam) 
# Topic 2: Education and job (food, less jobs)
# Topic 3: Politics (Blame, fool, stupid, idiot, fool)
# Topic 4: Accident (Phone, car, danger, fall)
# Topic 5: Border issues (army, terrorist, kill, attack, war)
# Topic 6: Communal disharmony (secular, kill, hate, convert)
# Topic 7: Law and Order (Police, arrest, criminal, punish, jail, illegal)
# Topic 8: Crime (kill, murder, death, punish)
# Topic 9: Economy and politics (bad, wrong, criticise)
# Topic 10: Black money (tax, jail, income, interest)
# Topic 11: Corruption (lie, price, inflation)
# Topic 12: Government policies and law (fake, useless, failure, resign, murder)

# <div class="alert alert-block alert-info"><font color='red'> Anonymous user (Negative Comments) </font></div>

In [17]:
# read the document
doc_cleaned = read_document('anony_negativecomment.txt')

Reading Document anony_negativecomment.txt
Document Finished Reading
Document Cleaning Started
Document Cleaning Ended


In [18]:
frequency_doc, doc_cleaned = frequency_calculate(doc_cleaned)

Total Number of distinct keywords in document after cleaning is 704


In [19]:
dictionary, doc_term_matrix = prepare_lda_prerequisite(doc_cleaned)

In [20]:
num_lda_topics =  find_lda_optimzal_number_topics(dictionary,doc_term_matrix,doc_cleaned)

Coherence of model with number of topic = 10 is 0.47041144585314515
Coherence of model with number of topic = 11 is 0.4752156096177307
Coherence of model with number of topic = 12 is 0.4652309033891455
Coherence of model with number of topic = 13 is 0.47096174824640663
Coherence of model with number of topic = 14 is 0.4672498912320216
Coherence of model with number of topic = 15 is 0.4609842425494831
Coherence of model with number of topic = 16 is 0.5013875116047704
Coherence of model with number of topic = 17 is 0.4997698082706584
Coherence of model with number of topic = 18 is 0.4910470934573593
Coherence of model with number of topic = 19 is 0.5519267309991983
Coherence of model with number of topic = 20 is 0.5234043543468954
Coherence of model with number of topic = 21 is 0.4718209901024699
Coherence of model with number of topic = 22 is 0.44000539847202447
Coherence of model with number of topic = 23 is 0.5037170512879208
Coherence of model with number of topic = 24 is 0.436161598

In [21]:
ldamodel = run_lda(dictionary,doc_term_matrix,num_lda_topics)

In [22]:
%time
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.91 µs


In [None]:
# Topic 1: Politics (government, vote, party, wrong, case)
# Topic 2: Workplace (Employee, quit, penalty, death)
# Topic 3: Crime (man poisons woman)
# Topic 4: Crime (husband throws wife, money, treason, court)
# Topic 5: Politics (vote, blame, wrong)
# Topic 6: Politics (fail, vote, treason)
# Topic 7: Politics (hurt, fail, death)
# Topic 8: Accident (car accident, death)
# Topic 9: Crime (prison, money, penalty)
# Topic 10: Crime (child exploitation)
# Topic 11: Judiciary (money, treason, penalty)
# Topic 12: Crime (penalty, poison, throw, condition)
# Topic 13: Workplace (fail, quit, treason, money)
# Topic 14: Politics (Government, fail, penalty)
# Topic 15: Crime (money, court, penalty)
# Topic 16: Crime (court, penalty, hurt, doubt)
# Topic 17: Crime (court, hurt, prove, fail)
# Topic 18: Crime (money, court, fail, limit)

# <div class="alert alert-block alert-info"><font color='red'> Users with Anonymous profiles (Negative Comments) </font> </div>

In [23]:
# read the document
doc_cleaned = read_document('null_negativecomment.txt')

Reading Document null_negativecomment.txt
Document Finished Reading
Document Cleaning Started
Document Cleaning Ended


In [24]:
frequency_doc, doc_cleaned = frequency_calculate(doc_cleaned)

Total Number of distinct keywords in document after cleaning is 6991


In [25]:
dictionary, doc_term_matrix = prepare_lda_prerequisite(doc_cleaned)

In [26]:
num_lda_topics =  find_lda_optimzal_number_topics(dictionary,doc_term_matrix,doc_cleaned)

Coherence of model with number of topic = 10 is 0.2893470678649528
Coherence of model with number of topic = 11 is 0.3210477087750558
Coherence of model with number of topic = 12 is 0.31533488906393614
Coherence of model with number of topic = 13 is 0.30531871517252135
Coherence of model with number of topic = 14 is 0.3025408823444958
Coherence of model with number of topic = 15 is 0.30805440917313415
Coherence of model with number of topic = 16 is 0.28828237070985735
Coherence of model with number of topic = 17 is 0.31287576581548016
Coherence of model with number of topic = 18 is 0.2866261226860264
Coherence of model with number of topic = 19 is 0.3050257289883424
Coherence of model with number of topic = 20 is 0.30429593895076607
Coherence of model with number of topic = 21 is 0.2960015710613187
Coherence of model with number of topic = 22 is 0.3117491042248529
Coherence of model with number of topic = 23 is 0.31009585656029026
Coherence of model with number of topic = 24 is 0.30843

In [27]:
ldamodel = run_lda(dictionary,doc_term_matrix,num_lda_topics)

In [28]:
%time
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.96 µs


In [None]:
# Topic 1: Law (anti-slaughter cow law, reality)
# Topic 2: Economy (cash, cashless, corruption)
# Topic 3: Female exploitation (patriarch society)
# Topic 4: Colonisation (exploitation of people)
# Topic 5: Corruption (govt. staff, ombudsman, notice, investigation)
# Topic 6: Border water issues
# Topic 7: Black money (list, Supreme Court)
# Topic 8: Economy (Corruption, infrastructure, failure)
# Topic 9: School (police, boy, girl, friend, shoots)
# Topic 10: Politics (hide, poor, shame, faith, coward) 
# Topic 11: Corporate politics (land, farmer)
# Topic 12: Moral issues in kid (blast, bomb, lie, problem)
# Topic 13: Law (summon, court, law)
# Topic 14: Politics (vote, stupid, foolish, Encompass, corrupt)
# Topic 15: Politics (two-faced politician, real face, ugly, pretend)
# Topic 16: Work (money, hate work, watch, fun)
# Topic 17: Sports (team, hardwork, time left)
# Topic 18: Politics (power, conspiracy, poor hungry ppl, failure)