In [1]:
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
import numpy as np
import string
from gensim import corpora
import gensim
from nltk.corpus import words
from gensim.models import LdaModel, LsiModel, Word2Vec, CoherenceModel, TfidfModel
from gensim.corpora import Dictionary
import pyLDAvis.gensim
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.decomposition import PCA
from collections import defaultdict
from spacy.lang.en import STOP_WORDS
from pprint import pprint
import gensim.sklearn_api.ldamodel as gensimsklearn
from sklearn.manifold import TSNE
from stemming.porter2 import stem
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Topic coherence and its graph, top topics

In [3]:
# configure number of worker for multicore
import multiprocessing
cpu_count = multiprocessing.cpu_count()
worker = cpu_count - 1
print "No of worker to be used = %s"%worker

No of worker to be used = 7


In [4]:
# stop words
stop_wrd = []
with open('stopword.txt') as f:
    for c in f:
        c = c.replace(",","").replace("\n","").replace('"',"").strip()
        stop_wrd.append(c)
stop = list(STOP_WORDS)
print(len(stop))

305


# Creating Functions

In [5]:
# read document using filename
def read_document(file_name):
    print "Reading Document %s"%file_name
    temp = []
    with open(file_name) as f:
        for l in f:
            temp.append(l)
    print "Document Finished Reading"
    temp = clean_document(temp)
    return temp

In [6]:
#pls check this
def clean_document(doc_complete):
    print "Document Cleaning Started"
    doc_clean = []
    exclude = set(string.punctuation) 
    lemma = WordNetLemmatizer()
    def clean(doc):
        stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
        punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
        stop_free = " ".join([i for i in punc_free.split() if i not in stop_wrd])
        stop_free = ' '.join(i for i in stop_free.split() if not i.startswith('u0'))
        stop_free = " ".join([i for i in stop_free.lower().split() if not unicode(i, 'utf-8').isnumeric()])
        stemm = ' '.join(stem(i) for i in stop_free.split())
        normalized = " ".join(lemma.lemmatize(word) for word in stemm.split())
        return normalized
    doc_clean = [clean(doc).split() for doc in doc_complete]
    print "Document Cleaning Ended"
    return doc_clean

In [7]:
# return token frequency , clean document (using frequency of token)
def frequency_calculate(doc_clean):
    frequency = defaultdict(int)
    for text in doc_clean:
        for token in text:
            frequency[token] += 1        
    # make new text data by removing out the token that have occurence less then 2
    minimum_freq = 2
    len_of_token = 2
    # doc_clean = [[token for token in text if frequency[token] > minimum_freq]
    #           for text in doc_clean]
    doc_clean = [[token for token in text if len(token) > len_of_token and frequency[token] > minimum_freq]
              for text in doc_clean]
    print "Total Number of distinct keywords in document after cleaning is %s"%len(frequency.keys())
    return frequency, doc_clean

# LDA (Latent Dirichlet Allocation) Model -- Based on Probabilistic Graphical Models 

<div class="alert alert-block alert-info"> <font color='Blue'> Interpreting pyLDAvis output:
<ul> <li>Bubbles on the left-hand side of plot represents a topic. Larger the bubble, the more prevalent is that topic. </li>
<li>Good topic model will have fairly big, non-overlapping bubbles scattered throughout the chart instead of being clustered in one quadrant. </li>
<li>Model with too many topics, will have many overlaps, small sized bubbles are clustered in one region of the chart.</li>
<li>The words on the right-hand side are the salient keywords that form the selected topic.</li>
<li>The red bars represent the frequency of a term in a given topic, and the blue bars represent term's frequency across the entire corpus.</li> </ul></font> </div>

In [8]:
def prepare_lda_prerequisite(doc_clean):
    dictionary = corpora.Dictionary(doc_clean)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    return dictionary, doc_term_matrix

In [9]:
def find_lda_optimzal_number_topics(dictionary, corpus, texts, limit=101):
    c_v = [0]*limit
    lm_list = []
    for num_topics in range(10, limit):
        lm = gensim.models.ldamulticore.LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dictionary)
        lm_list.append(lm)
        cm = CoherenceModel(model=lm, texts=texts, dictionary=dictionary, coherence='c_v')
        c_v[num_topics] = cm.get_coherence()
        print "Coherence of model with number of topic = %s is %s"%(num_topics,cm.get_coherence())
        
    best_topic = c_v.index(max(c_v))
    print "Optimal Number of Topics in LDA model is %s with cohernce value %s"%(best_topic,max(c_v))
    return best_topic

In [10]:
def run_lda(dictionary,doc_term_matrix,num_topics=100):
    Lda = gensim.models.ldamulticore.LdaMulticore
    ldamodel = Lda(doc_term_matrix, num_topics=num_topics, id2word = dictionary, passes=20, workers=worker)
    return ldamodel

# <div class="alert alert-block alert-info"><font color='red'> Non-Anonymous User (Neutral Comments) <font> </div> #



In [11]:
# read the document
doc_cleaned = read_document('verified_neutralcomment.txt')

Reading Document verified_neutralcomment.txt
Document Finished Reading
Document Cleaning Started
Document Cleaning Ended


In [12]:
frequency_doc, doc_cleaned = frequency_calculate(doc_cleaned)

Total Number of distinct keywords in document after cleaning is 68745


In [13]:
dictionary, doc_term_matrix = prepare_lda_prerequisite(doc_cleaned)

In [14]:
num_lda_topics =  find_lda_optimzal_number_topics(dictionary,doc_term_matrix,doc_cleaned)

Coherence of model with number of topic = 10 is 0.3884594562097606
Coherence of model with number of topic = 11 is 0.42850638051711853
Coherence of model with number of topic = 12 is 0.41004652238313893
Coherence of model with number of topic = 13 is 0.41985714338506974
Coherence of model with number of topic = 14 is 0.38646275965772
Coherence of model with number of topic = 15 is 0.36067722259668156
Coherence of model with number of topic = 16 is 0.3968195699442264
Coherence of model with number of topic = 17 is 0.3930782587506408
Coherence of model with number of topic = 18 is 0.39643927718244437
Coherence of model with number of topic = 19 is 0.3995560802425593
Coherence of model with number of topic = 20 is 0.40711261779411945
Coherence of model with number of topic = 21 is 0.4084756683368156
Coherence of model with number of topic = 22 is 0.3762579413622374
Coherence of model with number of topic = 23 is 0.4023326442663773
Coherence of model with number of topic = 24 is 0.40166520

In [15]:
ldamodel = run_lda(dictionary,doc_term_matrix,num_lda_topics)

In [16]:
%time
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 9.06 µs


In [None]:
# Topic 1: Border issues (dispute, dialogue)
# Topic 2: Elections (vote, choice, support)
# Topic 3: Crime (issue, charge, proof, solution, prison)
# Topic 4: Politics (respect, flag, compaign)
# Topic 5: Food (noodle, test, eat, complain)
# Topic 6: Economy (finance, debt, trade)
# Topic 7: Corruption (Scam)
# Topic 8: Crime (blame, loot, preach)
# Topic 9: Social (work, target, riot, rally)
# Topic 10: Economy (tax, income, justify)
# Topic 11: Politics (political party, govern, function)
# Topic 12: Politics (scandal, drama, change)
# Topic 13: Socio-political (farmer, purchase, fail, govt.)
# Topic 14: Social (rape, defence, police)
# Topic 15: Monetray (job, money, spend, reserve)
# Topic 16: Border issue (battle, flee, democracy)
# Topic 17: Crime (police, report, incident, night, protest)
# Topic 18: Politics (bribe, worker, health, rape)


# <div class="alert alert-block alert-info"><font color='red'> Anonymous user (Neutral Comments) </font></div>

In [17]:
# read the document
doc_cleaned = read_document('anony_neutralcomment.txt')

Reading Document anony_neutralcomment.txt
Document Finished Reading
Document Cleaning Started
Document Cleaning Ended


In [18]:
frequency_doc, doc_cleaned = frequency_calculate(doc_cleaned)

Total Number of distinct keywords in document after cleaning is 249


In [19]:
dictionary, doc_term_matrix = prepare_lda_prerequisite(doc_cleaned)

In [20]:
num_lda_topics =  find_lda_optimzal_number_topics(dictionary,doc_term_matrix,doc_cleaned)

Coherence of model with number of topic = 10 is 0.540295292393013
Coherence of model with number of topic = 11 is 0.540295292393013
Coherence of model with number of topic = 12 is 0.540295292393013
Coherence of model with number of topic = 13 is 0.5402952923930129
Coherence of model with number of topic = 14 is 0.5402952923930131
Coherence of model with number of topic = 15 is 0.5402952923930131
Coherence of model with number of topic = 16 is 0.5402952923930129
Coherence of model with number of topic = 17 is 0.5402952923930129
Coherence of model with number of topic = 18 is 0.5402952923930129
Coherence of model with number of topic = 19 is 0.540295292393013
Coherence of model with number of topic = 20 is 0.5402952923930131
Coherence of model with number of topic = 21 is 0.540295292393013
Coherence of model with number of topic = 22 is 0.540295292393013
Coherence of model with number of topic = 23 is 0.540295292393013
Coherence of model with number of topic = 24 is 0.5402952923930129
Co

In [21]:
ldamodel = run_lda(dictionary,doc_term_matrix,num_lda_topics)

In [22]:
%time
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.01 µs


In [None]:
# Topic 1: Politics (government, corruption, sell, buy)
# Topic 2: Politics (government, corruption, work, deal, sell, buy)
# Topic 3: Politics (government, corruption, work, deal, sell, buy, daily)
# Topic 4: Politics (government, corruption, work, deal, sell, buy, daily)
# Topic 5: Politics (government, corruption, work, deal, sell, buy, daily)
# Topic 6: Politics (government, corruption, work, deal, sell, buy, daily)
# Topic 7: Politics (government, corruption, work, deal, sell, buy, daily)
# Topic 8: Politics (government, corruption, work, deal, sell, buy, daily)
# Topic 9: Politics (government, corruption, work, deal, sell, buy, daily)
# Topic 10: Politics (government, corruption, work, deal, sell, buy, daily)
# Topic 11: Politics (government, corruption, work, deal, sell, buy, daily)
# Topic 12: Politics (government, corruption, work, deal, sell, buy, daily)
# Topic 13: Politics (government, corruption, work, deal, sell, buy, daily)
# Topic 14: Politics (government, corruption, work, deal, sell, buy, daily)


# <div class="alert alert-block alert-info"><font color='red'> Users with Anonymous profiles (Neutral Comments) </font> </div>

In [23]:
# read the document
doc_cleaned = read_document('null_neutralcomment.txt')

Reading Document null_neutralcomment.txt
Document Finished Reading
Document Cleaning Started
Document Cleaning Ended


In [24]:
frequency_doc, doc_cleaned = frequency_calculate(doc_cleaned)

Total Number of distinct keywords in document after cleaning is 4491


In [25]:
dictionary, doc_term_matrix = prepare_lda_prerequisite(doc_cleaned)

In [26]:
num_lda_topics =  find_lda_optimzal_number_topics(dictionary,doc_term_matrix,doc_cleaned)

Coherence of model with number of topic = 10 is 0.4544868092379448
Coherence of model with number of topic = 11 is 0.44635306165959227
Coherence of model with number of topic = 12 is 0.47514247857186814
Coherence of model with number of topic = 13 is 0.4709240796803525
Coherence of model with number of topic = 14 is 0.474509600752434
Coherence of model with number of topic = 15 is 0.46212186218754614
Coherence of model with number of topic = 16 is 0.49127535428535496
Coherence of model with number of topic = 17 is 0.46590443081381255
Coherence of model with number of topic = 18 is 0.4700213278171936
Coherence of model with number of topic = 19 is 0.47062810767834695
Coherence of model with number of topic = 20 is 0.48652990151947045
Coherence of model with number of topic = 21 is 0.4981823166847845
Coherence of model with number of topic = 22 is 0.4853341716269552
Coherence of model with number of topic = 23 is 0.4767173151471924
Coherence of model with number of topic = 24 is 0.480680

In [27]:
ldamodel = run_lda(dictionary,doc_term_matrix,num_lda_topics)

In [28]:
%time
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.2 µs


In [None]:
# Topic 1: Crime (case, hide, drama, sting operation)
# Topic 2: Technology (Update on whatsapp)
# Topic 3: Entertainment (Song, movie, act, actor)
# Topic 4: Crime (Criminal underground)
# Topic 5: Politics (democracy, election, leader)
# Topic 6: Politics (drama, party)
# Topic 7: Politics (congratulate, people, bill)
# Topic 8: Internal politics and border issues (blame, shame, pay)
# Topic 9: Crime (attack, terror, fool)
# Topic 10: Technology (email, comment, subscribe)
# Topic 11: Crime (court, order, police, rescue)
# Topic 12: Crime (cheat, hit, settle, violence)
# Topic 13: Communal (Ban on muslim organisations)
# Topic 14: Social (racism, slavery, secularism, freedom)
# Topic 15: Social (traitor, land, voter, economy)
# Topic 16: Politics (party, vote, mission)
# Topic 17: Politics (patriot, insult, fight, justice)
# Topic 18: Communal (religion, follow, peace)