In [None]:
# Standard library
import html
import pickle
from pprint import pprint

# Third-party
import pandas as pd
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
# NOTE(review): wildcard import kept for backward compatibility; it stays ahead
# of the imports below so that they still take precedence on any name clash.
from nltk.stem.porter import *
import numpy as np
import nltk
from gensim import corpora, models, utils, parsing
import pyLDAvis.gensim

# We will perform the following steps (Probably):

*   **Tokenization**: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
Words that have fewer than 3 characters are removed.
All stopwords are removed.
*   Words are **Lemmatized** — words in third person are changed to first person and verbs in past and future tenses are changed into present.
*   Words are **stemmed** — words are reduced to their root form.



In [None]:
# Load the pre-tokenised tweets; every row has a `user` record and a `tokens` list.
df = pd.read_json('proctoken.json')

# Extract the `verified` flag from each user record, then keep verified authors only.
df['verified'] = df['user'].apply(lambda user: user['verified'])
df = df[df['verified'] == True]

# The token lists of the remaining rows are the documents used for topic modelling.
preprocessed_texts = df['tokens']
preprocessed_texts

27       [discovery, wa, made, audit, custom, border, p...
51       [two, outbreak, covid, tied, migrant, agricult...
53       [tigertail, beautifully, acted, family, story,...
57                   [context, diasporic, migrant, writer]
64       [federal, judge, washington, halted, deportati...
                               ...                        
30840    [icymi, supremecourt, hearing, relation, migra...
30859    [depth, gulf, recession, reverberates, across,...
30862    [500, migrant, worker, jerusalem, college, nee...
30892    [since, holocaust, germany, ha, designated, po...
30939    [even, coronavirus, pandemic, hit, migrant, se...
Name: tokens, Length: 2666, dtype: object

*  Create a dictionary from `preprocessed_texts` that maps every unique token to an integer id and records in how many documents each token appears.
*  Filter out tokens that appear in
fewer than 15 documents (an absolute number) or
in more than 50% of the documents (a fraction of the total corpus size, not an absolute number).

In [None]:
# Build the token <-> id mapping from the token lists and show its size.
dictionary = corpora.Dictionary(preprocessed_texts)
print(dictionary)

# Prune the vocabulary in place, then show the reduced size:
#   no_below=15 -> keep tokens that occur in at least 15 documents
#   no_above=0.5 -> keep tokens that occur in at most half of all documents
dictionary.filter_extremes(
    no_below=15,
    no_above=0.5,
)
print(dictionary)

Dictionary(7024 unique tokens: ['adult', 'audit', 'border', 'child', 'crossing']...)
Dictionary(540 unique tokens: ['border', 'child', 'family', 'immigration', 'last']...)


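For each document we build a bag-of-words representation: a list of (token id, count) pairs reporting which dictionary
words appear in the document and how many times each one appears. We save this to `bow_corpus`, so that any individual document can be inspected afterwards.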

In [None]:
bow_corpus = [dictionary.doc2bow(doc) for doc in preprocessed_texts] #Convert document (a list of words) into the 
                                                                     #bag-of-words format = list of (token_id, token_count) 2-tuples.

In [None]:

# Fit a TF-IDF model on the bag-of-words corpus. TF-IDF re-weights every raw
# count so that tokens which are frequent in one tweet but rare across the
# corpus receive the highest scores. With gensim's default settings the weight
# is term_frequency * log2(N / document_frequency), after which each document
# vector is scaled to unit (L2) length -- it is NOT divided by the number of
# words in the document.
tfidf = models.TfidfModel(bow_corpus)

# Apply the fitted model to the corpus: each document becomes a list of
# (token_id, tfidf_weight) pairs.
corpus_tfidf = tfidf[bow_corpus]

# Print the weights of the first tweet only, as a sanity check.
# (`pprint` is imported in the notebook's import cell.)
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.45944052799343177),
 (1, 0.4528761063656294),
 (2, 0.20205430303651406),
 (3, 0.3080306863511835),
 (4, 0.2539276043298826),
 (5, 0.2617141346191883),
 (6, 0.24481651450916317),
 (7, 0.2617141346191883),
 (8, 0.31708863159146217),
 (9, 0.18960044914556212),
 (10, 0.2240810729933617)]


 Topic modeling is a type of statistical modeling for discovering the abstract “topics” that occur in a collection of documents.
 
Latent Dirichlet Allocation (LDA) builds a topic-per-document model and a words-per-topic model, both modeled as Dirichlet distributions.

Reference: https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24


In [None]:
# Train LDA on the TF-IDF-weighted corpus.
# (An earlier bag-of-words experiment used the same call with `bow_corpus`,
# iterations=100, passes=2 and workers=2.)
#
# random_state seeds the model's random initialisation so that the topics
# printed below can be reproduced after Restart & Run All.
# NOTE(review): with several workers, small run-to-run differences may still
# remain -- confirm against the gensim documentation if exact repeatability
# is required.
lda_model_tfidf = models.LdaMulticore(
    corpus_tfidf,
    num_topics=10,
    id2word=dictionary,
    passes=5,
    iterations=5000,
    workers=4,
    random_state=42,
)

In [None]:
# Top words of every topic found by the LDA model trained on the TF-IDF corpus.
# With formatted=False each topic is a sequence of entries whose first item is the word.

for idx, topic in lda_model_tfidf.show_topics(formatted=False):
    top_words = '|'.join(entry[0] for entry in topic)
    print('Topic: {} Words: {}'.format(idx, top_words))

Topic: 0 Words: labourer|life|stuck|made|matter|part|muslim|public|state|india
Topic: 1 Words: sonu|sood|food|camp|stranded|home|government|ha|family|city
Topic: 2 Words: call|help|need|system|racism|refugee|social|covid|ha|india
Topic: 3 Words: day|court|lockdown|order|state|supreme|india|case|thank|home
Topic: 4 Words: dead|tunisia|exodus|india|crisis|govt|court|long|detention|child
Topic: 5 Words: report|border|child|covid|woman|year|pandemic|via|farm|first
Topic: 6 Words: special|covid|shramik|train|railway|rise|self|story|mgnrega|chinese
Topic: 7 Words: home|state|back|day|within|court|train|send|money|native
Topic: 8 Words: bengal|home|crisis|express|west|sonusood|corona|minister|actor|return
Topic: 9 Words: covid|bjp|singapore|pandemic|working|wage|pay|may|even|ha


In [None]:
# Compute perplexity.
# log_perplexity() does not return the perplexity itself: it returns the
# per-word likelihood bound (a negative number, closer to zero is better).
# Gensim defines perplexity as 2 ** (-bound); for that value, lower is better.
per_word_bound = lda_model_tfidf.log_perplexity(corpus_tfidf)
print('\nPer-word likelihood bound: ', per_word_bound)
print('Perplexity: ', 2 ** (-per_word_bound))

# Compute the c_v coherence score of the topics against the tokenised texts.
# Higher values generally indicate more interpretable topics.
coherence_model_lda = models.CoherenceModel(
    model=lda_model_tfidf,
    texts=preprocessed_texts,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.642611735398933

Coherence Score:  0.37405340987409497


In [None]:
# Interactive topic visualisation with pyLDAvis.
# enable_notebook() is called first so the visualisation can be shown inside the notebook.
pyLDAvis.enable_notebook()
# Prepare the visualisation data from the trained model, the TF-IDF corpus it
# was trained on, and the dictionary that maps token ids back to words.
vis = pyLDAvis.gensim.prepare(lda_model_tfidf, corpus_tfidf, dictionary)
# Render the prepared visualisation as this cell's output.
pyLDAvis.display(vis)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
