# LDA model Bigrams and Trigrams
source: https://medium.com/analytics-vidhya/topic-modeling-using-gensim-lda-in-python-48eaa2344920
## import packages

In [31]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
# import pyLDAvis.gensim  # don't skip this
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
from spacy.lang import en
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])


%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

## Prepare stop words
# NLTK Stop words

In [44]:
# en_stop = set(nltk.corpus.stopwords.words('english'))
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')
stop_words.extend(['programme','accordance','article', 'state','member','this','annex','paragraph'])


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\salbo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Import Dataset

In [45]:
import pandas as pd
data = pd.read_csv("data/data1.csv")
features = data['title'] + " " + data['article']
# Convert to list
features = features.tolist()
features

['(EU 17 2017 establishment Union framework collection management use data fisheries sector Council Regulation (EC 1.With view (EU Regulation management biological, environmental, technical socioeconomic data fisheries sector Article Regulation (EU 1380/2013.2.The data paragraph shall collected obligation collect Union legal acts Regulation.3.For data necessary fisheries management Union legal acts Regulation rules transmission data , processing, management use data collected Regulation (EC (EC 223/2009.For purposes Regulation referred Article 4 Regulation (EU addition definitions sector activities related commercial fisheries recreational fisheries aquaculture industries processing fisheries products;(2)recreational fisheries non-commercial fishing activities marine biological resources recreation tourism sport;(3)marine region geographical area Article 4(2 area regional fisheries management organisations area implementing act Article 9(11);(4)primary data data associated individual v

## Tokenize words and Clean-up text

In [46]:
features = [re.sub('\S*@\S*\s?', '', sent) for sent in features]
# Remove new line characters
features = [re.sub('\s+', ' ', sent) for sent in features]
# Remove distracting single quotes
features = [re.sub("\'", "", sent) for sent in features]
features


['(EU 17 2017 establishment Union framework collection management use data fisheries sector Council Regulation (EC 1.With view (EU Regulation management biological, environmental, technical socioeconomic data fisheries sector Article Regulation (EU 1380/2013.2.The data paragraph shall collected obligation collect Union legal acts Regulation.3.For data necessary fisheries management Union legal acts Regulation rules transmission data , processing, management use data collected Regulation (EC (EC 223/2009.For purposes Regulation referred Article 4 Regulation (EU addition definitions sector activities related commercial fisheries recreational fisheries aquaculture industries processing fisheries products;(2)recreational fisheries non-commercial fishing activities marine biological resources recreation tourism sport;(3)marine region geographical area Article 4(2 area regional fisheries management organisations area implementing act Article 9(11);(4)primary data data associated individual v

Use gensims simple_preprocess(), set deacc=True to remove punctuations.

In [47]:
def sent_to_words(sentences):
  for sentence in sentences:
    yield gensim.utils.simple_preprocess(str(sentence), deacc=True)  #deacc=True removes punctuations
data_words = list(sent_to_words(features))
print(data_words[:1])

[['eu', 'establishment', 'union', 'framework', 'collection', 'management', 'use', 'data', 'fisheries', 'sector', 'council', 'regulation', 'ec', 'with', 'view', 'eu', 'regulation', 'management', 'biological', 'environmental', 'technical', 'socioeconomic', 'data', 'fisheries', 'sector', 'article', 'regulation', 'eu', 'the', 'data', 'paragraph', 'shall', 'collected', 'obligation', 'collect', 'union', 'legal', 'acts', 'regulation', 'for', 'data', 'necessary', 'fisheries', 'management', 'union', 'legal', 'acts', 'regulation', 'rules', 'transmission', 'data', 'processing', 'management', 'use', 'data', 'collected', 'regulation', 'ec', 'ec', 'for', 'purposes', 'regulation', 'referred', 'article', 'regulation', 'eu', 'addition', 'definitions', 'sector', 'activities', 'related', 'commercial', 'fisheries', 'recreational', 'fisheries', 'aquaculture', 'industries', 'processing', 'fisheries', 'products', 'recreational', 'fisheries', 'non', 'commercial', 'fishing', 'activities', 'marine', 'biological

# Creating Bigram and Trigram models
The 2 arguments for Phrases are min_count and threshold. The higher the values of these parameters , the harder its for a word to be combined to bigram.

In [48]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['eu', 'establishment', 'union', 'framework', 'collection', 'management', 'use', 'data', 'fisheries', 'sector', 'council', 'regulation', 'ec', 'with', 'view', 'eu', 'regulation', 'management', 'biological', 'environmental', 'technical', 'socioeconomic', 'data', 'fisheries', 'sector', 'article', 'regulation', 'eu', 'the', 'data', 'paragraph', 'shall', 'collected', 'obligation', 'collect', 'union', 'legal', 'acts', 'regulation', 'for', 'data', 'necessary', 'fisheries', 'management', 'union', 'legal', 'acts', 'regulation', 'rules', 'transmission', 'data', 'processing', 'management', 'use', 'data', 'collected', 'regulation', 'ec', 'ec', 'for', 'purposes', 'regulation', 'referred', 'article', 'regulation', 'eu', 'addition', 'definitions', 'sector', 'activities', 'related', 'commercial', 'fisheries', 'recreational', 'fisheries', 'aquaculture', 'industries', 'processing', 'fisheries', 'products', 'recreational', 'fisheries', 'non', 'commercial', 'fishing', 'activities', 'marine', 'biological'

# Remove Stopwords, make bigrams and lemmatize

In [49]:
# Define function for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

Call the functions in order

In [51]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized)



# Create Dictionary and Corpus needed for Topic Modeling
Make sure to check if dictionary[id2word] or corpus is clean otherwise you may not get good quality topics.

In [52]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)
# Create Corpus
texts = data_lemmatized
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
# View
print(corpus)
# id2word[4] # want to see the word correspond to the id

[[(0, 3), (1, 1), (2, 5), (3, 1), (4, 4), (5, 2), (6, 7), (7, 2), (8, 3), (9, 3), (10, 2), (11, 3), (12, 2), (13, 6), (14, 3), (15, 3), (16, 2), (17, 1), (18, 1), (19, 1), (20, 3), (21, 1), (22, 3), (23, 1), (24, 1), (25, 6), (26, 2), (27, 5), (28, 5), (29, 3), (30, 2), (31, 4), (32, 1), (33, 1), (34, 1), (35, 1), (36, 3), (37, 5), (38, 5), (39, 1), (40, 1), (41, 2), (42, 2), (43, 10), (44, 1), (45, 4), (46, 8), (47, 1), (48, 2), (49, 1), (50, 1), (51, 5), (52, 12), (53, 2), (54, 4), (55, 6), (56, 3), (57, 2), (58, 1), (59, 1), (60, 1), (61, 4), (62, 1), (63, 3), (64, 1), (65, 1), (66, 1), (67, 1), (68, 2), (69, 1), (70, 3), (71, 1), (72, 3), (73, 5), (74, 5), (75, 7), (76, 1), (77, 4), (78, 5), (79, 5), (80, 2), (81, 1), (82, 5), (83, 1), (84, 19), (85, 4), (86, 3), (87, 84), (88, 2), (89, 1), (90, 5), (91, 1), (92, 1), (93, 1), (94, 2), (95, 1), (96, 2), (97, 1), (98, 6), (99, 5), (100, 1), (101, 1), (102, 1), (103, 1), (104, 1), (105, 1), (106, 1), (107, 2), (108, 2), (109, 4), (110

- Gensim creates unique id for each word in the document. Its mapping of word_id and word_frequency. Example: (8,2) above indicates, word_id 8 occurs twice in the document and so on.
- This is used as input to LDA model.
Readable format of corpus can be obtained by executing below code block.

In [56]:
print([[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]])

[[('access', 3), ('accidental', 1), ('account', 5), ('achieve', 1), ('act', 4), ('action', 2), ('activity', 7), ('addition', 2), ('additional', 3), ('adequate', 3), ('adopt', 2), ('advice', 3), ('advisory', 2), ('aggregate', 6), ('aggregated', 3), ('agreement', 3), ('allowable', 2), ('alternative', 1), ('amendment', 1), ('analytic', 1), ('annual', 3), ('anonymity', 1), ('applicable', 3), ('application', 1), ('approach', 1), ('appropriate', 6), ('approval', 2), ('aquaculture', 5), ('area', 5), ('arrangement', 3), ('article', 2), ('assess', 4), ('assessment', 1), ('associate', 1), ('assurance', 1), ('attendance', 1), ('available', 3), ('base', 5), ('basis', 5), ('benefit', 1), ('bilateral', 1), ('biological', 2), ('board', 2), ('body', 10), ('business', 1), ('case', 4), ('catch', 8), ('character', 1), ('charge', 2), ('check', 1), ('code', 1), ('collect', 5), ('collection', 12), ('collector', 2), ('commercial', 4), ('commission', 6), ('common_fisherie', 3), ('compatible', 2), ('completene

# Building topic model
### Parameters of LDA
- Alpha and Beta are Hyperparameters — alpha represents document-topic density and Beta represents topic-word density, chunksize is the number of documents to be used in each training chunk, update_every determines how often the model parameters should be updated and passes is the total number of training passes.
- A measure for best number of topics really depends on kind of corpus you are using, the size of corpus, number of topics you expect to see.

In [54]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

# lda_model.save('model10_bigram.gensim')
# topics = lda_model.print_topics(num_words=4)

# View topics in LDA model
- Each topic is combination of keywords and each keyword contributes a certain weightage to the topic.
- You can see keywords for each topic and weightage of each keyword using lda_model.print_topics().

In [55]:
# Print the keyword of topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.001*"fund" + 0.001*"operation" + 0.001*"financial" + '
  '0.001*"implementation" + 0.001*"regulation" + 0.001*"state" + '
  '0.001*"operational" + 0.001*"point" + 0.001*"fishing" + 0.001*"specific"'),
 (1,
  '0.031*"state" + 0.023*"inspection" + 0.023*"control" + 0.022*"agency" + '
  '0.020*"regulation" + 0.017*"plan" + 0.016*"joint_deployment" + '
  '0.012*"activity" + 0.011*"necessary" + 0.010*"implementation"'),
 (2,
  '0.010*"datum" + 0.004*"state" + 0.004*"scientific" + 0.004*"regulation" + '
  '0.003*"management" + 0.003*"regional" + 0.003*"relevant" + '
  '0.003*"information" + 0.002*"fishery" + 0.002*"national"'),
 (3,
  '0.003*"fund" + 0.003*"financial" + 0.002*"specific" + '
  '0.002*"implementation" + 0.002*"operation" + 0.002*"operational" + '
  '0.002*"support" + 0.002*"regulation" + 0.002*"account" + 0.001*"authority"'),
 (4,
  '0.029*"fund" + 0.021*"financial" + 0.015*"operation" + 0.014*"specific" + '
  '0.013*"implementation" + 0.012*"programme" + 0.012*"regu

Topics are words with highest probability in topic and the numbers are the probabilities of words appearing in topic distribution.

# Evaluate topic models
### Compute model Perplexity and Coherence score
Coherence score and perplexity provide a convinent way to measure how good a given topic model is.

In [57]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))
# a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -6.351873562059354

Coherence Score:  0.42646375796247743


# Visualize the topic model

In [58]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)
vis