In [37]:
import nltk
nltk.download('stopwords')

import re
import numpy as np
import pandas as  pd
from pprint import pprint# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel# spaCy for preprocessing
import spacy# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

[nltk_data] Downloading package stopwords to /home/srijan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [38]:
# NLTK English stop words, extended with a few extra tokens to filter out.
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

# Load the corpus from constitution_of_india_article_32.txt.
# The encoding is passed explicitly so the read does not depend on the
# platform's locale default (assumes the file is UTF-8 -- TODO confirm).
with open('constitution_of_india_article_32.txt', 'r', encoding='utf-8') as file:
    # One list entry per line of the file. Blank lines are kept as empty
    # strings, so they become empty documents further down the pipeline.
    data = file.read().split('\n')

def sent_to_words(sentences):
    """Lazily tokenize each sentence with gensim's ``simple_preprocess``.

    Every item of ``sentences`` is converted with ``str()`` and tokenized with
    ``deacc=True``, which strips accent marks from the tokens.

    Yields:
        One list of tokens per input sentence, in input order.
    """
    tokenize = gensim.utils.simple_preprocess
    for raw_sentence in sentences:
        tokens = tokenize(str(raw_sentence), deacc=True)
        yield tokens
# Tokenize every line of the corpus up front.
data_words = [tokens for tokens in sent_to_words(data)]

# Phrase detection: a bigram model over the tokenized lines, then a second
# model trained on the bigram-transformed lines.
# A higher threshold yields fewer phrases.
bigram = gensim.models.Phrases(
    data_words,
    min_count=5,
    threshold=100,
)
trigram = gensim.models.Phrases(
    bigram[data_words],
    threshold=100,
)

# Phraser versions of both models: a faster way to turn a tokenized sentence
# into its bigram / trigram form.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [39]:
def remove_stopwords(texts):
    """Tokenize each document and drop every token listed in ``stop_words``.

    Args:
        texts: Iterable of documents. Each one is converted with ``str()`` and
            tokenized with ``simple_preprocess`` before filtering.

    Returns:
        A list holding one list of kept tokens per input document.
    """
    # Snapshot the module-level list as a set: membership tests become O(1)
    # per token instead of a linear scan over the whole stop-word list.
    blocked = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` phrase model to every document.

    Returns:
        A list with one transformed document per entry of ``texts``.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document.

    Returns:
        A list with one transformed document per entry of ``texts``.
    """
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document's tokens are joined with single spaces and run through the
    module-level spaCy pipeline ``nlp``. Only tokens whose part-of-speech tag
    (``token.pos_``) is in ``allowed_postags`` are kept, and each kept token
    is replaced by its lemma (``token.lemma_``).

    Args:
        texts: Iterable of documents, each an iterable of string tokens.
        allowed_postags: Part-of-speech tags to keep. The default is an
            immutable tuple so it cannot be mutated between calls.

    Returns:
        A list with one list of lemmas per input document, in input order.
    """
    joined_docs = (" ".join(sent) for sent in texts)
    # nlp.pipe streams the texts through the pipeline in batches and yields
    # the parsed documents in input order, avoiding one full call per text.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs)
    ]

# Load the spaCy 'en_core_web_sm' pipeline with the parser and NER components
# switched off. If the model is missing, install it with:
#   python3 -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Filter stop words first, then merge bigrams on the filtered tokens.
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)

# Lemmatize, keeping only nouns, adjectives, verbs and adverbs.
data_lemmatized = lemmatization(
    data_words_bigrams,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)

# Show the first lemmatized document as a sanity check.
print(data_lemmatized[0])

['author', 'case', 'involve', 'question', 'relate', 'basic', 'human', 'right', 'question', 'believe', 'multiplicity', 'view', 'give', 'approach', 'member', 'court', 'disadvantage', 'clarifie', 'infrequently', 'differ', 'approach', 'enable', 'interested', 'appreciate', 'well', 'significance', 'general', 'agreement', 'learn', 'endeavour', 'confine', 'observation', 'indication', 'approach', 'matter', 'consideration', 'seem', 'particularly', 'necessary', 'learn', 'brother', 'also', 'give', 'benefit', 'separate', 'opinion', 'somewhat', 'different', 'approach', 'advantage', 'go', 'opinion', 'learn', 'brother', 'seem', 'little', 'doubt', 'right', 'travel', 'go', 'country', 'order', 'regulate', 'issue', 'suspension', 'impound', 'cancellation', 'passport', 'directly', 'affect', 'include', 'right', 'personal_liberty', 'strength', 'decision', 'court', 'give', 'wide', 'ambit', 'right', 'personal_liberty', 'see', 'sawhney', 'officer', 'government', 'state', 'crilj', 'article', 'constitution', 'read

In [40]:
# Build the gensim Dictionary (id2word) from the lemmatized documents.
id2word = corpora.Dictionary(data_lemmatized)

# The documents that are converted into the corpus.
texts = data_lemmatized

# Bag-of-words form of every document, produced by Dictionary.doc2bow.
corpus = list(map(id2word.doc2bow, texts))

# Peek at the first document's bag-of-words vector.
print(corpus[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 4), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 2), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 2), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 3), (36, 2), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 3), (48, 2), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 2), (57, 1), (58, 1), (59, 1), (60, 1), (61, 3), (62, 1), (63, 1), (64, 2), (65, 1), (66, 1), (67, 1), (68, 4), (69, 1), (70, 1), (71, 2), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1)]


In [41]:
# Train a 20-topic LDA model on the bag-of-words corpus.
# random_state is fixed so repeated runs give the same topics.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=20,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha='auto',
                                            per_word_topics=True)

# Compute the topic summary once and reuse it for display and for the file.
lda_topics = lda_model.print_topics()
pprint(lda_topics)
doc_lda = lda_model[corpus]

# Persist the topic summary as the string form of the list of tuples.
with open('topics.txt', 'w') as file:
    file.write(str(lda_topics))

# log_perplexity() returns the per-word likelihood bound, not the perplexity
# itself. The bound is better when higher (closer to zero); the perplexity
# derived from it, 2 ** (-bound), is better when lower.
per_word_bound = lda_model.log_perplexity(corpus)
print('\nPer-word likelihood bound: ', per_word_bound)
print('Perplexity: ', 2 ** (-per_word_bound))

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

# Prepare the interactive topic visualization and save it as HTML.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
pyLDAvis.save_html(vis, 'topics.html')

# Must be the last expression of the cell: a bare name in the middle of a
# cell is evaluated and discarded, so the visualization would not render.
vis


[(0,
  '0.072*"judge" + 0.042*"court" + 0.041*"high" + 0.034*"article" + '
  '0.025*"appointment" + 0.019*"transfer" + 0.017*"office" + 0.015*"hold" + '
  '0.014*"case" + 0.013*"age"'),
 (1,
  '0.057*"bank" + 0.030*"rbi" + 0.020*"currency" + 0.020*"financial" + '
  '0.017*"risk" + 0.016*"payment" + 0.016*"banking" + '
  '0.016*"virtual_currencie" + 0.014*"report" + 0.012*"issue"'),
 (2,
  '0.022*"mining" + 0.019*"area" + 0.012*"lease" + 0.010*"state" + '
  '0.009*"report" + 0.008*"also" + 0.008*"forest" + 0.008*"environment" + '
  '0.007*"mine" + 0.007*"government"'),
 (3,
  '0.023*"appellant" + 0.021*"court" + 0.016*"date" + 0.013*"service" + '
  '0.012*"judgment" + 0.011*"rule" + 0.010*"post" + 0.009*"hold" + '
  '0.009*"member" + 0.008*"licence"'),
 (4,
  '0.067*"notification" + 0.056*"coastal" + 0.025*"main" + 0.023*"plan" + '
  '0.019*"construction" + 0.016*"environment" + 0.015*"crz" + '
  '0.014*"management" + 0.014*"pipeline" + 0.014*"regulation"'),
 (5,
  '0.000*"court" + 0.00

In [42]:
import re

# Read the saved topic dump, expected in the form
# [(0, '0.072*"judge" + 0.042*"court" + ...'), (1, '...'), ...].
# NOTE(review): the training cell writes 'topics.txt', but this cell reads
# 'topics_14.txt' -- confirm which file is intended.
with open('topics_14.txt', 'r') as file:
    content = file.read()

# Match every "(id, 'terms')" tuple. No trailing comma is required, because
# the last tuple in the list is followed by "]" and would otherwise be dropped.
pattern = r"\((\d+), '(.+?)'\)"

# Find all matches in the content
matches = re.findall(pattern, content)

# Print each topic as a heading followed by one "- term (weight)" line per term.
for topic_id, topic_terms in matches:
    print(f"Topic {topic_id}:")

    # Each entry looks like 0.072*"judge": the weight comes first, then the term.
    for entry in topic_terms.split(' + '):
        weight, _, quoted_term = entry.partition('*')
        term = quoted_term.strip('"')
        print(f"- {term} ({weight})")

    # Add an empty line between topics for clarity
    print()


Topic 0:
- "judge" (0.072)
- "court" (0.042)
- "high" (0.041)
- "article" (0.034)
- "appointment" (0.025)
- "transfer" (0.019)
- "office" (0.017)
- "hold" (0.015)
- "case" (0.014)
- "age" (0.013)

Topic 1:
- "bank" (0.057)
- "rbi" (0.030)
- "currency" (0.020)
- "financial" (0.020)
- "risk" (0.017)
- "payment" (0.016)
- "banking" (0.016)
- "virtual_currencie" (0.016)
- "report" (0.014)
- "issue" (0.012)

Topic 2:
- "mining" (0.022)
- "area" (0.019)
- "lease" (0.012)
- "state" (0.010)
- "report" (0.009)
- "also" (0.008)
- "forest" (0.008)
- "environment" (0.008)
- "mine" (0.007)
- "government" (0.007)

Topic 3:
- "appellant" (0.023)
- "court" (0.021)
- "date" (0.016)
- "service" (0.013)
- "judgment" (0.012)
- "rule" (0.011)
- "post" (0.010)
- "hold" (0.009)
- "member" (0.009)
- "licence" (0.008)

Topic 4:
- "notification" (0.067)
- "coastal" (0.056)
- "main" (0.025)
- "plan" (0.023)
- "construction" (0.019)
- "environment" (0.016)
- "crz" (0.015)
- "management" (0.014)
- "pipeline" (0.01