In [None]:
# Vishak Baburaj
# Topic Modeling
# Reasons why the students want the exams to be cancelled

In [None]:
import pandas as pd

In [None]:
# Raw survey export. NOTE(review): this is an absolute path on the author's
# machine; point DATA_PATH at your own copy of the CSV before running.
DATA_PATH = r"C:\Users\visha\OneDrive\Desktop\python\XII Board Exams-2021.csv"
data = pd.read_csv(DATA_PATH)

In [None]:
# Map each verbose survey question to a short snake_case column name.
new_header = {
    'Timestamp': 'timestamp',
    'Which board you are part of?': 'syllabus',
    'How do you feel when you are not aware of the status of your board exams? You may select more than one option given below:': 'emotion',
    'What is your opinion about Class XII board exams?': 'opinion',
    'State the reasons for why do you want the exams to be cancelled? (Safety, Status of mind, Future plans, Health etc.)': 'reason_cancelling',
    'State the reasons for why do you want the exams to be conducted on a later date? (Safety, Status of mind, Future plans, Entrance Exams, Health etc.)': 'reason_conducting_late',
    'Name the state from which you will be giving your XII Board exam(eg: Karnataka)': 'state',
}
# rename() returns a new frame, so the raw `data` frame is left untouched.
datanew = data.rename(columns=new_header)
datanew.head()

In [None]:
# Keep only the renamed survey columns, in this order (the timestamp is dropped).
column = [
    'syllabus',
    'emotion',
    'opinion',
    'reason_cancelling',
    'reason_conducting_late',
    'state',
]
datanew = datanew[column]
datanew.head()

In [None]:
# Discard the first two rows by position.
# NOTE(review): presumably these are test/placeholder responses - confirm.
# Re-running this cell drops two more rows each time, so run it only once.
datanew = datanew.iloc[2:, :]
datanew.head()

# Removing Null Values

In [None]:
# Keep only the responses that actually contain a cancellation reason.
# The explicit .copy() detaches the result from the earlier column/row slices,
# so adding new columns later does not raise SettingWithCopyWarning or risk
# writing to a temporary view.
datanew = datanew.dropna(subset=["reason_cancelling"]).copy()
datanew['reason_cancelling'].head(10)

In [None]:
# Number of responses left after removing rows with no cancellation reason.
print(datanew['reason_cancelling'].size)

# Converting to Lowercase

In [None]:
# Start the working column from a lower-cased copy of the raw answers;
# the original 'reason_cancelling' column is left as-is.
lowercased_reasons = datanew['reason_cancelling'].str.lower()
datanew['reason_cancelling_preprocess'] = lowercased_reasons
datanew['reason_cancelling_preprocess'].head()

# Removing Punctuation and Special Characters

In [None]:
import re
def remove_special_characters(sentence,punctuation=False):
    """Replace unwanted characters in ``sentence`` with single spaces.

    Leading and trailing whitespace is stripped first. Each removed character
    is replaced by one space, so runs of spaces may remain in the result.

    Parameters
    ----------
    sentence : str
        Text to clean.
    punctuation : bool, default False
        If False, every character that is not an ASCII letter, an ASCII digit
        or a space is replaced. If True, only the ASCII punctuation characters
        listed in ``string.punctuation`` are replaced and everything else
        (including non-ASCII letters) is kept.

    Returns
    -------
    str
        The cleaned text.
    """
    # Local import: the notebook never imports `string` at the top level, which
    # made the punctuation=True branch fail with a NameError.
    import string

    sentence = sentence.strip()
    if punctuation:
        # string.punctuation is a plain list of characters, not a regex; wrap it
        # in an escaped character class so each punctuation mark matches literally.
        pattern = '[' + re.escape(string.punctuation) + ']'
    else:
        pattern = r'[^a-zA-Z0-9 ]'
    return re.sub(pattern, r' ', sentence)

In [None]:
# Replace every character other than ASCII letters, digits and spaces with a
# space in each response (default mode of remove_special_characters).
datanew['reason_cancelling_preprocess'] = datanew['reason_cancelling_preprocess'].map(remove_special_characters)
datanew['reason_cancelling_preprocess']

# Removing Multiple Whitespaces

In [None]:
# Collapse every run of whitespace into a single space.
# The pattern is a raw string: '\s' in a normal string literal is an invalid
# escape sequence and triggers a SyntaxWarning on recent Python versions.
datanew['reason_cancelling_preprocess'] = datanew['reason_cancelling_preprocess'].replace(r'\s+', ' ', regex=True)
datanew['reason_cancelling_preprocess']

# Removing Numbers

In [None]:
# Strip every run of digits from the responses.
# regex=True is required: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, so '\d+' would never match and the
# numbers would silently stay in the text. The raw string avoids the invalid
# '\d' escape warning.
datanew['reason_cancelling_preprocess'] = datanew['reason_cancelling_preprocess'].str.replace(r'\d+', '', regex=True)
datanew['reason_cancelling_preprocess']

# Correcting Spelling Mistakes

In [None]:
from autocorrect import Speller
# English spell checker; calling `spell(text)` returns the corrected text.
spell = Speller('en')

In [None]:
# Spell-correct each response word by word, then re-join the words with single
# spaces (splitting on whitespace also drops any leftover repeated spaces).
datanew['reason_cancelling_preprocess'] = datanew['reason_cancelling_preprocess'].apply(
    lambda response: " ".join(map(spell, response.split()))
)
datanew['reason_cancelling_preprocess']

# Lemmatization

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
# Two WordNet lemmatizer instances are created; `lemmatizer` is the one used
# by lemmatize_sentence below.
# NOTE(review): `wordnet_lemmatizer` is not used in the cells shown here -
# confirm whether it is still needed.
lemmatizer = WordNetLemmatizer()
wordnet_lemmatizer = WordNetLemmatizer()


def nltk_tag_to_wordnet_tag(nltk_tag):
    """Translate a POS tag from nltk.pos_tag into a WordNet POS constant.

    The mapping is decided by the first letter of the tag:
    'J' -> wordnet.ADJ, 'V' -> wordnet.VERB, 'N' -> wordnet.NOUN,
    'R' -> wordnet.ADV. Any other tag yields None.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if nltk_tag.startswith(prefix):
            return wordnet_pos
    # No WordNet equivalent for this tag.
    return None

def lemmatize_sentence(sentence):
    """Return ``sentence`` with each token replaced by its WordNet lemma.

    The sentence is tokenized with nltk.word_tokenize and tagged with
    nltk.pos_tag. Tokens whose tag maps to a WordNet part of speech (see
    nltk_tag_to_wordnet_tag) are lemmatized with the module-level
    ``lemmatizer``; all other tokens are kept unchanged. The resulting tokens
    are joined with single spaces.
    """
    tokens = nltk.word_tokenize(sentence)
    lemmas = []
    for token, pos_tag in nltk.pos_tag(tokens):
        wordnet_pos = nltk_tag_to_wordnet_tag(pos_tag)
        if wordnet_pos is not None:
            # A usable POS is known, so lemmatize with it.
            lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
        else:
            # No WordNet POS for this tag: keep the token as it is.
            lemmas.append(token)
    return " ".join(lemmas)



# Lemmatize every response (POS-aware, see lemmatize_sentence).
datanew['reason_cancelling_preprocess'] = datanew['reason_cancelling_preprocess'].apply(lemmatize_sentence)
datanew['reason_cancelling_preprocess']

# Removing Stopwords

In [None]:
from nltk.corpus import stopwords
# NLTK's English stop words plus survey-specific words appended to them.
# NOTE(review): 'st' / 'th' are presumably ordinal suffixes left behind once
# the digits are stripped (e.g. "12th" -> "th") - confirm.
STOPWORDS = stopwords.words('english')
STOPWORDS += [
    'covid', 'important', 'due', 'get', 'also', 'us', 'many', 'go', 'take',
    'please', 'able', 'well', 'exam', 'cancel', 'board', 'want', 'exams',
    'can', 'would', 'much', 'corona', 'student', 'conduct', 'mark', 'need',
    'pandemic', 'corona', 'would', 'exam', 'because', 'next', 'etc', 'pu',
    'st', 'th',
]
def remove_stopwords(text):
    """Return ``text`` without the words listed in the module-level STOPWORDS.

    ``text`` is converted with str() and split on whitespace; the surviving
    words are joined back together with single spaces. Matching is exact and
    case-sensitive.
    """
    kept_words = []
    for token in str(text).split():
        if token in STOPWORDS:
            continue
        kept_words.append(token)
    return " ".join(kept_words)

In [None]:
# Drop stop words from every response, then print the cleaned column.
datanew['reason_cancelling_preprocess'] = datanew['reason_cancelling_preprocess'].apply(remove_stopwords)
print(datanew['reason_cancelling_preprocess'])

# Word Tokenization

In [None]:
import gensim
from gensim.utils import simple_preprocess

In [None]:
def sent_to_words(sentences):
    """Lazily yield one token list per item of ``sentences``.

    Each item is converted with str() and tokenized by
    gensim.utils.simple_preprocess with deacc=True.
    """
    yield from (
        gensim.utils.simple_preprocess(str(document), deacc=True)
        for document in sentences
    )

# Plain Python list of the cleaned responses, then one token list per response.
data1 = datanew['reason_cancelling_preprocess'].tolist()
data_words = [tokens for tokens in sent_to_words(data1)]

In [None]:
# Sanity check: number of tokenized documents and the container type.
print(len(data_words), type(data_words), sep='\n')

# Topic Modeling

# Creating dictionary and corpus

In [None]:
import gensim.corpora as corpora

# Build the token <-> id mapping from the tokenized responses and print the
# vocabulary size before and after pruning.
id2word = corpora.Dictionary(documents=data_words)
print(len(id2word))
# Prune tokens found in fewer than 15 documents or in more than 50% of them.
id2word.filter_extremes(no_below=15, no_above=0.5)
print(len(id2word))

In [None]:
# Bag-of-words corpus: one doc2bow() result per tokenized response, built with
# the pruned dictionary.
texts = data_words
corpus = list(map(id2word.doc2bow, texts))

In [None]:
# Report the size of the vocabulary and of the corpus.
for message, count in (
    ('Number of unique tokens: %d', len(id2word)),
    ('Number of documents: %d', len(corpus)),
):
    print(message % count)

# TFIDF

In [None]:
from gensim import models
from pprint import pprint
# Fit TF-IDF on the bag-of-words corpus, then wrap the corpus with the fitted
# model to obtain its TF-IDF-weighted version.
tfidf = models.TfidfModel(corpus=corpus)
corpus_tfidf = tfidf[corpus]

# LDA MODEL TFIDF

In [None]:
# Training hyper-parameters for the LDA model fitted on the TF-IDF corpus.
num_topics = 2
# NOTE(review): presumably chosen to be at least the number of documents so the
# whole corpus is processed as one chunk per pass - confirm.
chunksize = 30137
passes = 50
iterations = 1000
eval_every = None  # no periodic perplexity evaluation during training
# LDA training is stochastic; a fixed seed makes the topics, perplexity and
# coherence reproducible across Restart & Run All.
random_state = 42
lda_model_tfidf = gensim.models.ldamodel.LdaModel(corpus=corpus_tfidf,
                                            id2word=id2word,
                                            chunksize=chunksize,
                                            alpha='auto',
                                            eta='auto',
                                            iterations=iterations,
                                            num_topics=num_topics,
                                            passes=passes,
                                            eval_every=eval_every,
                                            random_state=random_state,
                                            per_word_topics=True)

# Show the learned topics, then wrap the corpus with the trained model to get
# per-document topic output.
pprint(lda_model_tfidf.print_topics())
doc_lda_tfidf = lda_model_tfidf[corpus_tfidf]

# LDA MODEL TFIDF Perplexity and Coherence

In [None]:
# Compute Perplexity
# log_perplexity() returns the per-word likelihood bound, not the perplexity
# itself: a higher bound (closer to zero) is better. The perplexity is
# 2 ** (-bound), and for that number lower is better.
per_word_bound_tfidf = lda_model_tfidf.log_perplexity(corpus_tfidf)
print('\nPer-word likelihood bound: ', per_word_bound_tfidf)
print('Perplexity: ', 2 ** (-per_word_bound_tfidf))

# Compute Coherence Score
from gensim.models import CoherenceModel
coherence_model_lda_tfidf = CoherenceModel(model=lda_model_tfidf, texts=data_words, dictionary=id2word, coherence='c_v')
coherence_lda_tfidf = coherence_model_lda_tfidf.get_coherence()
print('\nCoherence Score: ', coherence_lda_tfidf)

# Visualize the topics

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# LDA MODEL TFIDF visualization

In [None]:
# Render pyLDAvis output inline, then build the interactive view of the
# TF-IDF LDA model.
pyLDAvis.enable_notebook()
vis_tfidf = gensimvis.prepare(
    topic_model=lda_model_tfidf,
    corpus=corpus_tfidf,
    dictionary=lda_model_tfidf.id2word,
)
vis_tfidf

# Top topics LDA MODEL TFIDF

In [None]:
# Topics as returned by LdaModel.top_topics for the TF-IDF corpus.
top_topics_tfidf = lda_model_tfidf.top_topics(corpus=corpus_tfidf)
top_topics_tfidf