In [None]:
# Vishak Baburaj
# Topic Modelling
# Combining both the responses of students 
# Reasons why the students want the exams to be cancelled
# Reasons why the students want the exams to be conducted on a later date

In [None]:
import pandas as pd

In [None]:
# Read the combined survey responses from the Excel workbook into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs
# where this exact file exists.
data = pd.read_excel(
    io=r'C:\Users\visha\Desktop\Topic Modeling Response Combined.xlsx'
)
# Preview the first rows.
data.head()

# Removing Null Values

In [None]:
# Keep only rows that contain a response, then discard the first remaining row.
# NOTE(review): the reason for discarding the first row is not visible here - confirm
# it is a header/test entry. Re-running this cell drops one more row each time.
data = data.dropna(subset=["combined_response"]).iloc[1:]
data.head(10)

In [None]:
# Number of responses left after dropping missing values.
print(data['combined_response'].shape[0])

# Converting to Lowercase

In [None]:
# Cast to str BEFORE lower-casing: `.str.lower()` yields NaN for any non-string cell
# (e.g. a purely numeric answer), which a later str() conversion would turn into the
# literal token 'nan' and leak into the corpus.
data['combined_response_preprocess'] = data['combined_response'].astype(str).str.lower()
data['combined_response_preprocess'].head(5)

In [None]:
# Make sure every entry is a Python str so the regex-based cleaning steps can run on it.
data['combined_response_preprocess'] = data['combined_response_preprocess'].map(str)

# Removing Punctuation

In [None]:
import re
def remove_special_characters(sentence, punctuation=False):
    """Replace unwanted characters in ``sentence`` with single spaces.

    Parameters
    ----------
    sentence : str
        Text to clean; leading and trailing whitespace is stripped first.
    punctuation : bool, default False
        If True, only ASCII punctuation characters (``string.punctuation``)
        are replaced. If False, every character that is not an ASCII letter,
        digit or space is replaced.

    Returns
    -------
    str
        The cleaned text. Each removed character becomes one space, so runs
        of spaces may remain.
    """
    import string  # local import: `string` is not imported at file level

    sentence = sentence.strip()
    if punctuation:
        # string.punctuation is a plain character list, not a regex; wrap it in an
        # escaped character class so each punctuation mark is matched literally.
        PATTERN = '[' + re.escape(string.punctuation) + ']'
    else:
        PATTERN = r'[^a-zA-Z0-9 ]'
    return re.sub(PATTERN, ' ', sentence)

In [None]:
# Replace every non-alphanumeric character in each response with a space.
data['combined_response_preprocess'] = data['combined_response_preprocess'].apply(remove_special_characters)
data['combined_response_preprocess']

# Removing Multiple Whitespaces

In [None]:
# Collapse every run of whitespace into a single space.
# The pattern is a raw string: '\s' in a normal string literal is an invalid escape
# sequence and triggers a warning on recent Python versions.
data['combined_response_preprocess'] = data['combined_response_preprocess'].replace(r'\s+', ' ', regex=True)
data['combined_response_preprocess']

# Removing Numbers

In [None]:
# Strip all digit runs from the responses.
# regex=True is required: since pandas 2.0 `Series.str.replace` treats the pattern as a
# literal string by default, so '\d+' would match nothing and the numbers would remain.
data['combined_response_preprocess'] = data['combined_response_preprocess'].str.replace(r'\d+', '', regex=True)
data['combined_response_preprocess']

# Correcting Spelling Mistakes

In [None]:
from autocorrect import Speller
# English spell-corrector; `spell(word)` is called once per token in the next cell.
spell = Speller(lang='en')

In [None]:
# Spell-correct each response token by token, then rebuild the space-separated string.
data['combined_response_preprocess'] = data['combined_response_preprocess'].apply(
    lambda response: " ".join(map(spell, response.split()))
)
data['combined_response_preprocess']

# Lemmatization

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
# Lemmatizer instance used by lemmatize_sentence() below.
lemmatizer = nltk.stem.WordNetLemmatizer()
# Second instance of the same class; not referenced by the visible cells.
wordnet_lemmatizer = WordNetLemmatizer()


def nltk_tag_to_wordnet_tag(nltk_tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    Parameters
    ----------
    nltk_tag : str
        POS tag as produced by ``nltk.pos_tag`` (e.g. ``'JJ'``, ``'VBD'``).

    Returns
    -------
    str or None
        ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or ``wordnet.ADV``
        depending on the tag's first letter; ``None`` for any other tag.
    """
    if nltk_tag.startswith('J'):
        # J* tags are adjectives; this branch previously returned wordnet.VERB,
        # so adjectives were lemmatized as if they were verbs.
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

def lemmatize_sentence(sentence):
    """Return ``sentence`` with each token replaced by its lemma.

    The sentence is tokenized and POS-tagged with NLTK. Tokens whose tag maps
    to a WordNet POS (via ``nltk_tag_to_wordnet_tag``) are lemmatized with that
    POS; all other tokens are kept unchanged. The result is joined with single
    spaces.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
    lemmas = []
    for token, pos in tagged_tokens:
        wn_pos = nltk_tag_to_wordnet_tag(pos)
        # No WordNet equivalent for this tag -> keep the token as it is.
        lemmas.append(token if wn_pos is None else lemmatizer.lemmatize(token, wn_pos))
    return " ".join(lemmas)



# Lemmatizing
# Lemmatize every response in the preprocessed column.
data['combined_response_preprocess'] = data['combined_response_preprocess'].apply(lemmatize_sentence)
data['combined_response_preprocess']

# Removing Stopwords

In [None]:
from nltk.corpus import stopwords
# Start from NLTK's English stopword list (a plain Python list).
STOPWORDS = stopwords.words('english')
# Earlier candidate domain-stopword lists, kept for reference (not active):
#STOPWORDS.extend(['exam','covid','conduct','cancel','pandemic','corona','get','due','student','board','also','write','us','mark','go','increase','many','want','take','important','first','reason','well','etc','able','year','need','purpose','must','unable','class','please','good','even','day','th','one','exams','would','know','give','much','may','make'])
#STOPWORDS.extend(['exam','cancel','conduct','student','board','get','also','pandemic','want','us','first','year','mark','go','take','many','th','one','even','need','covid','due','important','corona','reason','increase','day','able','well','etc','case','must','anything','class','write','please','exams','give','would','understand','know','much'])
# Active domain-specific additions. 'corona', 'would' and 'exam' are listed twice, which
# is harmless for the membership test in remove_stopwords().
# NOTE(review): re-running this cell appends the same words to the list again.
STOPWORDS.extend(['covid','important','due','get','also','us','many','go','take','please','able','well','exam','cancel','board','want','exams','can','would','much','corona','student','conduct','mark','need','pandemic','corona','would','exam','because','next','etc','pu','st','th'])
def remove_stopwords(text):
    """Return ``text`` without any whitespace-separated word found in STOPWORDS.

    ``text`` is converted with ``str()`` first; the surviving words are joined
    with single spaces.
    """
    kept_words = []
    for token in str(text).split():
        if token in STOPWORDS:
            continue
        kept_words.append(token)
    return " ".join(kept_words)

In [None]:
# Drop stopwords from every response and print the resulting column.
data['combined_response_preprocess'] = data['combined_response_preprocess'].apply(remove_stopwords)
print(data['combined_response_preprocess'])

# Word Cloud

In [None]:
# Import the wordcloud library
from wordcloud import WordCloud
# Concatenate all preprocessed responses into one comma-separated string.
long_string = ','.join(data['combined_response_preprocess'].tolist())
# Configure the word cloud (white background, at most 2000 words).
wordcloud = WordCloud(
    background_color="white",
    max_words=2000,
    contour_width=3,
    contour_color='steelblue',
)
# Build the cloud from the combined text.
wordcloud.generate(long_string)
# Display it as an image (last expression of the cell).
wordcloud.to_image()

# Word Tokenization

In [None]:
import gensim
from gensim.utils import simple_preprocess

In [None]:
def sent_to_words(sentences):
    """Lazily yield one token list per item of ``sentences``.

    Each item is converted with ``str()`` and tokenized by
    ``gensim.utils.simple_preprocess`` with ``deacc=True``.
    """
    yield from (
        gensim.utils.simple_preprocess(str(doc), deacc=True)
        for doc in sentences
    )

# Plain Python list of the preprocessed responses.
data1 = data['combined_response_preprocess'].tolist()
# Tokenize every response; each document becomes a list of tokens.
data_words = list(sent_to_words(data1))
data_words

In [None]:
# Sanity check: number of tokenized documents and the container type.
print(len(data_words), type(data_words), sep='\n')

# Topic Modeling

In [None]:
import gensim.corpora as corpora

# Build the token <-> integer-id dictionary from the tokenized responses.
id2word = corpora.Dictionary(data_words)
print(len(id2word))  # vocabulary size before pruning
# Prune the vocabulary in place: no_below=15 drops tokens found in fewer than 15
# documents, no_above=0.5 drops tokens found in more than 50% of documents.
id2word.filter_extremes(no_below=15, no_above=0.5)
print(len(id2word))  # vocabulary size after pruning
#id2word.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [None]:
# Convert each tokenized document into its bag-of-words form using the pruned dictionary.
texts = data_words
corpus = list(map(id2word.doc2bow, texts))

In [None]:
# Report vocabulary size and document count of the final corpus.
n_unique_tokens = len(id2word)
n_documents = len(corpus)
print('Number of unique tokens: %d' % n_unique_tokens)
print('Number of documents: %d' % n_documents)

# TF-IDF

In [None]:
from gensim import models
from pprint import pprint
# Fit a TF-IDF model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus)
# Apply the fitted model to the same corpus to get its TF-IDF-weighted version.
corpus_tfidf = tfidf[corpus]

# LDA MODEL TFIDF

In [None]:
# Hyperparameters for the LDA run.
num_topics = 2
chunksize = 38237  # NOTE(review): presumably chosen to cover the whole corpus in one chunk - confirm
passes = 50
iterations = 1000
eval_every = None  # passed straight to LdaModel; skips perplexity evaluation during training
random_state = 42  # fixed seed: LDA training is stochastic, without it topics change on every run

# Train LDA on the TF-IDF-weighted corpus.
lda_model_tfidf = gensim.models.ldamodel.LdaModel(corpus_tfidf,
                                            id2word=id2word,
                                            chunksize=chunksize,
                                            alpha='auto',
                                            eta='auto',
                                            iterations=iterations,
                                            num_topics=num_topics,
                                            passes=passes,
                                            eval_every=eval_every,
                                            random_state=random_state,
                                            per_word_topics=True)

# Show the top words of each topic.
pprint(lda_model_tfidf.print_topics())
# Apply the trained model to the corpus (per-document topic information).
doc_lda_tfidf = lda_model_tfidf[corpus_tfidf]

# Perplexity AND Coherence

In [None]:
# Compute Perplexity
# log_perplexity() returns the per-word likelihood bound (log scale), not the
# perplexity itself; the perplexity is 2 ** (-bound). Print both, correctly labelled.
per_word_bound = lda_model_tfidf.log_perplexity(corpus_tfidf)
print('\nPer-word likelihood bound: ', per_word_bound)
print('Perplexity: ', 2 ** (-per_word_bound))

# Compute Coherence Score
from gensim.models import CoherenceModel
# c_v coherence is computed from the tokenized texts and the pruned dictionary.
coherence_model_lda_tfidf = CoherenceModel(model=lda_model_tfidf, texts=data_words, dictionary=id2word, coherence='c_v')
coherence_lda_tfidf = coherence_model_lda_tfidf.get_coherence()
print('\nCoherence Score: ', coherence_lda_tfidf)

# Data Visualization

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

In [None]:
# Enable inline rendering of pyLDAvis output in the notebook.
pyLDAvis.enable_notebook()
# Prepare the topic visualisation from the trained model, the TF-IDF corpus and the
# model's own dictionary; the bare name on the last line displays it.
vis_tfidf = gensimvis.prepare(lda_model_tfidf, corpus_tfidf, dictionary=lda_model_tfidf.id2word)
vis_tfidf

# Top topics

In [None]:
# Get the model's top topics for the TF-IDF corpus and display them.
# NOTE(review): the commented-out `num_words=20` argument is inactive - confirm the
# keyword name against the installed gensim version before enabling it.
top_topics_tfidf = lda_model_tfidf.top_topics(corpus_tfidf) #, num_words=20)
top_topics_tfidf