# In [None]:
# Importing required libraries for data manipulation, NLP, visualization, and clustering
import pandas as pd
import gensim
from gensim import corpora
from gensim.models import LdaModel
from gensim.models import CoherenceModel
import pyLDAvis
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pyLDAvis.gensim_models as gensimvis
import numpy as np
import matplotlib.gridspec as gridspec
import os
import nltk
from nltk.corpus import stopwords
import seaborn as sns

# Downloading NLTK stopwords (needed once per environment by stopwords.words below)
nltk.download('stopwords')

# Reading data
# NOTE(review): the path is an empty placeholder; set it to the review workbook before running.
df = pd.read_excel('')

# Initial set of stopwords (NLTK's English list)
stop_words = set(stopwords.words('english'))

# Custom stopwords specific to this dataset
custom_stopwords = {
    'done', 'a', 'for', 'i', 'the', 'expand', 'click', 'contain', 'spoiler',
    'it', 'be', 'in', 'one', 'get', 'even', 'year', 'guess', 'see', 'got',
    'feel', 'want', 'tell', 'absolute', 'every', 'is', 'some', 'would',
    'else', 'de', 'said', 'us', 'by', 'little', 'decided', 'bethesda', 'let',
    'must', 'gam',
}

# Additional custom stopwords: these words are repeated many times in the
# reviews but do not provide useful information.
additional_stopwords = {
    'thousands', 'los', 'la', 'al', 'to', 'contains', 'of', 'the', 'ago',
    'much', 'really', 'ever', 'games', 'played', 'bosses', 'go', 'like',
    'good', 'say', 'lot', 'diego', 'que', 'give', 'review', 'reviews',
    'people', 'everyone', 'never', 'per', 'boss', 'also', 'many', 'new',
    'may', 'back', 'try', 'vet', 'made', 'make', 'could', 'spoilers',
    'first', 'una', 'fps', 'not', 'find',
}
custom_stopwords.update(additional_stopwords)

# Removing specific custom stopwords (words listed here are kept in the text).
# NOTE(review): 'specific' and 'stopword' look like placeholders; neither is in
# the set, so these calls currently have no effect.
custom_stopwords.discard('specific')
custom_stopwords.discard('stopword')

# Merge the custom words into the NLTK set. Previously stop_words was built
# but never used, so the standard English stopwords were not filtered at all.
stop_words.update(custom_stopwords)

# Preprocessing the text by removing stopwords.
# 'tokenized_review' is the column created here; the source column name below
# is an empty placeholder and must be set to the dataset's review-text column.
# Matching is case-sensitive and tokens are split on whitespace only.
df['tokenized_review'] = df[''].astype(str).apply(
    lambda x: [word for word in x.split() if word not in stop_words]
)

# Topic Modeling class
class Topic_Model:
    """LDA topic model plus a KMeans clustering of the document-topic vectors.

    Typical use: call ``prepare_corpus(token_lists)``, then ``fit()``; after
    that the metric, display and plotting methods can be used.
    """

    def __init__(self, k=15):
        """Initialize the topic model with a specified number of topics (k).

        ``k`` is used both as the number of LDA topics and as the number of
        KMeans clusters.
        """
        self.k = k
        self.dictionary = None
        self.corpus = None
        self.ldamodel = None
        self.cluster_model = None
        # Dense document-topic matrix the clustering was fitted on (see
        # _topic_features); None until computed.
        self._features = None

    def _require_fitted(self):
        """Raise ValueError unless fit() has produced both models."""
        if getattr(self, 'ldamodel', None) is None or getattr(self, 'cluster_model', None) is None:
            raise ValueError("Topic_Model must be fitted first.")

    def _topic_features(self):
        """Return the dense document-topic matrix, one row per document.

        The matrix is computed once and cached so that the silhouette score
        and the cluster plot use exactly the vectors KMeans was fitted on;
        re-running LDA inference is not guaranteed to reproduce them.
        """
        cached = getattr(self, '_features', None)
        if cached is None:
            topic_distributions = self.ldamodel[self.corpus]
            # corpus2dense yields (k, n_documents); transpose to documents-as-rows.
            cached = gensim.matutils.corpus2dense(topic_distributions, num_terms=self.k).T
            self._features = cached
        return cached

    def prepare_corpus(self, token_lists):
        """Prepare the dictionary and corpus needed for LDA.

        Args:
            token_lists: list of documents, each a list of string tokens.
        """
        self.dictionary = corpora.Dictionary(token_lists)
        self.corpus = [self.dictionary.doc2bow(text) for text in token_lists]
        # A new corpus invalidates any previously cached feature matrix.
        self._features = None

    def fit(self):
        """Fit the LDA model to the prepared corpus, then cluster the documents.

        Raises:
            ValueError: if prepare_corpus() has not been called yet.
        """
        if self.corpus is None or self.dictionary is None:
            raise ValueError("prepare_corpus must be called before fit.")
        self.ldamodel = LdaModel(self.corpus, num_topics=self.k, id2word=self.dictionary, passes=15)
        self._features = None  # force recomputation against the new model
        features = self._topic_features()
        self.cluster_model = KMeans(n_clusters=self.k, random_state=0)
        self.cluster_model.fit(features)

    def calculate_coherence(self, tokenized_texts):
        """Calculate the c_v coherence score of the LDA model.

        Args:
            tokenized_texts: the tokenized documents used to score coherence.

        Raises:
            ValueError: if the model has not been fitted.
        """
        self._require_fitted()
        coherence_model_lda = CoherenceModel(
            model=self.ldamodel,
            texts=tokenized_texts,
            dictionary=self.dictionary,
            coherence='c_v',
        )
        return coherence_model_lda.get_coherence()

    def calculate_perplexity(self, exponentiate=False):
        """Return the LDA model's per-word likelihood bound on the corpus.

        Despite the method name, gensim's ``log_perplexity`` returns the
        per-word log-likelihood bound (normally negative), not the perplexity.
        That value is returned by default for backward compatibility.

        Args:
            exponentiate: when True, return the perplexity estimate
                ``2 ** (-bound)`` instead of the raw bound.

        Raises:
            ValueError: if the model has not been fitted.
        """
        self._require_fitted()
        bound = self.ldamodel.log_perplexity(self.corpus)
        if exponentiate:
            return np.exp2(-bound)
        return bound

    def calculate_silhouette_score(self):
        """Calculate silhouette score for the clustering performed by the model.

        Raises:
            ValueError: if the model has not been fitted.
        """
        self._require_fitted()
        return silhouette_score(self._topic_features(), self.cluster_model.labels_)

    def visualize_topics(self, output_path='lda_visualization.html'):
        """Build the pyLDAvis visualization and save it as HTML.

        This is best-effort: any failure is printed and None is returned
        instead of raising.

        Args:
            output_path: file the HTML visualization is written to.

        Returns:
            The prepared pyLDAvis object, or None if preparation/saving failed.
        """
        try:
            vis = gensimvis.prepare(self.ldamodel, self.corpus, self.dictionary)
            pyLDAvis.save_html(vis, output_path)
        except Exception as e:
            print("An error occurred during LDA visualization:")
            print(e)
            vis = None
        return vis

    def show_topic_words(self, num_words=15):
        """Print the top ``num_words`` words for each topic.

        Raises:
            ValueError: if the model has not been fitted.
        """
        self._require_fitted()
        for i in range(self.k):
            words = self.ldamodel.show_topic(i, topn=num_words)
            print(f"Topic {i + 1}: {', '.join([word for word, _ in words])}")

    def visualize_topics_and_word_clouds(self, num_words=30, grid_dims=(3, 5)):
        """Save the pyLDAvis view and show one word cloud per topic.

        Args:
            num_words: number of top words per topic fed to each word cloud.
            grid_dims: (rows, cols) of the subplot grid. If the grid has fewer
                cells than there are topics, rows are added until all fit.

        Returns:
            The pyLDAvis object from visualize_topics(), or None on failure.

        Raises:
            ValueError: if the model has not been fitted or grid_dims is not
                made of positive integers.
        """
        self._require_fitted()
        vis = self.visualize_topics()

        topics = self.ldamodel.show_topics(num_topics=self.k, num_words=num_words, formatted=False)

        rows, cols = grid_dims
        if rows < 1 or cols < 1:
            raise ValueError("grid_dims must contain positive integers.")
        if rows * cols < len(topics):
            # Ceiling division: grow the grid so every topic gets a cell
            # instead of failing with IndexError on gs[i].
            rows = -(-len(topics) // cols)

        fig = plt.figure(figsize=(15, 10))
        gs = gridspec.GridSpec(rows, cols)
        gs.update(wspace=0.5, hspace=0.5)
        for i, topic in enumerate(topics):
            ax = fig.add_subplot(gs[i])
            # topic is (topic_id, [(word, weight), ...]); the weights drive word size.
            wordcloud = WordCloud(width=100, height=100, background_color='white').generate_from_frequencies(dict(topic[1]))
            ax.imshow(wordcloud, interpolation='bilinear')
            ax.axis('off')
            ax.set_title(f'Topic {i + 1}')
        plt.show()
        return vis

    def plot_clusters(self):
        """Scatter-plot documents by their first two topic weights, colored by cluster.

        Raises:
            ValueError: if the model has not been fitted, or has fewer than
                two topics (two axes are needed for the plot).
        """
        # The models are initialized to None in __init__, so they must be
        # checked for None; hasattr() would always succeed.
        self._require_fitted()
        features = self._topic_features()
        if features.shape[1] < 2:
            raise ValueError("At least two topics are required to plot clusters.")
        plt.figure(figsize=(10, 6))
        sns.scatterplot(x=features[:, 0], y=features[:, 1], hue=self.cluster_model.labels_, palette='viridis', legend='full')
        plt.title('LDA Clustering Results')
        plt.xlabel('Probability of Topic 1')
        plt.ylabel('Probability of Topic 2')
        plt.show()

if __name__ == '__main__':
    # Materialize the token lists once; they feed both corpus preparation
    # and the coherence computation.
    token_lists = df['tokenized_review'].tolist()

    tm = Topic_Model(k=10)
    tm.prepare_corpus(token_lists)
    tm.fit()

    coherence = tm.calculate_coherence(token_lists)
    # calculate_perplexity() returns gensim's per-word log-likelihood bound;
    # the perplexity itself is 2 ** (-bound), so report the two separately
    # instead of printing the bound under the "Perplexity" label.
    log_bound = tm.calculate_perplexity()
    perplexity = np.exp2(-log_bound)
    silhouette_avg = tm.calculate_silhouette_score()

    print(f"Coherence Score: {coherence}")
    print(f"Per-word Log-Likelihood Bound: {log_bound}")
    print(f"Perplexity: {perplexity}")
    print(f"Silhouette Score: {silhouette_avg}")

    tm.visualize_topics_and_word_clouds()
    tm.show_topic_words(num_words=30)
    tm.plot_clusters()
