# In [None]:
# Importing necessary libraries
from collections import Counter
from datetime import datetime

import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import seaborn as sns
from gensim import corpora, models, matutils
from gensim.models import CoherenceModel
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
from wordcloud import WordCloud

# Fetch the NLTK stopword corpus, then keep the English entries as a set
# so membership checks during token filtering are cheap.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

# Defining Topic Modeling Class
class Topic_Model:
    """Topic model built from LDA, BERT sentence embeddings, or both.

    ``method`` selects the document representation:

    * ``'LDA'``      -- per-document topic distribution from a gensim LDA model.
    * ``'BERT'``     -- sentence embeddings from a SentenceTransformer model.
    * ``'LDA_BERT'`` -- the LDA vector scaled by ``gamma`` concatenated with
      the BERT embedding.

    For ``'BERT'`` and ``'LDA_BERT'`` the vectors are clustered with KMeans
    into ``k`` clusters when ``fitting`` is called.
    """

    def __init__(self, k=15, method='LDA_BERT', gamma=15, token_lists=None):
        """Store the configuration; nothing is trained until ``fitting``.

        Args:
            k: Number of LDA topics and number of KMeans clusters.
            method: One of ``'LDA'``, ``'BERT'`` or ``'LDA_BERT'``.
            gamma: Factor the LDA vector is multiplied by before it is
                concatenated with the BERT vector.
            token_lists: Tokenised documents (list of token lists), used by
                ``getting_top_words_for_each_cluster``.
        """
        self.k = k
        self.dictionary = None
        self.corpus = None
        self.cluster_model = None
        self.ldamodel = None
        self.vec = {}  # method name -> document vectors produced by fitting()
        self.gamma = gamma  # Relative importance of LDA
        self.method = method
        self.id = method + '_' + datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
        self.token_lists = token_lists

    def vectorize(self, sentences, token_lists, method=None):
        """Return document vectors for ``method`` (defaults to ``self.method``).

        Args:
            sentences: Raw documents, passed to the BERT encoder.
            token_lists: Tokenised documents, used to build the LDA
                dictionary and corpus.
            method: ``'LDA'``, ``'BERT'`` or ``'LDA_BERT'``.

        Returns:
            A 2-D array with one row per document, or ``None`` when
            ``method`` is not one of the supported names.
        """
        if method is None:
            method = self.method

        if method == 'LDA':
            # Train only once; later calls reuse the stored model and corpus,
            # so ``token_lists`` is ignored after the first LDA call.
            if self.ldamodel is None:
                self.dictionary = corpora.Dictionary(token_lists)
                self.corpus = [self.dictionary.doc2bow(text) for text in token_lists]
                self.ldamodel = models.ldamodel.LdaModel(
                    self.corpus,
                    num_topics=self.k,
                    id2word=self.dictionary,
                    passes=20,
                )
            # corpus2dense yields (topics x documents); transpose to one row
            # per document.
            return matutils.corpus2dense(self.ldamodel[self.corpus], num_terms=self.k).T

        if method == 'BERT':
            model = SentenceTransformer('bert-base-nli-max-tokens')
            return np.array(model.encode(sentences, show_progress_bar=True))

        if method == 'LDA_BERT':
            vec_lda = self.vectorize(sentences, token_lists, method='LDA')
            vec_bert = self.vectorize(sentences, None, method='BERT')
            # gamma scales the LDA part before the two vectors are joined.
            return np.c_[vec_lda * self.gamma, vec_bert]

        return None

    def calculate_coherence(self, token_lists):
        """Return the ``c_v`` coherence of the trained LDA model.

        Raises:
            ValueError: If the LDA model or corpus has not been built yet.
        """
        if self.ldamodel is None or not self.corpus:
            raise ValueError("LDA model and corpus must be trained before calculating coherence.")
        coherence_model_lda = CoherenceModel(
            model=self.ldamodel,
            texts=token_lists,
            dictionary=self.dictionary,
            coherence='c_v',
        )
        return coherence_model_lda.get_coherence()

    def calculate_perplexity(self):
        """Return ``ldamodel.log_perplexity`` evaluated on the training corpus.

        Raises:
            ValueError: If the LDA model or corpus has not been built yet.
        """
        if self.ldamodel is None or not self.corpus:
            raise ValueError("LDA model and corpus must be trained before calculating perplexity.")
        return self.ldamodel.log_perplexity(self.corpus)

    def fitting(self, sentences, token_lists, method=None):
        """Vectorise the documents and, for BERT-based methods, cluster them.

        The vectors are stored in ``self.vec[method]``. For ``'BERT'`` and
        ``'LDA_BERT'`` a KMeans model with ``k`` clusters is fitted and the
        silhouette score is printed. For ``'LDA'`` only the LDA model is
        trained (inside ``vectorize``).
        """
        if method is None:
            method = self.method

        # The cluster labels line up with the documents fitted here, so keep
        # their tokens when none were supplied to the constructor.
        if self.token_lists is None:
            self.token_lists = token_lists

        vec = self.vectorize(sentences, token_lists, method)
        self.vec[method] = vec

        if method in ('LDA_BERT', 'BERT'):
            # Clustering for BERT or LDA_BERT
            self.cluster_model = KMeans(n_clusters=self.k, random_state=0)
            self.cluster_model.fit(vec)
            silhouette_avg = silhouette_score(vec, self.cluster_model.labels_)
            print(f"Silhouette Score: {silhouette_avg}")

    def visualizing_topics(self):
        """Visualise the fitted model.

        ``'LDA'`` writes a pyLDAvis page to ``lda_visualization2.html``.
        ``'BERT'`` and ``'LDA_BERT'`` show a t-SNE scatter plot of the stored
        vectors coloured by cluster (labels shown starting from 1).

        Any error is reported with ``print`` instead of being raised, so a
        failed plot does not abort the surrounding run.
        """
        try:
            if self.method == 'LDA':
                if self.ldamodel is None:
                    raise ValueError("LDA model must be fitted first.")
                # For LDA: Visualize using pyLDAvis and save to HTML
                vis = gensimvis.prepare(self.ldamodel, self.corpus, self.dictionary)
                pyLDAvis.save_html(vis, 'lda_visualization2.html')
                print("LDA visualization saved as lda_visualization2.html")

            elif self.method in ('BERT', 'LDA_BERT'):
                # Only the vectors and the cluster model are needed here; a
                # pure BERT run has no LDA model and must still be plottable.
                if self.method not in self.vec or self.cluster_model is None:
                    raise ValueError(f"Vectors for method {self.method} not found. Ensure that you've fitted the model with '{self.method}' method.")

                # t-SNE for dimensionality reduction of the method-specific vectors
                tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
                tsne_vectors = tsne_model.fit_transform(self.vec[self.method])

                # Adjusting labels to start from 1 instead of 0
                hue_labels = [label + 1 for label in self.cluster_model.labels_]

                plt.figure(figsize=(16, 10))
                sns.scatterplot(
                    x=tsne_vectors[:, 0], y=tsne_vectors[:, 1],
                    hue=hue_labels,
                    palette=sns.color_palette("hls", self.k),
                    legend="full",
                    alpha=0.7,
                )
                plt.title(f"t-SNE visualization of {self.method} clusters")
                plt.show()

            else:
                raise ValueError(f"Method {self.method} not recognized.")

        except Exception as e:
            print(f"An error occurred during topic visualization: {e}")

    def getting_top_words_for_each_cluster(self, num_words=10):
        """Return the most frequent tokens of every KMeans cluster.

        Args:
            num_words: How many top tokens to keep per cluster.

        Returns:
            Dict mapping cluster id (``0 .. k-1``) to a list of at most
            ``num_words`` tokens, most frequent first.

        Raises:
            ValueError: If no cluster model has been fitted or no token
                lists are available.
        """
        # Only clustering is required; an LDA model is not used here, so a
        # pure BERT run is valid.
        if self.cluster_model is None:
            raise ValueError("Cluster model must be fitted first.")
        if self.token_lists is None:
            raise ValueError("Token lists are required to compute top words per cluster.")

        labels = np.asarray(self.cluster_model.labels_)
        top_words_per_cluster = {}
        for cluster_id in range(self.k):
            cluster_indices = np.where(labels == cluster_id)[0]
            word_counts = Counter(
                word
                for doc_index in cluster_indices
                for word in self.token_lists[doc_index]
            )
            top_words_per_cluster[cluster_id] = [
                word for word, _ in word_counts.most_common(num_words)
            ]

        return top_words_per_cluster

    def generating_word_clouds(self, num_words=50):
        """Show one word cloud per LDA topic, sized by word probability.

        Args:
            num_words: Number of top words taken from each topic.

        Raises:
            ValueError: If the LDA model has not been trained.
        """
        if self.ldamodel is None:
            raise ValueError("LDA model must be trained before generating word clouds.")

        for t in range(self.ldamodel.num_topics):
            # (word, probability) pairs for the topic, as a frequency mapping
            topic_words_probs_dict = dict(self.ldamodel.show_topic(t, topn=num_words))

            wordcloud = WordCloud(background_color='white').generate_from_frequencies(topic_words_probs_dict)

            plt.figure(figsize=(10, 10))
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis('off')
            plt.title(f'Word Cloud for Topic #{t}')
            plt.show()

    def showing_topic(self, topic_index, num_words=15):
        """Print the top words and probabilities of one LDA topic.

        Args:
            topic_index: Zero-based topic index, ``0 <= topic_index < k``.
            num_words: Number of words to print.

        Raises:
            ValueError: If the LDA model is missing or the index is out of
                range.
        """
        if self.ldamodel is None:
            raise ValueError("LDA model must be fitted first.")
        if topic_index < 0 or topic_index >= self.k:
            raise ValueError(f"Invalid topic index. Please choose a value between 0 and {self.k - 1}")
        topic_words = self.ldamodel.show_topic(topic_index, topn=num_words)
        print(f"Topic {topic_index}:")
        for word, prob in topic_words:
            print(f"  {word}: {prob:.4f}")

    def showing_all_topics(self, num_words=15):
        """Print every LDA topic (indices ``0 .. k-1``), one blank line apart.

        Raises:
            ValueError: If the LDA model has not been fitted.
        """
        if self.ldamodel is None:
            raise ValueError("LDA model must be fitted first.")
        for topic_index in range(self.k):
            # Use the zero-based index expected by showing_topic; shifting it
            # by one would skip topic 0 and overrun the last topic.
            self.showing_topic(topic_index, num_words)
            print()


# Backward-compatible alias: this helper used to be defined at module level
# and took the model instance as its first argument.
showing_all_topics = Topic_Model.showing_all_topics


if __name__ == '__main__':
    # Load the review spreadsheet; empty cells become empty strings.
    file_path = ''
    df = pd.read_excel(file_path).fillna('')

    # Extra stopwords on top of the NLTK list: words that recur very often in
    # the reviews without carrying useful topical information.
    custom_stopwords = {
        'done', 'a', 'for', 'i', 'the', 'expand', 'click', 'contain', 'spoiler',
        'it', 'be', 'in', 'one', 'get', 'even', 'year', 'guess', 'see', 'got',
        'feel', 'want', 'tell', 'absolute', 'every', 'is', 'some', 'would',
        'else', 'in', 'de', 'said', 'us', 'by', 'little', 'decided', 'bethesda',
        'let', 'must', 'done', 'gam', 'thousands', 'los', 'la', 'al', 'to',
        'contains', 'of', 'the', 'ago', 'much', 'really', 'ever', 'games',
        'played', 'bosses', 'go', 'like', 'good', 'say', 'lot', 'diego', 'que',
        'give', 'review', 'reviews', 'people', 'everyone', 'never', 'per',
        'boss', 'also', 'many', 'new', 'may', 'back', 'try', 'vet', 'made',
        'make', 'could', 'spoilers', 'first', 'una', 'fps', 'not', 'find',
    }
    local_stopwords = stop_words.union(custom_stopwords)

    def _tokenize(text):
        """Split on whitespace and drop stopwords; non-strings give []."""
        if not isinstance(text, str):
            return []
        return [token for token in text.split() if token not in local_stopwords]

    df['tokenized_review'] = df[''].apply(_tokenize)

    token_lists = df['tokenized_review'].tolist()
    tm = Topic_Model(k=10, method='LDA_BERT', gamma=20, token_lists=token_lists)

    # Fitting the model on the raw sentences and their token lists
    sentences = df[''].tolist()
    tm.fitting(sentences, token_lists)

    # Which reporting steps apply depends on the chosen method:
    # LDA-based methods report model metrics and topics, BERT-based methods
    # report cluster vocabularies; every recognised method is visualised.
    uses_lda = tm.method in ('LDA', 'LDA_BERT')
    uses_clusters = tm.method in ('BERT', 'LDA_BERT')

    if not (uses_lda or uses_clusters):
        print(f"Method {tm.method} not recognized.")
    else:
        if uses_lda:
            coherence = tm.calculate_coherence(token_lists)
            print(f"Coherence Score: {coherence}")

            perplexity = tm.calculate_perplexity()
            print(f"Perplexity: {perplexity}")

        if uses_clusters:
            top_words_per_cluster = tm.getting_top_words_for_each_cluster()
            print(top_words_per_cluster)

        if uses_lda:
            tm.generating_word_clouds(num_words=50)

        tm.visualizing_topics()

        if uses_lda:
            tm.showing_all_topics()
