In [14]:
import pandas as pd
from gensim.models import Word2Vec, LdaModel
from gensim import corpora
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics import accuracy_score
import tkinter as tk
from tkinter import scrolledtext

# Download the stopwords from nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def review_to_vector(review, w2v_model):
    vectors = [w2v_model.wv[word] for word in review if word in w2v_model.wv.key_to_index]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(w2v_model.vector_size)

class ReviewAnalyzer(tk.Tk):
    def __init__(self, df, lda_model, w2v_model, knn_model):
        super().__init__()

        # Store reviews, lda model, and Word2Vec model
        self.df = df
        self.lda_model = lda_model
        self.w2v_model = w2v_model
        self.knn_classifier = knn_model
        
        # Setup GUI
        self.current_review_index = 0
        self.title("Review Analyzer")
        self.geometry("800x600")

        self.review_label = tk.Label(self, text="Review:")
        self.review_label.pack(pady=10)

        self.review_box = scrolledtext.ScrolledText(self, wrap=tk.WORD, width=60, height=10)
        self.review_box.pack(pady=10)
        self.load_review()

        self.back_button = tk.Button(self, text="Back", command=self.previous_review)
        self.back_button.pack(side=tk.LEFT, padx=10)

        self.analyze_button = tk.Button(self, text="Analyze", command=self.analyze_review)
        self.analyze_button.pack(side=tk.LEFT, padx=10)

        self.next_button = tk.Button(self, text="Next", command=self.next_review)
        self.next_button.pack(side=tk.LEFT, padx=10)

        self.result_label = tk.Label(self, text="", wraplength=600, justify="left")
        self.result_label.pack(pady=10)
        
        self.stop_words = set(stopwords.words('english'))
    
    def review_to_vector(self, review):
        vectors = [self.w2v_model.wv[word] for word in review if word in self.w2v_model.wv.key_to_index]
        if vectors:
            return np.mean(vectors, axis=0)
        else:
            return np.zeros(self.w2v_model.vector_size)

    def load_review(self):
        review = self.df.iloc[self.current_review_index]['reviews.text']
        self.review_box.delete(1.0, tk.END)
        self.review_box.insert(tk.INSERT, review)

    def next_review(self):
        if self.current_review_index < len(self.df) - 1:
            self.current_review_index += 1
            self.load_review()

    def previous_review(self):
        if self.current_review_index > 0:
            self.current_review_index -= 1
            self.load_review()    
            
    def analyze_review(self):
        review = self.review_box.get("1.0", "end-1c").lower()
        tokenized_review = [word for word in word_tokenize(review) if word not in self.stop_words]

        # Filter out words not in Word2Vec model's vocabulary
        valid_words = [word for word in tokenized_review if word in self.w2v_model.wv.key_to_index]

        # Handle case where no valid words are found
        if not valid_words:
            self.result_label.config(text="No valid words found in the review.")
            return

        review_vector = self.review_to_vector(valid_words)
        sentiment = self.knn_classifier.predict([review_vector])[0]
        sentiment_str = "Positive" if sentiment == 1 else ("Negative" if sentiment == 0 else "Neutral")

        # Perform topic modeling
        bow = self.lda_model.id2word.doc2bow(valid_words)
        topics = self.lda_model.get_document_topics(bow)
        topics_str = "\n".join([" + ".join([word[0] for word in self.lda_model.show_topic(topic[0])]) for topic in topics])

        # Display results
        self.result_label.config(text=f"Sentiment: {sentiment_str}\n\nTopics:\n{topics_str}")

if __name__ == "__main__":
    # Load data from CSV file
    df = pd.read_csv("OnlineProductReviews.csv")

    # Preprocess the dataset
    df = df.dropna(subset=['reviews.text'])
    df['Tokenized_Reviews'] = df['reviews.text'].apply(lambda x: [word for word in word_tokenize(str(x).lower()) if word not in stop_words])

    # Train Word2Vec model
    w2v_model = Word2Vec(sentences=df['Tokenized_Reviews'], vector_size=100, window=5, min_count=1, workers=4)
    w2v_model.train(df['Tokenized_Reviews'], total_examples=w2v_model.corpus_count, epochs=10)

    # Train LDA model on reviews.text
    dictionary = corpora.Dictionary(df['Tokenized_Reviews'])
    dictionary.filter_extremes(no_below=15, no_above=0.5)  # Adjust parameters as needed
    corpus = [dictionary.doc2bow(text) for text in df['Tokenized_Reviews']]
    lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)  # Increase number of passes if necessary

    # Prepare data for sentiment classification
    df['Vector'] = df['Tokenized_Reviews'].apply(lambda x: review_to_vector(x, w2v_model))
    X = np.array(list(df['Vector']))
    y = df['reviews.rating'].apply(lambda x: 1 if x > 3 else (0 if x < 3 else 2)).values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train KNN classifier
    knn_classifier = KNeighborsClassifier(n_neighbors=5)
    knn_classifier.fit(X_train, y_train)
    y_pred = knn_classifier.predict(X_test)
    print("Accuracy on test set:", accuracy_score(y_test, y_pred))

    # Start GUI
    app = ReviewAnalyzer(df, lda_model, w2v_model, knn_classifier)
    app.mainloop()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Clyde&Charles\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy on test set: 0.55
