In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from gensim import corpora, models
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from itertools import chain
from pprint import pprint
import spacy
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
# Load the dataset
dataset_path = '../Data/lab4_train.csv'
data = pd.read_csv(dataset_path)
data = data[data['polarity'] != 'conflict']
max_len = 100  # Define the maximum length of sequences

In [3]:
# LSTM preprocessing
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['text'])
X_lstm = tokenizer.texts_to_sequences(data['text'])
X_lstm = pad_sequences(X_lstm, maxlen=max_len)

# LDA preprocessing
vectorizer = CountVectorizer()
X_lda = vectorizer.fit_transform(data['text'])
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100

# Encode the polarity column
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(data['polarity'])

# Convert encoded labels to one-hot encoding
y = to_categorical(y_encoded)

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model.add(LSTM(128))
model.add(Dense(3, activation='softmax'))  # 3 classes: positive, negative, neutral
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_lstm, y, epochs=10, batch_size=32, validation_split=0.2)

# Train logistic regression on LDA features
X_train, X_test, y_train, y_test = train_test_split(X_lda, y, test_size=0.2, random_state=42)
y_train_labels = np.argmax(y_train, axis=1)
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train_labels)
y_pred_lr = lr.predict(X_test)
y_test_multiclass = np.argmax(y_test, axis=1)
print("Accuracy of Logistic Regression on LDA features:", accuracy_score(y_test_multiclass, y_pred_lr))

# Sentiment analysis setup
sia = SentimentIntensityAnalyzer()
nlp = spacy.load("en_core_web_sm")

# LDA setup
def preprocess_text(text):
    text = ' '.join([word for word in text.split() if word not in stopwords.words('english')])
    tokens = simple_preprocess(text)
    tokens_lem = [WordNetLemmatizer().lemmatize(token, pos='v') for token in tokens]
    processed = [token for token in tokens_lem if len(token) > 3]
    return processed

def get_sentiment(text):
    scores = sia.polarity_scores(text)
    if scores['compound'] > 0:
        return 'positive'
    elif scores['compound'] < 0:
        return 'negative'
    else:
        return 'neutral'

aspects = ['service', 'food', 'anecdotes/miscellaneous', 'price', 'ambience']
ldaTopic_to_aspect = {0: 'service', 1: 'food', 2: 'anecdotes/miscellaneous', 3: 'price', 4: 'ambience'}

# Preprocess reviews for LDA
reviews_processed = [preprocess_text(text) for text in data['text']]
dictionary = corpora.Dictionary(reviews_processed)
corpus_bow = [dictionary.doc2bow(text) for text in reviews_processed]
tfidf = models.TfidfModel(corpus_bow) 
corpus_tfidf = tfidf[corpus_bow]
ldamodel = models.LdaModel(corpus_bow, num_topics=5, id2word=dictionary, passes=10, random_state=12)

# Function to get descriptors
def get_descriptors(text):
    def is_adjective(token):
        return token.dep_ == 'amod' or token.pos_ == 'ADJ'
    def get_children(token):
        first_ch = [child for child in token.children if child.pos_ not in ['AUX', 'VERB']]
        second_ch = [list(ch.children) for ch in first_ch]
        second_ch = list(chain.from_iterable(second_ch))
        return first_ch + second_ch
    subjects_descriptors = {}
    for token in nlp(text):
        if token.dep_ == 'nsubj' and token.pos_ != 'PRON':
            descriptors = []
            adjectives = [child for child in get_children(token) if is_adjective(child)]
            descriptors.extend(adjectives)
            if token.head.pos_ in ['AUX', 'VERB']:
                descriptors.extend([child for child in get_children(token.head) if is_adjective(child)])
            descriptors = list(set(descriptors))
            subjects_descriptors[token] = descriptors
        elif token.pos_ in ['NOUN', 'PROPN']:
            subjects_descriptors[token] = [child for child in get_children(token) if is_adjective(child)]
        else:
            continue
    return subjects_descriptors



Epoch 1/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 62ms/step - accuracy: 0.6159 - loss: 0.9523 - val_accuracy: 0.6271 - val_loss: 0.8310
Epoch 2/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 45ms/step - accuracy: 0.6836 - loss: 0.7047 - val_accuracy: 0.6839 - val_loss: 0.7282
Epoch 3/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 47ms/step - accuracy: 0.8158 - loss: 0.4605 - val_accuracy: 0.6806 - val_loss: 0.7398
Epoch 4/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 49ms/step - accuracy: 0.8861 - loss: 0.3139 - val_accuracy: 0.6940 - val_loss: 0.7991
Epoch 5/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 46ms/step - accuracy: 0.9223 - loss: 0.2118 - val_accuracy: 0.6906 - val_loss: 0.8507
Epoch 6/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 50ms/step - accuracy: 0.9414 - loss: 0.1747 - val_accuracy: 0.6990 - val_loss: 1.0181
Epoch 7/10
[1m75/75[0m [32m━━━━

In [4]:
# Combine predictions from both models
X_stacking = np.hstack((model.predict(X_lstm), lr.predict_proba(X_lda)))

# Train a model on top of the combined predictions
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_stacking, y)
y_pred_stacking = clf.predict(X_stacking)
print("Accuracy of Stacked Model:", accuracy_score(y, y_pred_stacking))
print("Classification Report of Stacked Model:")
print(classification_report(y, y_pred_stacking))

[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step
Accuracy of Stacked Model: 0.9588491134158581
Classification Report of Stacked Model:
              precision    recall  f1-score   support

           0       0.94      0.93      0.94       715
           1       0.96      0.94      0.95       398
           2       0.98      0.97      0.97      1876

   micro avg       0.96      0.96      0.96      2989
   macro avg       0.96      0.95      0.95      2989
weighted avg       0.96      0.96      0.96      2989
 samples avg       0.96      0.96      0.96      2989



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [5]:
def analyze_review(review):
    processed_review = preprocess_text(review)
    sentiment = get_sentiment(review)
    descriptors = get_descriptors(review)
    subjects = sorted(set(descriptors.keys()))
    adjectives = sorted(set(chain.from_iterable(descriptors.values())))
    return {
        'review': review,
        'aspect': subjects,
        'descriptors': adjectives,
        'sentiment': sentiment
    }

In [9]:
import tkinter as tk
from tkinter import scrolledtext

# Create the main application window
root = tk.Tk()
root.title("Review Analyzer")

# Create and place the input text box
input_text = scrolledtext.ScrolledText(root, wrap=tk.WORD, width=50, height=10)
input_text.grid(column=0, row=0, padx=10, pady=10)

# Create and place the analyze button
def on_analyze():
    review = input_text.get("1.0", tk.END).strip()
    if review:
        analysis_result = analyze_review(review)
        result_text.set(f"Review: {analysis_result['review']}\n"
                        f"Aspect: {[str(subject) for subject in analysis_result['aspect']]}\n"
                        f"Descriptors: {[str(descriptor) for descriptor in analysis_result['descriptors']]}\n"
                        f"Sentiment: {analysis_result['sentiment']}\n")

analyze_button = tk.Button(root, text="Analyze Review", command=on_analyze)
analyze_button.grid(column=0, row=1, padx=10, pady=10)

# Create and place the result label
result_text = tk.StringVar()
result_label = tk.Label(root, textvariable=result_text, justify=tk.LEFT, wraplength=400)
result_label.grid(column=0, row=2, padx=10, pady=10)

# Run the application
root.mainloop()