In [None]:
### Importing Libraries

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Download the NLTK data packages used by the text-cleaning step further down
# (word_tokenize, stopwords.words and WordNetLemmatizer). Each call passes quiet=True.
# NOTE(review): which Punkt package word_tokenize needs depends on the installed
# NLTK version, which is presumably why both 'punkt_tab' and 'punkt' are fetched - confirm.
nltk.download('punkt_tab', quiet=True) # Punkt tokenizer data (tab format)
nltk.download('stopwords', quiet=True)  # stop-word lists read by stopwords.words("english")
nltk.download('punkt', quiet=True)  # Punkt tokenizer data
nltk.download('wordnet', quiet=True)  # WordNet corpus for WordNetLemmatizer

In [None]:
### Preprocessing Function
def preprocess_data(filepath="Hotel_Reviews.csv", test_size=0.2, random_state=42):
    """Load the hotel-review CSV, label it, clean the text and split it.

    Steps:
      1. Concatenate ``Positive_Review`` and ``Negative_Review`` into ``reviews``
         and strip the placeholder phrases "No Negative" / "No Positive".
      2. Derive ``Bad_reviews``: 0 when ``Reviewer_Score`` is greater than 5,
         otherwise 1.
      3. Drop rows whose combined review text is blank, then drop duplicate
         review texts.
      4. Lower-case, remove non-letter characters, tokenize, remove English
         stop words (keeping the negations "not", "no", "never") and lemmatize
         into ``Reviews_clean``.
      5. Split with ``train_test_split``, stratified on ``Bad_reviews``.

    Parameters
    ----------
    filepath : str
        Path to a CSV containing ``Positive_Review``, ``Negative_Review`` and
        ``Reviewer_Score`` columns.
    test_size : float
        Forwarded to ``train_test_split``.
    random_state : int
        Forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(train_df, test_df)``, each with the columns ``reviews``,
        ``Bad_reviews`` and ``Reviews_clean``.
    """
    raw_df = pd.read_csv(filepath)

    # Fill missing text BEFORE casting: astype(str) alone would turn NaN into the
    # literal string "nan", which then survives as a bogus token in the review.
    positive_text = raw_df["Positive_Review"].fillna("").astype(str)
    negative_text = raw_df["Negative_Review"].fillna("").astype(str)
    combined_text = (
        (positive_text + " " + negative_text)
        .str.replace("No Negative", "", regex=False)
        .str.replace("No Positive", "", regex=False)
    )

    # Build a fresh frame instead of assigning into a column slice of the raw
    # frame, which avoids SettingWithCopy problems and in-place mutation.
    reviews_df = pd.DataFrame(
        {
            "reviews": combined_text,
            # "not greater than 5" (rather than "<= 5") keeps the original
            # labelling for every input, including a missing score.
            "Bad_reviews": (~raw_df["Reviewer_Score"].gt(5)).astype(int),
        }
    )

    # Rows with no review text carry no signal; this replaces the former
    # dropna(), which could never match once the text had been cast to str.
    has_text = reviews_df["reviews"].str.strip().ne("")
    reviews_df = reviews_df.loc[has_text].drop_duplicates(subset=["reviews"]).copy()

    # Build the expensive helpers once instead of once per review.
    stop_words = set(stopwords.words("english")) - {"not", "no", "never"}
    lemmatizer = WordNetLemmatizer()
    non_letters = re.compile(r"[^a-zA-Z\s]")

    def clean_text(text):
        """Return the lower-cased, letter-only, stop-word-free, lemmatized text."""
        tokens = word_tokenize(non_letters.sub("", text.lower()))
        return " ".join(
            lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
        )

    reviews_df["Reviews_clean"] = reviews_df["reviews"].apply(clean_text)

    train_df, test_df = train_test_split(
        reviews_df,
        test_size=test_size,
        random_state=random_state,
        stratify=reviews_df["Bad_reviews"],
    )
    return train_df, test_df

In [None]:
### Load and Preprocess Data
# Run the full preprocessing pipeline on the review CSV; the split uses the
# function's default test_size and random_state.
train_df, test_df = preprocess_data(filepath="Hotel_Reviews.csv")

In [None]:
### TF-IDF Vectorization (features shared by the classical models below)
# Targets for both splits.
y_train, y_test = train_df["Bad_reviews"], test_df["Bad_reviews"]

# The vectorizer is limited to 5000 features and is fitted on the training text
# only; the test text is transformed with that already-fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df["Reviews_clean"])
X_test_tfidf = tfidf_vectorizer.transform(test_df["Reviews_clean"])

In [None]:
### Read data
# Load the untouched CSV for inspection; the modelling cells use train_df / test_df,
# not this frame.
df = pd.read_csv("Hotel_Reviews.csv")
df.head(n=5)

In [None]:
### Logistic Regression
def evaluate_model(model, X_test, y_test, model_name):
    """Print binary-classification metrics for a fitted classifier.

    The report has the same layout as the LSTM evaluation cell: accuracy,
    precision, recall, F1 score, confusion matrix and classification report.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict(X_test)``.
    X_test
        Feature matrix passed to ``model.predict``.
    y_test
        True labels for ``X_test``.
    model_name : str
        Label used in the report header.

    Returns
    -------
    None
        Results are printed only, so calling this as the last line of a cell
        does not add an extra output value.
    """
    y_pred = model.predict(X_test)
    print(f"--- {model_name} Evaluation ---")
    print("Accuracy: ", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:   ", recall_score(y_test, y_pred))
    print("F1 Score: ", f1_score(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))


# max_iter is raised to 1000 and passed explicitly to the solver.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)
evaluate_model(lr_model, X_test_tfidf, y_test, "Logistic Regression (TF-IDF)")

In [None]:
### Train and Evaluate Naive Bayes Model
# fit() returns the estimator itself, so construction and training are chained.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)
evaluate_model(nb_model, X_test_tfidf, y_test, "Naive Bayes (TF-IDF)")

In [None]:
### Train and Evaluate Decision Tree Model
# random_state is fixed at 42; fit() returns the estimator, so construction and
# training are chained.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train_tfidf, y_train)
evaluate_model(dt_model, X_test_tfidf, y_test, "Decision Tree (TF-IDF)")

In [None]:
### LSTM Model: tokenize the cleaned text and pad it to a fixed length
max_vocab = 10000      # num_words for the Tokenizer and input_dim of the Embedding layer
max_len = 100          # every sequence is padded / truncated to this many tokens
embedding_dim = 100    # output_dim of the Embedding layer

# The tokenizer vocabulary is fitted on the training split only.
tokenizer = Tokenizer(num_words=max_vocab, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df["Reviews_clean"])

# Convert each split to integer sequences with the fitted tokenizer.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split["Reviews_clean"]) for split in (train_df, test_df)
)

# Both splits share the same padding settings: pad and truncate at the end.
pad_options = dict(maxlen=max_len, padding='post', truncating='post')
X_train_pad = pad_sequences(X_train_seq, **pad_options)
X_test_pad = pad_sequences(X_test_seq, **pad_options)

In [None]:
### Build and Train LSTM Model
# Seed TensorFlow's global random generator so repeated runs are more comparable.
tf.random.set_seed(42)

# Embedding -> single LSTM layer -> dropout -> sigmoid unit for the binary label.
lstm_model = Sequential([
    Embedding(input_dim=max_vocab, output_dim=embedding_dim, input_length=max_len),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the network. Previously the model was only built and compiled, so the
# evaluation cell scored untrained weights.
# epochs / batch_size / validation_split are starting values - tune as needed.
lstm_history = lstm_model.fit(
    X_train_pad,
    y_train.to_numpy(),
    epochs=3,
    batch_size=128,
    validation_split=0.1,
    verbose=1,
)
lstm_model.summary()

In [None]:
### Evaluate LSTM Model
# Loss and accuracy come from Keras' own evaluate(); the remaining metrics are
# computed by scikit-learn on the thresholded predictions.
lstm_loss, lstm_acc = lstm_model.evaluate(X_test_pad, y_test, verbose=0)
print("--- LSTM (Word Embeddings) Evaluation ---")
print("Accuracy: ", lstm_acc)

# Outputs above 0.5 are mapped to class 1, everything else to class 0.
y_pred_lstm_prob = lstm_model.predict(X_test_pad)
y_pred_lstm = (y_pred_lstm_prob > 0.5).astype(int)

for metric_label, metric_fn in (
    ("Precision:", precision_score),
    ("Recall:   ", recall_score),
    ("F1 Score: ", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred_lstm))

print("Confusion Matrix:", confusion_matrix(y_test, y_pred_lstm), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred_lstm), sep="\n")