In [2]:
import re

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [None]:
# NLTK resources needed by preprocess_data: the English stopword list, the
# Punkt sentence-tokenizer models used by word_tokenize, and WordNet for the
# lemmatizer. Recent NLTK releases load the Punkt models from 'punkt_tab'
# instead of 'punkt', so both are requested to keep a fresh kernel working
# regardless of the installed NLTK version.
for nltk_resource in ("stopwords", "punkt", "punkt_tab", "wordnet"):
    nltk.download(nltk_resource, quiet=True)

def preprocess_data(filepath="Hotel_Reviews.csv", test_size=0.2, random_state=42):
    """
    Load the hotel-review CSV, build a labelled text corpus, and split it.

    The CSV must provide the columns ``Positive_Review``, ``Negative_Review``
    and ``Reviewer_Score``. The two review columns are concatenated into a
    single text, the "No Negative" / "No Positive" placeholders are removed,
    blank and duplicate texts are dropped, and each text is lower-cased,
    stripped of non-letters, stopword-filtered (negations kept) and lemmatized.

    Parameters
    ----------
    filepath : str
        Path of the CSV file to read.
    test_size : float
        Fraction of rows assigned to the test split.
    random_state : int
        Seed forwarded to ``train_test_split``.

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame
        Stratified on the label. Both frames carry the columns ``reviews``
        (raw combined text), ``Bad_reviews`` (1 if score <= 5, else 0),
        ``review_length`` (characters in ``reviews``) and ``Reviews_clean``.

    Side effects
    ------------
    Prints the label distribution and review-length statistics.
    """
    raw = pd.read_csv(filepath)

    # fillna("") must come before astype(str): astype(str) alone turns a
    # missing value into the literal text "nan", which would then be treated
    # as review content.
    positive = raw["Positive_Review"].fillna("").astype(str)
    negative = raw["Negative_Review"].fillna("").astype(str)

    combined = (
        (positive + " " + negative)
        .str.replace("No Negative", "", regex=False)
        .str.replace("No Positive", "", regex=False)
    )

    # Building a fresh frame (rather than assigning into a column slice of the
    # raw frame) avoids pandas' SettingWithCopyWarning.
    df = pd.DataFrame(
        {
            "reviews": combined,
            # 0 for good (score > 5), 1 otherwise (score <= 5 or missing score)
            "Bad_reviews": (~raw["Reviewer_Score"].gt(5)).astype(int),
        }
    )

    # Remove missing and duplicate reviews. After the string cast a "missing"
    # review is one that is blank once placeholders are stripped.
    has_text = df["reviews"].str.strip().ne("")
    df = df[has_text].drop_duplicates(subset=["reviews"]).copy()

    # For EDA: print distribution and review length statistics
    print("Sentiment Label Distribution:")
    print(df["Bad_reviews"].value_counts())

    df["review_length"] = df["reviews"].str.len()
    print("\nReview Length Statistics:")
    print(df["review_length"].describe())

    # Loop-invariant resources are built once instead of once per review.
    keep_words = {"not", "no", "never"}  # negations carry sentiment; never drop them
    stop_words = set(stopwords.words("english")) - keep_words
    lemmatizer = WordNetLemmatizer()
    non_letters = re.compile(r"[^a-zA-Z\s]")

    def clean_text(text):
        """Lower-case, strip non-letters, drop stopwords and lemmatize one review."""
        # Non-letter characters are deleted (not replaced by a space), so
        # punctuation inside a word simply disappears.
        tokens = word_tokenize(non_letters.sub("", text.lower()))
        return " ".join(
            lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
        )

    df["Reviews_clean"] = df["reviews"].apply(clean_text)

    # Stratify so both splits keep the same good/bad proportion.
    train_df, test_df = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        stratify=df["Bad_reviews"],
    )

    return train_df, test_df


# Build the stratified train/test frames once; the modelling cells below read
# their `Reviews_clean` (text) and `Bad_reviews` (label) columns.
train_df, test_df = preprocess_data("Hotel_Reviews.csv")

In [None]:
# -----------------------------------------------------------------------------
# Part A: TF-IDF Based Models (Logistic Regression, SVM, Random Forest)
# -----------------------------------------------------------------------------

# Binary targets for the two splits
y_train, y_test = train_df["Bad_reviews"], test_df["Bad_reviews"]

# The vectorizer is fitted on the training text only and then applied to the
# test text, so no test-set vocabulary or document frequencies leak into it.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df["Reviews_clean"])
X_test_tfidf = tfidf_vectorizer.transform(test_df["Reviews_clean"])

# Define a function for evaluation
def evaluate_model(model, X_test, y_test, model_name="Model"):
    """Print a metric summary for a fitted classifier on held-out data.

    Calls ``model.predict(X_test)`` once and prints accuracy, precision,
    recall, F1, the confusion matrix and the classification report, all
    computed against ``y_test``. ``model_name`` only labels the header line.
    Nothing is returned.
    """
    predictions = model.predict(X_test)

    print(f"--- {model_name} Evaluation ---")

    # Labels are padded so the printed values line up in one column.
    scalar_metrics = (
        ("Accuracy: ", accuracy_score),
        ("Precision:", precision_score),
        ("Recall:   ", recall_score),
        ("F1 Score: ", f1_score),
    )
    for label, scorer in scalar_metrics:
        print(label, scorer(y_test, predictions))

    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("\nClassification Report:")
    print(classification_report(y_test, predictions))
    print("\n")

# NOTE(review): in the saved output the positive class is 6012 of 99849 test
# rows and Logistic Regression recall on it is 0.27; consider
# class_weight="balanced" if recall on bad reviews matters more than accuracy.

# 1. Logistic Regression
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)
evaluate_model(lr_model, X_test_tfidf, y_test, "Logistic Regression (TF-IDF)")

# 2. Support Vector Machine (SVM)
# A linear SVM is used instead of the kernel SVC: per the scikit-learn docs,
# SVC fit time scales at least quadratically with the number of samples and
# becomes impractical beyond tens of thousands of rows, whereas LinearSVC
# scales to large sparse text matrices like this one.
svm_model = LinearSVC(random_state=42)
svm_model.fit(X_train_tfidf, y_train)
evaluate_model(svm_model, X_test_tfidf, y_test, "Linear SVM (TF-IDF)")

# 3. Random Forest
# n_jobs=-1 builds the trees on all cores; with random_state fixed the fitted
# forest is the same as the single-threaded one.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train_tfidf, y_train)
evaluate_model(rf_model, X_test_tfidf, y_test, "Random Forest (TF-IDF)")

--- Logistic Regression (TF-IDF) Evaluation ---
Accuracy:  0.9487225710823344
Precision: 0.6847555923777962
Recall:    0.2749500998003992
F1 Score:  0.3923569902682174
Confusion Matrix:
[[93076   761]
 [ 4359  1653]]

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97     93837
           1       0.68      0.27      0.39      6012

    accuracy                           0.95     99849
   macro avg       0.82      0.63      0.68     99849
weighted avg       0.94      0.95      0.94     99849





In [None]:
# -----------------------------------------------------------------------------
# Part B: LSTM Model Using Word Embeddings
# -----------------------------------------------------------------------------

# Seed TensorFlow so weight initialisation and dropout are repeatable
# across Restart & Run All.
tf.random.set_seed(42)

# For the LSTM, work with the cleaned text directly.
# Hyperparameters for tokenization and sequence processing.
max_vocab = 10000    # tokenizer keeps only the most frequent words up to this cap
max_len = 100        # every review is padded / truncated to this many tokens
embedding_dim = 100  # size of each learned word vector

# Fit the word index on the training text only; unseen words map to "<OOV>".
tokenizer = Tokenizer(num_words=max_vocab, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df["Reviews_clean"])

# Convert texts to integer sequences and pad them (index 0 is the pad value)
X_train_seq = tokenizer.texts_to_sequences(train_df["Reviews_clean"])
X_test_seq = tokenizer.texts_to_sequences(test_df["Reviews_clean"])
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

# Build the LSTM model.
# - An explicit Input layer replaces Embedding's deprecated `input_length`
#   argument and still lets summary() report shapes before training.
# - mask_zero=True makes the LSTM skip the trailing pad tokens; without it the
#   final hidden state is computed after running over the 'post' padding.
lstm_model = Sequential([
    tf.keras.Input(shape=(max_len,)),
    Embedding(input_dim=max_vocab, output_dim=embedding_dim, mask_zero=True),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model.summary()

# Train the LSTM model (the last 10% of the training rows are held out for validation)
lstm_model.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_split=0.1, verbose=1)

# Evaluate the LSTM model
lstm_loss, lstm_acc = lstm_model.evaluate(X_test_pad, y_test, verbose=0)
print("--- LSTM (Word Embeddings) Evaluation ---")
print("Accuracy: ", lstm_acc)

In [None]:
# LSTM evaluation: turn the sigmoid outputs into hard 0/1 labels with a 0.5
# cut-off, then report the same metrics printed for the TF-IDF models.
y_pred_lstm_prob = lstm_model.predict(X_test_pad)
y_pred_lstm = (y_pred_lstm_prob > 0.5).astype(int)

for metric_label, metric_fn in (
    ("Precision:", precision_score),
    ("Recall:   ", recall_score),
    ("F1 Score: ", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred_lstm))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_lstm))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_lstm))