In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# NLTK datasets
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)

True

In [None]:
# Read data
df = pd.read_csv("Hotel_Reviews.csv")
df.head()

In [3]:
# Combine positive and negative reviews into one column
# Casting to string to avoid issues in case of missing or non-string data
df["Positive_Review"] = df["Positive_Review"].astype(str)
df["Negative_Review"] = df["Negative_Review"].astype(str)
df["reviews"] = df["Positive_Review"] + " " + df["Negative_Review"]
df.head()

Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng,reviews
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,8/3/2017,7.7,Hotel Arena,Russia,I am so angry that i made this post available...,397,1403,Only the park outside of the hotel was beauti...,11,7,2.9,"[' Leisure trip ', ' Couple ', ' Duplex Double...",0 days,52.360576,4.915968,Only the park outside of the hotel was beauti...
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,8/3/2017,7.7,Hotel Arena,Ireland,No Negative,0,1403,No real complaints the hotel was great great ...,105,7,7.5,"[' Leisure trip ', ' Couple ', ' Duplex Double...",0 days,52.360576,4.915968,No real complaints the hotel was great great ...
2,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/31/2017,7.7,Hotel Arena,Australia,Rooms are nice but for elderly a bit difficul...,42,1403,Location was good and staff were ok It is cut...,21,9,7.1,"[' Leisure trip ', ' Family with young childre...",3 days,52.360576,4.915968,Location was good and staff were ok It is cut...
3,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/31/2017,7.7,Hotel Arena,United Kingdom,My room was dirty and I was afraid to walk ba...,210,1403,Great location in nice surroundings the bar a...,26,1,3.8,"[' Leisure trip ', ' Solo traveler ', ' Duplex...",3 days,52.360576,4.915968,Great location in nice surroundings the bar a...
4,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/24/2017,7.7,Hotel Arena,New Zealand,You When I booked with your company on line y...,140,1403,Amazing location and building Romantic setting,8,3,6.7,"[' Leisure trip ', ' Couple ', ' Suite ', ' St...",10 days,52.360576,4.915968,Amazing location and building Romantic settin...


In [21]:
# Label reviews: 0 for good (score > 5), 1 for bad reviews (score <= 5)
df["Bad_reviews"] = df["Reviewer_Score"].apply(lambda x: 0 if x > 5 else 1)
# Keep only the relevant columns
df = df[["reviews", "Bad_reviews"]]
df.head()

Unnamed: 0,reviews,Bad_reviews
0,Only the park outside of the hotel was beauti...,1
1,No real complaints the hotel was great great ...,0
2,Location was good and staff were ok It is cut...,0
3,Great location in nice surroundings the bar a...,1
4,Amazing location and building Romantic settin...,0


In [22]:
# Remove placeholders in combined reviews
df["reviews"] = (
    df["reviews"]
    .str.replace("No Negative", "", regex=False)
    .str.replace("No Positive", "", regex=False)
)

In [23]:
# Remove missing and duplicate reviews
df.dropna(subset=["reviews"], inplace=True)
df.drop_duplicates(subset=["reviews"], inplace=True)

In [None]:
# Function for basic text cleaning
def clean_text(text):
    """Cleans and lemmatizes text."""
    # Keep certain negation words
    keep_words = {"not", "no", "never"}
    
    # Lowercase and remove non-alphabetic characters
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    
    # Tokenize
    words = word_tokenize(text)
    
    # Remove stopwords but preserve negation words
    stop_words = set(stopwords.words("english"))
    filtered_words = [w for w in words if (w not in stop_words) or (w in keep_words)]
    
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
    
    return " ".join(lemmatized_words)

# Clean the reviews
df["Reviews_clean"] = df["reviews"].apply(clean_text)

In [25]:
# For EDA: Print distribution and review length statistics
print("Sentiment Label Distribution:")
print(df["Bad_reviews"].value_counts())

Sentiment Label Distribution:
Bad_reviews
0    469185
1     30060
Name: count, dtype: int64


In [26]:
df["review_length"] = df["reviews"].apply(len)
print("\nReview Length Statistics:")
print(df["review_length"].describe())


Review Length Statistics:
count    499245.000000
mean        191.280207
std         201.107872
min           1.000000
25%          69.000000
50%         130.000000
75%         240.000000
max        3806.000000
Name: review_length, dtype: float64


In [29]:
df["review_length"] = df["reviews"].apply(len)
print("\nReview Length Statistics:")
print(df["review_length"].describe())


Review Length Statistics:
count    499245.000000
mean        191.280207
std         201.107872
min           1.000000
25%          69.000000
50%         130.000000
75%         240.000000
max        3806.000000
Name: review_length, dtype: float64


In [30]:
test_size=0.2
random_state=42
# Split into train and test
train_df, test_df = train_test_split(
    df, 
    test_size=test_size, 
    random_state=random_state, 
    stratify=df["Bad_reviews"]
)

In [None]:
# -----------------------------------------------------------------------------
# Part A: TF-IDF Based Models (Logistic Regression, SVM, Random Forest)
# -----------------------------------------------------------------------------

# Rebuild TF-IDF features using the cleaned reviews
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df["Reviews_clean"])
X_test_tfidf = tfidf_vectorizer.transform(test_df["Reviews_clean"])
y_train = train_df["Bad_reviews"]
y_test = test_df["Bad_reviews"]

# Define a function for evaluation
def evaluate_model(model, X_test, y_test, model_name="Model"):
    y_pred = model.predict(X_test)
    print(f"--- {model_name} Evaluation ---")
    print("Accuracy: ", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:   ", recall_score(y_test, y_pred))
    print("F1 Score: ", f1_score(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\n")

# 1. Logistic Regression
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)
evaluate_model(lr_model, X_test_tfidf, y_test, "Logistic Regression (TF-IDF)")

# 2. Support Vector Machine (SVM)
svm_model = SVC()
svm_model.fit(X_train_tfidf, y_train)
evaluate_model(svm_model, X_test_tfidf, y_test, "SVM (TF-IDF)")

# 3. Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_tfidf, y_train)
evaluate_model(rf_model, X_test_tfidf, y_test, "Random Forest (TF-IDF)")

--- Logistic Regression (TF-IDF) Evaluation ---
Accuracy:  0.9487225710823344
Precision: 0.6847555923777962
Recall:    0.2749500998003992
F1 Score:  0.3923569902682174
Confusion Matrix:
[[93076   761]
 [ 4359  1653]]

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97     93837
           1       0.68      0.27      0.39      6012

    accuracy                           0.95     99849
   macro avg       0.82      0.63      0.68     99849
weighted avg       0.94      0.95      0.94     99849





In [3]:
# -----------------------------------------------------------------------------
# Part B: LSTM Model Using Word Embeddings
# -----------------------------------------------------------------------------

# For the LSTM, work with the cleaned text directly.
# Set hyperparameters for tokenization and sequence processing.
max_vocab = 10000
max_len = 100  # maximum review length in terms of tokens
embedding_dim = 100

# Tokenize the text
tokenizer = Tokenizer(num_words=max_vocab, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df["Reviews_clean"])

# Convert texts to sequences and pad them
X_train_seq = tokenizer.texts_to_sequences(train_df["Reviews_clean"])
X_test_seq = tokenizer.texts_to_sequences(test_df["Reviews_clean"])
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

# Build the LSTM model
lstm_model = Sequential([
    Embedding(input_dim=max_vocab, output_dim=embedding_dim, input_length=max_len),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model.summary()

# Train the LSTM model
lstm_model.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_split=0.1, verbose=1)

# Evaluate the LSTM model
lstm_loss, lstm_acc = lstm_model.evaluate(X_test_pad, y_test, verbose=0)
print("--- LSTM (Word Embeddings) Evaluation ---")
print("Accuracy: ", lstm_acc)

NameError: name 'train_df' is not defined

In [4]:
# evaluation 
y_pred_lstm_prob = lstm_model.predict(X_test_pad)
y_pred_lstm = (y_pred_lstm_prob > 0.5).astype(int)
print("Precision:", precision_score(y_test, y_pred_lstm))
print("Recall:   ", recall_score(y_test, y_pred_lstm))
print("F1 Score: ", f1_score(y_test, y_pred_lstm))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_lstm))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_lstm))

NameError: name 'lstm_model' is not defined