# Sentiment-aware model

## Setup

In [4]:
# --- Imports ---
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder
import tqdm
import joblib

In [None]:
current_dir = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
current_dir

'/root/cmpe256/cmpe256_hotel_recommendation_system'

In [3]:
file_path = os.path.join(current_dir, 'data', 'processed', 'hotelrec_2013_2017_cleaned.csv.gz')
file_path

'/root/cmpe256/cmpe256_hotel_recommendation_system/data/processed/hotelrec_2013_2017_cleaned.csv.gz'

## Encoding Categorical Features (can skip if already done)

In [7]:
# Setup paths ---
output_path = os.path.join(current_dir, 'data', 'processed', 'hotelrec_2013_2017_cleaned_encoded.csv.gz')
chunk_size = 1_000_000

In [8]:
# ---  Fit encoders ---
# Load only hotel_name and author
full_df = pd.read_csv(file_path, usecols=['hotel_name', 'author'])

In [9]:
# Initialize encoders
hotel_name_encoder = LabelEncoder()
author_encoder = LabelEncoder()

In [10]:
hotel_name_encoder.fit(full_df['hotel_name'].astype(str))
author_encoder.fit(full_df['author'].astype(str))

# Clear memory
del full_df

In [11]:
# If output file exists, remove it first
if os.path.exists(output_path):
    os.remove(output_path)

In [None]:
first_chunk = True      # [~13 min]

for chunk in tqdm.tqdm(pd.read_csv(file_path, chunksize=chunk_size), desc="Processing chunks"):
    # Encode hotel_name and author
    chunk['hotel_name_id'] = hotel_name_encoder.transform(chunk['hotel_name'].astype(str))
    chunk['author_id'] = author_encoder.transform(chunk['author'].astype(str))

    # Save the chunk
    if first_chunk:
        chunk.to_csv(output_path, index=False, mode='w', compression='gzip')
        first_chunk = False
    else:
        chunk.to_csv(output_path, index=False, header=False, mode='a', compression='gzip')

Processing chunks: 33it [12:52, 23.42s/it]


## Model Training

In [5]:
# --- Imports ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
from tqdm import tqdm

In [6]:
# --- Load Data ---
input_path = os.path.join(current_dir, 'data', 'processed', 'hotelrec_2013_2017_cleaned_encoded.csv.gz')

In [7]:
# Read needed columns
df = pd.read_csv(input_path, usecols=['hotel_name_id', 'author_id', 'rating', 'sentiment_score'])

In [8]:
# --- Prepare features and target ---
X = df[['hotel_name_id', 'author_id', 'sentiment_score']]
y = df['rating']

In [9]:
# --- Train-Test Split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# --- LightGBM Regressor ---
model = lgb.LGBMRegressor(
    objective='regression',
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=31,
    random_state=42
)

In [None]:
# Train [~2min]
model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    callbacks=[early_stopping(stopping_rounds=50), log_evaluation(100)]
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031121 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 26314009, number of used features: 3
[LightGBM] [Info] Start training from score 4.164375
Training until validation scores don't improve for 50 rounds
[100]	valid_0's l2: 0.571572
[200]	valid_0's l2: 0.569183


In [None]:
# --- Predict ---
y_pred = model.predict(X_test)

In [None]:
# --- Evaluate ---
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)

In [None]:
print("=== Sentiment-Aware Model Results ===")
print(f"RMSE: {rmse:.4f}")
print(f"MAE:  {mae:.4f}")


=== Sentiment-Aware Model Results ===
RMSE: 0.7533
MAE:  0.5912


: 

In [None]:
# --- Save model ---
model_save_path = os.path.join(current_dir, 'models', 'sentiment_model.pkl')
joblib.dump(model, model_save_path)