In [1]:
import pandas as pd
import numpy as np
import os
import time
import joblib
import pickle
import lightgbm as lgb
import glob
from tqdm import tqdm
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split as surprise_train_test_split

In [2]:
# --- Setup paths ---
current_dir = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))  # Adjust to your project structure
current_dir

'/root/cmpe256/cmpe256_hotel_recommendation_system'

In [3]:
# --- Load data ---
input_path = os.path.join(current_dir, 'data', 'processed', 'hotelrec_2013_2017_cleaned_encoded.csv.gz')

df = pd.read_csv(input_path, usecols=[
    'author_id', 'hotel_name_id', 'rating', 'sentiment_score',
    'sleep quality', 'value', 'rooms', 'service', 'cleanliness', 'location'
])

chunk_size = 500_000
svd_sample_size = 100_000
n_factors = 20

In [4]:
# === Step 1: Train SVD on a sample ===
print("Sampling data for SVD...")
sample_df = pd.read_csv(input_path, usecols=['author_id', 'hotel_name_id', 'rating'], nrows=svd_sample_size)
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(sample_df, reader)
trainset, _ = surprise_train_test_split(data, test_size=0.2)

Sampling data for SVD...


In [5]:
svd = SVD(n_factors=n_factors)
print("Training SVD on sample...")
svd.fit(trainset)

Training SVD on sample...


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fad9e338130>

In [7]:
# === Step 2: Extract user/hotel embeddings from SVD ===
# print("Creating embedding lookup tables...")
# user_embeddings = {}
# item_embeddings = {}

# for uid in sample_df['author_id'].unique():
#     try:
#         user_embeddings[uid] = svd.pu[trainset.to_inner_uid(uid)]
#     except ValueError:
#         user_embeddings[uid] = np.zeros(n_factors)

# for iid in sample_df['hotel_name_id'].unique():
#     try:
#         item_embeddings[iid] = svd.qi[trainset.to_inner_iid(iid)]
#     except ValueError:
#         item_embeddings[iid] = np.zeros(n_factors)
print("Generating embeddings...")
user_embeddings = {}
for uid in sample_df['author_id'].unique():
    try:
        user_embeddings[uid] = svd.pu[trainset.to_inner_uid(uid)]
    except ValueError:
        user_embeddings[uid] = np.zeros(n_factors)

item_embeddings = {}
for iid in sample_df['hotel_name_id'].unique():
    try:
        item_embeddings[iid] = svd.qi[trainset.to_inner_iid(iid)]
    except ValueError:
        item_embeddings[iid] = np.zeros(n_factors)

Generating embeddings...


In [8]:
# === Step 3: Prepare to process full dataset in chunks ===
print("Processing chunks and saving to disk...")
columns = ['author_id', 'hotel_name_id', 'rating', 'sentiment_score',
           'sleep quality', 'value', 'rooms', 'service', 'cleanliness', 'location']

reader = pd.read_csv(input_path, usecols=columns, chunksize=chunk_size)

Processing chunks and saving to disk...


In [10]:
chunk_dir = os.path.join(current_dir, 'data/processed/chunks')
# === Step 4: Process full dataset in chunks and write to disk ===
for i, chunk in enumerate(tqdm(reader, desc="Processing chunks")):
    for j in range(n_factors):
        chunk[f'user_emb_{j}'] = chunk['author_id'].map(lambda x: user_embeddings.get(x, np.zeros(n_factors))[j])
        chunk[f'hotel_emb_{j}'] = chunk['hotel_name_id'].map(lambda x: item_embeddings.get(x, np.zeros(n_factors))[j])

    user_emb_cols = [f'user_emb_{j}' for j in range(n_factors)]
    hotel_emb_cols = [f'hotel_emb_{j}' for j in range(n_factors)]
    structured = ['sleep quality', 'value', 'rooms', 'service', 'cleanliness', 'location']
    features = ['rating', 'sentiment_score'] + structured + user_emb_cols + hotel_emb_cols

    chunk_path = os.path.join(chunk_dir, f'chunk_{i:03d}.csv.gz')
    chunk[features].to_csv(chunk_path, index=False, compression='gzip')

print("All chunks processed and saved.")

Processing chunks: 65it [12:28, 11.52s/it]

All chunks processed and saved.





In [11]:
# === Step 5: Load processed data and train LightGBM ===
print("Loading first chunk for model training...")
first_chunk_path = os.path.join(chunk_dir, 'chunk_000.csv.gz')
df_train = pd.read_csv(first_chunk_path, compression='gzip')
X_train = df_train.drop(columns='rating')
y_train = df_train['rating']

Loading first chunk for model training...


In [12]:
train_data = lgb.Dataset(X_train, label=y_train)

params = {
    'objective': 'regression',
    'learning_rate': 0.01,
    'n_estimators': 2000,
    'verbosity': 1
}

In [13]:
print("Training LightGBM...")
start = time.time()
model = lgb.train(params, train_data)
elapsed = time.time() - start
print(f"Training completed in {elapsed/60:.2f} minutes")

Training LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013597 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 297
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 7
[LightGBM] [Info] Start training from score 4.158654
Training completed in 0.17 minutes


In [14]:
print("Evaluating on training chunk...")
y_pred_train = model.predict(X_train)
rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)
mae_train = mean_absolute_error(y_train, y_pred_train)
r2_train = r2_score(y_train, y_pred_train)

print("\nTraining Performance Metrics:")
print(f"RMSE: {rmse_train:.4f}")
print(f"MAE:  {mae_train:.4f}")
print(f"R²:   {r2_train:.4f}")

Evaluating on training chunk...

Training Performance Metrics:
RMSE: 0.5526
MAE:  0.3848
R²:   0.7510


In [15]:
print("\nComputing overall metrics across all chunks...")
chunk_files = sorted(glob.glob(os.path.join(chunk_dir, 'chunk_*.csv.gz')))
y_true_all = []
y_pred_all = []


Computing overall metrics across all chunks...


In [16]:
for path in tqdm(chunk_files, desc="Evaluating chunks"):
    df = pd.read_csv(path, compression='gzip')
    X = df.drop(columns='rating')
    y = df['rating']
    y_pred = model.predict(X)
    y_true_all.extend(y)
    y_pred_all.extend(y_pred)

Evaluating chunks: 100%|██████████| 66/66 [03:39<00:00,  3.32s/it]


In [17]:
y_true_all = np.array(y_true_all)
y_pred_all = np.array(y_pred_all)

rmse = mean_squared_error(y_true_all, y_pred_all, squared=False)
mae = mean_absolute_error(y_true_all, y_pred_all)
r2 = r2_score(y_true_all, y_pred_all)

In [18]:
print("\nOverall Model Performance on Full Dataset:")
print(f"RMSE: {rmse:.4f}")
print(f"MAE:  {mae:.4f}")
print(f"R²:   {r2:.4f}")


Overall Model Performance on Full Dataset:
RMSE: 0.5598
MAE:  0.3906
R²:   0.7420


In [19]:
# === STEP 7: Save model and embeddings ===
print("\nSaving model and embeddings...")
model_dir = os.path.join(current_dir, 'models')
os.makedirs(model_dir, exist_ok=True)
joblib.dump(model, os.path.join(model_dir, 'hybrid_cf_lightgbm_model.pkl'))

with open(os.path.join(model_dir, 'user_embeddings.pkl'), 'wb') as f:
    pickle.dump(user_embeddings, f)
with open(os.path.join(model_dir, 'hotel_embeddings.pkl'), 'wb') as f:
    pickle.dump(item_embeddings, f)

print("Done. Model and metrics ready.")


Saving model and embeddings...
Done. Model and metrics ready.
