In [1]:
# --- Imports ---
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
from tqdm import tqdm

In [2]:
# --- Setup paths ---
current_dir = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))  # Adjust to your project structure
current_dir

'/root/cmpe256/cmpe256_hotel_recommendation_system'

In [3]:
input_path = os.path.join(current_dir, 'data', 'processed', 'hotelrec_2013_2017_cleaned_encoded.csv.gz')

In [4]:
chunk_size = 1_000_000  # adjust if still too big

# Columns to use
columns_to_use = [
    'hotel_name_id', 'author_id', 'sentiment_score',
    'sleep quality', 'value', 'rooms', 'service', 'cleanliness', 'location',
    'rating'
]

In [5]:
# --- Initialize storage for results ---
all_rmse = []
all_mae = []

# --- Process in Chunks ---
for chunk_idx, chunk in enumerate(pd.read_csv(input_path, usecols=columns_to_use, chunksize=chunk_size)):
    print(f"\nProcessing chunk {chunk_idx+1}...")

    # --- Fill missing values ---
    structured_features = ['sleep quality', 'value', 'rooms', 'service', 'cleanliness', 'location']
    chunk[structured_features] = chunk[structured_features].fillna(chunk[structured_features].mean())
    
    # Drop rows where rating is still NaN (shouldn't happen, but safety)
    chunk = chunk.dropna(subset=['rating'])

    # --- Prepare features ---
    X = chunk[['hotel_name_id', 'author_id', 'sentiment_score'] + structured_features]
    y = chunk['rating']

    # --- Train-Test Split ---
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # --- Train model ---
    model = lgb.LGBMRegressor(
        objective='regression',
        n_estimators=10000,
        learning_rate=0.01,
        num_leaves=31,
        random_state=42
    )

    model.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        callbacks=[
            early_stopping(stopping_rounds=50),
            log_evaluation(100)
        ]
    )

    # --- Predict ---
    y_pred = model.predict(X_test)

    # --- Evaluate ---
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    mae = mean_absolute_error(y_test, y_pred)

    print(f"Chunk {chunk_idx+1} - RMSE: {rmse:.4f}, MAE: {mae:.4f}")

    all_rmse.append(rmse)
    all_mae.append(mae)

    if chunk_idx == 0:
        import joblib
        model_save_path = os.path.join(current_dir, 'models', 'hybrid_model.pkl')
        joblib.dump(model, model_save_path)
        print(f"Aspect-based Sentiment Aware model saved to: {model_save_path}")


Processing chunk 1...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012030 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 807
[LightGBM] [Info] Number of data points in the train set: 800000, number of used features: 9
[LightGBM] [Info] Start training from score 4.183301
Training until validation scores don't improve for 50 rounds
[100]	valid_0's l2: 0.456226
[200]	valid_0's l2: 0.339854
[300]	valid_0's l2: 0.316895
[400]	valid_0's l2: 0.310449
[500]	valid_0's l2: 0.308001
[600]	valid_0's l2: 0.306491
[700]	valid_0's l2: 0.305506
[800]	valid_0's l2: 0.304722
[900]	valid_0's l2: 0.304144
[1000]	valid_0's l2: 0.303651
[1100]	valid_0's l2: 0.303247
[1200]	valid_0's l2: 0.302953
[1300]	valid_0's l2: 0.302714
[1400]	valid_0's l2: 0.302493
[1500]	valid_0's l2: 0.302308
[1600]	valid_0's l2: 0.302151
[1700]	valid_0's l2: 0.30202
[1800]	vali

In [None]:
# --- Final Output ---
print("\n=== Aspect-based Sentiment Aware Model Result ===")
print(f"Average RMSE: {sum(all_rmse) / len(all_rmse):.4f}")
print(f"Average MAE:  {sum(all_mae) / len(all_mae):.4f}")

In [6]:
# --- Final Output ---
print("\n=== Aspect-based Sentiment Aware Model Result ===")
print(f"Average RMSE: {sum(all_rmse) / len(all_rmse):.4f}")
print(f"Average MAE:  {sum(all_mae) / len(all_mae):.4f}")


=== Aspect-based Sentiment Aware Model Result ===
Average RMSE: 0.5545
Average MAE:  0.3862
