In [1]:
import os
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import warnings
# Silence every warning so the notebook output stays compact.
warnings.filterwarnings('ignore')

# Resolve the project root: when the kernel runs from the `notebooks` folder
# the root is one level up, otherwise the working directory itself is the root.
current_dir = Path.cwd()
project_root = current_dir.parent if current_dir.name == 'notebooks' else current_dir

# Put the root on the import path (only once) so `src.*` modules can be imported.
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.append(root_entry)

# Import Modules
from src.data_loader import TMDbDataLoader
from src.preprocessing import DataPreprocessor        # V1
from src.preprocessing_v2 import DataPreprocessorV2   # V2 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import xgboost as xgb
import joblib

# Echo the resolved root so a wrong working directory is easy to spot.
print(f"Project Root: {project_root}")

Project Root: d:\TranQuangSang\HCMUS\senior\Pythonfordatascience\python-final-project-ds


In [2]:
# Locations of the project configuration and of the cached raw dataset.
config_path = project_root / "configs" / "config.yaml"
raw_data_path = project_root / "data" / "raw" / "movies_2010_2024.csv"

loader = TMDbDataLoader(config_path=str(config_path))

if not raw_data_path.exists():
    print("Fetching data from API...")
    # Create the target folder up front: on a fresh checkout `data/raw` may not
    # exist, and failing after the fetch would throw the downloaded data away.
    # NOTE(review): harmless if `save_data` already creates the folder itself.
    raw_data_path.parent.mkdir(parents=True, exist_ok=True)
    loader.fetch_data()
    loader.save_data(str(raw_data_path))
else:
    print("Loading existing data...")

# Read the CSV back from disk in both branches so `df_raw` is always produced
# the same way, whether the data was just fetched or was already cached.
df_raw = loader.load_data(str(raw_data_path))
print(f"Data Shape: {df_raw.shape}")

Loading existing data...
Data Shape: (2581, 14)


In [3]:
def run_experiment(name, preprocessor, model, df, test_size=0.2, random_state=42):
    """Train and evaluate one preprocessor/model pair on a hold-out split.

    The frame is split into train and test parts, the preprocessor is fitted on
    the training part only and then applied to the test part, the model is
    fitted on the transformed training data, and the metrics are computed after
    mapping targets and predictions back through
    ``preprocessor.inverse_transform_target``.

    Args:
        name: Label used in the printed report and stored under "Experiment".
        preprocessor: Object providing ``fit_transform(df)`` and
            ``transform(df)`` (each returning a ``(X, y)`` pair) plus
            ``inverse_transform_target(y)``.
        model: Estimator providing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place, so the caller's object is the trained model.
        df: Raw data frame holding both the features and the target.
        test_size: Share of rows held out for evaluation (default 0.2).
        random_state: Seed for the split, so runs are comparable (default 42).

    Returns:
        dict with the keys "Experiment", "R2", "MAE", "RMSE", "Features"
        (number of columns of the transformed training matrix),
        "Test_Samples", "Model" and "Preprocessor".
    """
    print(f"\n{'='*20} RUNNING: {name} {'='*20}")

    # 1. Split before any fitting so the preprocessor never sees test rows.
    train_df, test_df = train_test_split(
        df, test_size=test_size, random_state=random_state
    )

    # Give both parts a fresh 0..n-1 index so positional and label-based access
    # agree (the original note says this avoids index-related bugs).
    train_df = train_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    # 2. Preprocess: fit on train only, then reuse the fitted state on test.
    # The `_log` suffix suggests a log-transformed target — confirm in the
    # preprocessor implementation.
    X_train, y_train_log = preprocessor.fit_transform(train_df)
    X_test, y_test_log = preprocessor.transform(test_df)

    # 3. Train
    print("Training Model...")
    model.fit(X_train, y_train_log)

    # 4. Evaluate on the original target scale, so MAE/RMSE are printed in the
    # same units as the target (the report formats them as dollar amounts).
    y_pred_log = model.predict(X_test)

    y_test = preprocessor.inverse_transform_target(y_test_log)
    y_pred = preprocessor.inverse_transform_target(y_pred_log)

    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    print(f"{name} Results:")
    print(f"   R2 Score: {r2:.4f}")
    print(f"   MAE:      ${mae:,.2f}")
    print(f"   RMSE:     ${rmse:,.0f}")

    return {
        "Experiment": name,
        "R2": r2,
        "MAE": mae,
        # RMSE was printed but never returned; keep it with the other metrics
        # so a leaderboard can use every reported number.
        "RMSE": rmse,
        "Features": X_train.shape[1],
        "Test_Samples": len(y_test),
        "Model": model,
        "Preprocessor": preprocessor,
    }

In [4]:
# Experiment V1: the basic preprocessing pipeline paired with a random forest.
prep_v1 = DataPreprocessor(config_path=str(config_path))
model_v1 = RandomForestRegressor(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
)

res_v1 = run_experiment(
    name="V1 (Basic)",
    preprocessor=prep_v1,
    model=model_v1,
    df=df_raw,
)


Training Model...
V1 (Basic) Results:
   R2 Score: 0.7666
   MAE:      $51,507,382.56
   RMSE:     $101,475,036


In [5]:
# Experiment V2: the advanced preprocessing pipeline paired with XGBoost.
# Author's note (not verifiable from this notebook): the V2 preprocessor is
# described as using KNN, RobustScaler and BGE, and XGBoost was chosen on the
# expectation that it handles these features better than a random forest.
prep_v2 = DataPreprocessorV2(config_path=str(config_path))
model_v2 = xgb.XGBRegressor(
    n_estimators=200,
    learning_rate=0.05,
    random_state=42,
    n_jobs=-1,
)

res_v2 = run_experiment(
    name="V2 (Advanced)",
    preprocessor=prep_v2,
    model=model_v2,
    df=df_raw,
)


Training Model...
V2 (Advanced) Results:
   R2 Score: 0.7258
   MAE:      $49,790,823.05
   RMSE:     $109,999,457


In [6]:
# Collect both experiment records into a single leaderboard table.
df_results = pd.DataFrame([res_v1, res_v2])
cols = ['Experiment', 'R2', 'MAE', 'Features']
print("\nLEADERBOARD")
print(df_results[cols])

# Pick the winner from the measured results instead of hard-coding "V2", so the
# saved artifact cannot silently disagree with the leaderboard after a re-run.
# Criterion: lowest MAE. In the recorded run V2 wins on MAE while V1 is ahead
# on R2 and RMSE, so MAE is the only reported metric consistent with the
# original choice of V2.
# NOTE(review): confirm MAE is the intended selection metric, and that both
# preprocessor classes provide `save_preprocessor` (only V2 was exercised here).
best_res = min((res_v1, res_v2), key=lambda res: res['MAE'])

print(f"\nSaving Best Model ({best_res['Experiment']})...")
models_dir = project_root / "models"
models_dir.mkdir(parents=True, exist_ok=True)

# Persist the fitted model.
joblib.dump(best_res['Model'], models_dir / "best_model.pkl")
# Persist the matching fitted preprocessor next to it; the two files must come
# from the same experiment to be usable together at inference time.
best_res['Preprocessor'].save_preprocessor(str(models_dir / "preprocessor.pkl"))

print("Done! Model saved to 'models/best_model.pkl'")


LEADERBOARD
      Experiment        R2           MAE  Features
0     V1 (Basic)  0.766612  5.150738e+07        65
1  V2 (Advanced)  0.725753  4.979082e+07       419

Saving Best Model (V2)...
Done! Model saved to 'models/best_model.pkl'
