In [23]:
import os
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')  # Silence all warnings to keep the cell output compact

# Resolve the project root: when the kernel was started inside `notebooks/`,
# the root is one level up; otherwise the working directory is taken as the root.
current_dir = Path.cwd()
project_root = current_dir.parent if current_dir.name == 'notebooks' else current_dir

# Put the root on sys.path (once) so the `src.*` imports below resolve.
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.append(root_entry)

# Import Modules
from src.data_loader import TMDbDataLoader
from src.preprocessing import DataPreprocessor        # V1
from src.preprocessing_v2 import DataPreprocessorV2   # V2 (New)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import xgboost as xgb
import joblib

# Echo the resolved root so a wrong working directory is visible right away.
print(f"Project Root: {project_root}")

Project Root: d:\TranQuangSang\HCMUS\senior\Pythonfordatascience\python-final-project-ds


In [24]:
# Locations of the project configuration and of the cached raw dataset.
config_path = project_root / "configs" / "config.yaml"
raw_data_path = project_root / "data" / "raw" / "movies_2010_2024.csv"

loader = TMDbDataLoader(config_path=str(config_path))

# The CSV acts as a local cache: the API is only queried when the file is missing.
if not raw_data_path.exists():
    print("Fetching data from API...")
    # Make sure data/raw/ exists before writing, in case save_data does not
    # create it itself (same precaution the notebook takes for models/).
    raw_data_path.parent.mkdir(parents=True, exist_ok=True)
    loader.fetch_data()
    loader.save_data(str(raw_data_path))
else:
    print("Loading existing data...")

# Both branches read the frame back from the CSV on disk.
df_raw = loader.load_data(str(raw_data_path))
print(f"Data Shape: {df_raw.shape}")

2025-12-08 16:14:15,613 - src.data_loader - INFO - TMDbDataLoader đã được khởi tạo thành công
2025-12-08 16:14:15,632 - src.data_loader - INFO - Loaded 2709 rows từ d:\TranQuangSang\HCMUS\senior\Pythonfordatascience\python-final-project-ds\data\raw\movies_2010_2024.csv


Loading existing data...
Data Shape: (2709, 14)


In [25]:
def run_experiment(name, preprocessor, model, df, test_size=0.2, random_state=42,
                   inverse_target=None):
    """Split ``df``, fit ``preprocessor`` and ``model`` on the train part, score the test part.

    The split is done before any preprocessing, so the preprocessor is fitted
    on the training rows only and then applied to the held-out rows.

    Args:
        name: Label used in the printed banner and in the returned record.
        preprocessor: Object exposing ``fit_transform(df)`` and ``transform(df)``,
            each returning an ``(X, y)`` pair.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
            in place.
        df: Raw frame passed to ``train_test_split``.
        test_size: Forwarded to ``train_test_split`` (default 0.2).
        random_state: Forwarded to ``train_test_split`` (default 42).
        inverse_target: Optional callable applied to both the true and the
            predicted test targets before scoring. Use it when the preprocessor
            transforms the target, so that R2/MAE are reported in the original
            units and stay comparable across experiments. ``None`` (default)
            scores on whatever scale the preprocessor emits.

    Returns:
        dict with keys ``Experiment``, ``R2``, ``MAE``, ``Features`` (column
        count of ``X_train``), ``Test_Samples``, ``Model``, ``Preprocessor``
        and ``Data`` (``(X_train, X_test, y_train, y_test)`` exactly as
        returned by the preprocessor, i.e. never passed through
        ``inverse_target``).

    Note:
        MAE is printed with a ``$`` prefix. That is only meaningful when the
        scored targets are in currency units; supply ``inverse_target`` if the
        preprocessor rescales or transforms the target.
    """
    print(f"\n{'='*20} RUNNING: {name} {'='*20}")

    # Split first so nothing from the test rows reaches the preprocessor fit.
    train_df, test_df = train_test_split(
        df, test_size=test_size, random_state=random_state
    )

    # Drop the shuffled index left by the split; both parts get a fresh 0..n-1 index.
    train_df = train_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    X_train, y_train = preprocessor.fit_transform(train_df)
    X_test, y_test = preprocessor.transform(test_df)

    # Train
    print("Training Model...")
    model.fit(X_train, y_train)

    # Evaluate
    y_pred = model.predict(X_test)

    # Optionally map targets back to their original units before scoring.
    if inverse_target is None:
        y_true_eval, y_pred_eval = y_test, y_pred
    else:
        y_true_eval, y_pred_eval = inverse_target(y_test), inverse_target(y_pred)

    r2 = r2_score(y_true_eval, y_pred_eval)
    mae = mean_absolute_error(y_true_eval, y_pred_eval)

    print(f"{name} Results:")
    print(f"   R2 Score: {r2:.4f}")
    print(f"   MAE:      ${mae:,.2f}")

    return {
        "Experiment": name,
        "R2": r2,
        "MAE": mae,
        "Features": X_train.shape[1],
        "Test_Samples": len(y_test),
        "Model": model,
        "Preprocessor": preprocessor,
        "Data": (X_train, X_test, y_train, y_test)  # kept for later inspection
    }

In [26]:
# V1: basic preprocessing paired with a random forest
rf_params = {"n_estimators": 100, "random_state": 42, "n_jobs": -1}
prep_v1 = DataPreprocessor(config_path=str(config_path))
model_v1 = RandomForestRegressor(**rf_params)

res_v1 = run_experiment(
    name="V1 (Basic)",
    preprocessor=prep_v1,
    model=model_v1,
    df=df_raw,
)

2025-12-08 16:14:15,672 - src.preprocessing - INFO - DataPreprocessor đã được khởi tạo thành công (target_col=revenue, scaler=StandardScaler)
2025-12-08 16:14:15,675 - src.preprocessing - INFO - Bắt đầu fit preprocessing pipeline...
2025-12-08 16:14:15,678 - src.preprocessing - INFO - Đã xử lý missing values
2025-12-08 16:14:15,688 - src.preprocessing - INFO - Đã loại bỏ 111 outliers (5.12%)
2025-12-08 16:14:15,694 - src.preprocessing - INFO - Đã tạo 5 date features
2025-12-08 16:14:15,700 - src.preprocessing - INFO - Đã encode 19 genres





2025-12-08 16:14:15,848 - src.preprocessing - INFO - Đã tạo 30 TF-IDF features từ overview
2025-12-08 16:14:15,851 - src.preprocessing - INFO - Đã tạo derived features
2025-12-08 16:14:15,857 - src.preprocessing - INFO - Đã fit preprocessor với 65 features
2025-12-08 16:14:15,857 - src.preprocessing - INFO - Bắt đầu transform data...
2025-12-08 16:14:15,860 - src.preprocessing - INFO - Đã xử lý missing values
2025-12-08 16:14:15,867 - src.preprocessing - INFO - Đã loại bỏ 54 outliers (2.49%)
2025-12-08 16:14:15,873 - src.preprocessing - INFO - Đã tạo 5 date features
2025-12-08 16:14:15,877 - src.preprocessing - INFO - Đã encode 19 genres
2025-12-08 16:14:15,946 - src.preprocessing - INFO - Đã tạo 30 TF-IDF features từ overview
2025-12-08 16:14:15,949 - src.preprocessing - INFO - Đã tạo derived features
2025-12-08 16:14:15,953 - src.preprocessing - INFO - Đã transform data với shape: (2113, 65)
2025-12-08 16:14:15,953 - src.preprocessing - INFO - Bắt đầu transform data...
2025-12-08 16:

Training Model...
V1 (Basic) Results:
   R2 Score: 0.6651
   MAE:      $1.03


In [27]:
# V2: advanced preprocessing (KNN, RobustScaler, BGE) paired with XGBoost
# (expected to be stronger than the random forest).
# XGBoost is chosen for V2 on the premise that it usually handles the
# features better than a random forest does.
xgb_params = {
    "n_estimators": 200,
    "learning_rate": 0.05,
    "n_jobs": -1,
    "random_state": 42,
}
prep_v2 = DataPreprocessorV2(config_path=str(config_path))
model_v2 = xgb.XGBRegressor(**xgb_params)

res_v2 = run_experiment(
    name="V2 (Advanced)",
    preprocessor=prep_v2,
    model=model_v2,
    df=df_raw,
)

2025-12-08 16:14:16,503 - src.preprocessing_v2 - INFO - DataPreprocessorV2 (Advanced) đã được khởi tạo.
2025-12-08 16:14:16,508 - src.preprocessing_v2 - INFO - Bắt đầu fit DataPreprocessorV2 (Advanced)...
2025-12-08 16:14:16,677 - src.preprocessing_v2 - INFO - Đã fit V2 với 65 features.



Training Model...
V2 (Advanced) Results:
   R2 Score: 0.7604
   MAE:      $52,457,488.00


In [28]:
# Collect the experiment records into a leaderboard
results = [res_v1, res_v2]
df_results = pd.DataFrame(results)
cols = ['Experiment', 'R2', 'MAE', 'Features']
print("\nLEADERBOARD")
print(df_results[cols])

# NOTE(review): the scores are only comparable if every preprocessor emits the
# target on the same scale. The recorded run shows MAE of about 1.03 for V1
# versus about 5.2e7 for V2, which suggests they do not — confirm before
# trusting the ranking.

# Pick the winner from the leaderboard (highest R2) instead of assuming it is
# always V2, so a re-run on different data cannot silently save the loser.
best = results[int(df_results['R2'].idxmax())]

print(f"\nSaving Best Model ({best['Experiment']})...")
models_dir = project_root / "models"
models_dir.mkdir(parents=True, exist_ok=True)

# Persist the fitted model
joblib.dump(best['Model'], models_dir / "best_model.pkl")

# Persist the matching preprocessor. Only DataPreprocessorV2.save_preprocessor
# is visible in this notebook, so fall back to joblib for a preprocessor that
# does not provide that method.
preprocessor_path = models_dir / "preprocessor.pkl"
save_preprocessor = getattr(best['Preprocessor'], "save_preprocessor", None)
if callable(save_preprocessor):
    save_preprocessor(str(preprocessor_path))
else:
    joblib.dump(best['Preprocessor'], preprocessor_path)

print("Done! Model saved to 'models/best_model.pkl'")

2025-12-08 16:14:17,069 - src.preprocessing_v2 - INFO - Đã lưu preprocessor vào: d:\TranQuangSang\HCMUS\senior\Pythonfordatascience\python-final-project-ds\models\preprocessor.pkl



LEADERBOARD
      Experiment        R2           MAE  Features
0     V1 (Basic)  0.665122  1.030099e+00        65
1  V2 (Advanced)  0.760353  5.245749e+07        65

Saving Best Model (V2)...
Done! Model saved to 'models/best_model.pkl'
