In [1]:
import os
from datetime import datetime, timedelta
from typing import Dict, List

import mlflow
import pandas as pd
import psycopg2

import ray
from ray import tune
from ray.train.xgboost import XGBoostTrainer

ray.init("ray://10.200.2.51:10001", namespace="experiment-1", log_to_driver=False)

2025-04-16 20:45:12,424	INFO client_builder.py:244 -- Passing the following kwargs to ray.init() on the server: log_to_driver
SIGTERM handler is not set because current thread is not the main thread.


RuntimeError: Version mismatch: The cluster was started with:
    Ray: 2.40.0
    Python: 3.9.20
This process on Ray Client was started with:
    Ray: 2.44.1
    Python: 3.9.21


In [None]:
!pip show ray

Name: ray
Version: 2.40.0
Summary: Ray provides a simple, universal API for building distributed applications.
Home-page: https://github.com/ray-project/ray
Author: Ray Team
Author-email: ray-dev@googlegroups.com
License: Apache 2.0
Location: /root/miniconda3/envs/py3920/lib/python3.9/site-packages
Requires: aiosignal, click, filelock, frozenlist, jsonschema, msgpack, packaging, protobuf, pyyaml, requests
Required-by: 


In [None]:
# Training Configuration
TRAINING_CONFIG = {
    "model_path": "/root/thu/BigDataProject/tmp/ray/ray_results", #"model-checkpoints/final-model/xgb_model",
    "test_size": 0.3,
    "num_workers": 1,
    "resources_per_worker": {"CPU": 1},  # 4
    "use_gpu": False,
    "num_boost_round": 1,
}

# XGBoost Parameters
XGBOOST_PARAMS = {
    "objective": "binary:logistic",
    "eval_metric": ["logloss", "error", "rmse", "mae", "auc"],
    "tree_method": "hist",
    "max_depth": 3,  # Set fixed value for max_depth
    "eta": 0.3,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
}

# Feature Configuration
FEATURE_COLUMNS = [
    "brand",
    "price",
    "event_weekday",
    "category_code_level1",
    "category_code_level2",
    "activity_count",
    "is_purchased",
]

CATEGORICAL_COLUMNS = [
    "brand",
    "event_weekday",
    "category_code_level1",
    "category_code_level2",
]

# Model Configuration
MODEL_NAME = "purchase_prediction_model"

In [None]:
!pip -q install IPython

[0m

In [None]:
ray.shutdown()
ray.init()

2025-04-16 16:06:36,495	INFO worker.py:1812 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8266 [39m[22m


0,1
Python version:,3.9.20
Ray version:,2.40.0
Dashboard:,http://127.0.0.1:8266


[36m(XGBoostTrainer pid=981964)[0m Started distributed worker processes: 
[36m(XGBoostTrainer pid=981964)[0m - (node_id=581f1d76228192da9cb69a88ab0bf9e3a003269de5b003767de24040, ip=10.200.2.51, pid=982076) world_rank=0, local_rank=0, node_rank=0
[36m(RayTrainWorker pid=982076)[0m [16:06:56] Task [xgboost.ray-rank=00000000]:86c58a5299b6e5f1a4cb8f4501000000 got rank 0
[36m(SplitCoordinator pid=982148)[0m Starting execution of Dataset. Full logs are in /tmp/ray/session_2025-04-16_16-06-34_502954_969871/logs/ray-data
[36m(SplitCoordinator pid=982148)[0m Execution plan of Dataset: InputDataBuffer[Input] -> OutputSplitter[split(1, equal=True)]
[36m(XGBoostTrainer pid=983742)[0m Started distributed worker processes: 
[36m(XGBoostTrainer pid=983742)[0m - (node_id=581f1d76228192da9cb69a88ab0bf9e3a003269de5b003767de24040, ip=10.200.2.51, pid=983819) world_rank=0, local_rank=0, node_rank=0
[36m(RayTrainWorker pid=983819)[0m [16:09:05] Task [xgboost.ray-rank=00000000]:d0630f85dae27

In [None]:
import xgboost as xgb 

In [None]:
class ModelPipeline:
    def __init__(self):        
        # Kết nối đến PostgreSQL
        self.postgres_conn = psycopg2.connect(
            dbname="airflow",
            user="airflow",
            password="airflow",
            host="10.200.2.51",
            port="5432"
        )
    
        mlflow.set_tracking_uri("http://10.200.2.51:5001")
    
    def filter_features(self, df: pd.DataFrame) -> pd.DataFrame:
        columns_to_exclude = ["event_timestamp", "user_id", "product_id", "user_session", "is_purchased"]
        feature_columns = [col for col in df.columns if col not in columns_to_exclude]
        return df[feature_columns + ["is_purchased"]]

    def load_training_data(self) -> Dict[str, List[Dict]]:
        # Truy vấn và đọc dữ liệu từ cơ sở dữ liệu
        query = """
        SELECT
            CAST(event_timestamp AS text) AS event_timestamp,
            user_id,
            product_id,
            user_session,
            price,
            brand,
            category_code_level1,
            category_code_level2,
            event_weekday,
            activity_count,
            is_purchased
        FROM processed_data
        """

        df = pd.read_sql(query, self.postgres_conn)
        self.postgres_conn.close()

        df['event_timestamp'] = pd.to_datetime(df['event_timestamp'], errors='coerce')
        df["price"] = df["price"].astype(float)

        # Create category mappings
        category_mappings = {}
        for col in CATEGORICAL_COLUMNS:
            unique_values = df[col].dropna().unique()
            category_mapping = {val: idx for idx, val in enumerate(sorted(unique_values))}
            category_mappings[col] = category_mapping
            df[col] = df[col].map(category_mapping).fillna(-1)

        return {
            "data": df.to_dict(orient="records"),
            "category_mappings": category_mappings,
        }

    def cu_train_final_model(self, data: dict) -> dict:
        # Huấn luyện mô hình XGBoost với tham số cố định
        experiment_name = f"xgb_final_{datetime.now().strftime('%Y%m%d%H%M%S')}"
        mlflow.create_experiment(experiment_name)

        with mlflow.start_run() as run:
            # Lấy dữ liệu đã lọc
            df = pd.DataFrame(data["data"])
            filtered_df = self.filter_features(df)
            dataset = ray.data.from_pandas(filtered_df)

            # Huấn luyện mô hình
            trainer = XGBoostTrainer(
                label_column="is_purchased",
                num_boost_round=TRAINING_CONFIG["num_boost_round"],
                params=XGBOOST_PARAMS,
                datasets={"train": dataset},
            )
            
            result = trainer.fit()

            # Log model và metrics vào MLflow
            mlflow.xgboost.log_model(
                result.checkpoint.get_model(),
                "model",
                registered_model_name=MODEL_NAME,
            )

            if "category_mappings" in data:
                mlflow.log_dict(data["category_mappings"], "category_mappings.json")

            return {
                "metrics": result.metrics,
                "checkpoint_path": result.checkpoint.path,
                "mlflow_run_id": run.info.run_id,
                "mlflow_model_uri": f"models:/{MODEL_NAME}/Staging",
            }
            
    def train_final_model(self, data: dict) -> dict:
        # Huấn luyện mô hình XGBoost với tham số cố định
        experiment_name = f"xgb_final_{datetime.now().strftime('%Y%m%d%H%M%S')}"
        mlflow.create_experiment(experiment_name)

        with mlflow.start_run() as run:
            # Lấy dữ liệu đã lọc
            df = pd.DataFrame(data["data"])
            filtered_df = self.filter_features(df)
            dataset = ray.data.from_pandas(filtered_df)

            # Huấn luyện mô hình
            trainer = XGBoostTrainer(
                label_column="is_purchased",
                num_boost_round=TRAINING_CONFIG["num_boost_round"],
                params=XGBOOST_PARAMS,
                datasets={"train": dataset},
            )

            result = trainer.fit()

            # Load model from checkpoint using XGBoost API
            best_model_path = os.path.join(result.checkpoint.path, "model.ubj")
            best_model = xgb.Booster()  # Tạo một đối tượng Booster trống
            best_model.load_model(best_model_path)  # Tải mô hình từ tệp .ubj

            # Log model and metrics to MLflow using mlflow.pyfunc.log_model
            with open("requirements.txt", "w") as f:  # Tạo tệp requirements tạm thời
                f.write("xgboost==" + xgb.__version__)  # Ghi phiên bản XGBoost vào
            mlflow.pyfunc.log_model(
                python_model=best_model,  # Truyền trực tiếp đối tượng mô hình XGBoost
                artifact_path="model",
                registered_model_name=MODEL_NAME,
                # Thêm dependencies để đảm bảo môi trường phục vụ có XGBoost
                # Nếu bạn có các dependencies khác, hãy thêm chúng vào requirements.txt
                pip_requirements=["requirements.txt"],  
            )

            mlflow.log_metrics(result.metrics)

            if "category_mappings" in data:
                mlflow.log_dict(data["category_mappings"], "category_mappings.json")

            return {
                "metrics": result.metrics,
                "checkpoint_path": result.checkpoint.path,
                "mlflow_run_id": run.info.run_id,
                "mlflow_model_uri": f"runs:/{run.info.run_id}/model",  # corrected mlflow_model_uri
            }            

In [None]:
# Pipeline: Load dữ liệu và huấn luyện mô hình
pipeline = ModelPipeline()

In [None]:
# Load and preprocess data
data = pipeline.load_training_data()

  df = pd.read_sql(query, self.postgres_conn)
  df['event_timestamp'] = pd.to_datetime(df['event_timestamp'], errors='coerce')


In [None]:
# Train final model
results = pipeline.train_final_model(data)

2025-04-16 20:40:37,612	ERROR services.py:1420 -- Ray Client is not connected. Please connect by calling `ray.init`.


🏃 View run delightful-stoat-274 at: http://10.200.2.51:5001/#/experiments/0/runs/f99d858790e245f796c1f7d5166b1cef
🧪 View experiment at: http://10.200.2.51:5001/#/experiments/0


AssertionError: 

In [None]:
# Kết quả
print("Training completed successfully!")
print(f"Final metrics: {results['metrics']}")
print(f"Model URI: {results['mlflow_model_uri']}")