In [1]:
# %pip -q install -r /root/thu/BigDataProject/ray/requirements.txt

In [2]:
# !pip install numpy==1.24.4 pandas==2.0.3 --no-cache-dir
# conda activate py3920
#     pip uninstall pandas -y
#     pip install pandas


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import mlflow
from mlflow.tracking import MlflowClient
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
import os
from datetime import datetime, timedelta
import pendulum
import ray

ray.init("ray://10.200.2.51:10001", namespace="experiment-1", log_to_driver=False)

2025-04-18 00:10:43,244	INFO client_builder.py:244 -- Passing the following kwargs to ray.init() on the server: log_to_driver
SIGTERM handler is not set because current thread is not the main thread.


0,1
Python version:,3.9.20
Ray version:,2.40.0
Dashboard:,http://172.20.0.13:8265


In [4]:
# Define your configurations (consistent with the provided information)
TRAINING_CONFIG = {
    "model_path": "model-checkpoints/final-model/xgb_model",
    "test_size": 0.3,
    "num_workers": 1,
    "resources_per_worker": {"CPU": 1},  # 4
    "use_gpu": False,
    "num_boost_round": 1,
}

XGBOOST_PARAMS = {
    "objective": "binary:logistic",
    "eval_metric": ["logloss", "error", "rmse", "mae", "auc"],
    "tree_method": "hist",
    "max_depth": 1,
    "eta": 0.3,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
}

FEATURE_COLUMNS = [
    "brand",
    "price",
    "event_weekday",
    "category_code_level1",
    "category_code_level2",
    "activity_count",
    "is_purchased",
]

CATEGORICAL_COLUMNS = [
    "brand",
    "event_weekday",
    "category_code_level1",
    "category_code_level2",
]

DEFAULT_ARGS = {
    "owner": "airflow",
    "depends_on_past": False,
    "email_on_failure": True,
    "email_on_retry": False,
    "retries": 2,
    "retry_delay": timedelta(minutes=5),
    "retry_exponential_backoff": True,
    "max_retry_delay": timedelta(minutes=30),
    "execution_timeout": timedelta(hours=2),
    "start_date": pendulum.datetime(2024, 1, 1, tz="UTC"),
}

TUNE_CONFIG = {
    "model_path": "model-checkpoints/hyperparameter-tuning/xgb_model",
    "num_trials": 1,  # Number of trials for hyperparameter search
    "max_epochs": 1,  # Maximum epochs per trial
    "grace_period": 1,  # Minimum epochs before pruning
    "mlflow_tracking_uri": os.getenv("MLFLOW_TRACKING_URI", "http://10.200.2.51:5001"),
}

TUNE_SEARCH_SPACE = {
    "max_depth": tune.randint(3, 5),
    "learning_rate": tune.loguniform(1e-4, 1e-1),
    "min_child_weight": tune.choice([1, 2, 3, 4, 5]),
    "subsample": tune.uniform(0.5, 1.0),
    "colsample_bytree": tune.uniform(0.5, 1.0),
    "gamma": tune.uniform(0, 1),
}

MODEL_NAME = "purchase_prediction_model"

In [5]:
# --- Data Loading and Preprocessing ---
def load_and_preprocess_data(file_path="train_small.csv"):  # Adjust file path as needed
    data = pd.read_csv(file_path)

    # Feature Engineering & Transformations (adapt to your actual data)
    data["event_time"] = pd.to_datetime(data["event_time"])
    data["event_weekday"] = data["event_time"].dt.dayofweek
    data[["category_code_level1", "category_code_level2"]] = (
        data["category_code"].str.split(".", n=1, expand=True)
    )
    data["is_purchased"] = np.random.randint(
        0, 2, size=len(data)
    )  # Replace with actual target variable
    data["activity_count"] = np.random.randint(
        1, 10, size=len(data)
    )  # Example activity count

    # Fill missing values (handle appropriately for your data)
    for col in CATEGORICAL_COLUMNS:
        if col in data.columns:
            data[col] = data[col].fillna("unknown")
    if "brand" in data.columns:
        data["brand"] = data["brand"].fillna("unknown")

    data["price"] = data["price"].fillna(data["price"].median())

    # Label Encoding (fit on the entire dataset for consistency)
    label_encoders = {}
    for col in CATEGORICAL_COLUMNS:
        if col in data.columns:
            le = LabelEncoder()
            data[col] = le.fit_transform(data[col])
            label_encoders[col] = le
            print(f"Categories in {col}: {le.classes_}")

    # Select Features
    data = data[FEATURE_COLUMNS].copy()

    return data, label_encoders

In [11]:
def tune_model(data):
    """Performs hyperparameter tuning using Ray Tune."""
    reporter = CLIReporter(
        parameter_columns=list(TUNE_SEARCH_SPACE.keys()), metric_columns=["accuracy"]
    )
    scheduler = ASHAScheduler(
        metric="accuracy",
        mode="max",
        max_t=TUNE_CONFIG["max_epochs"],
        grace_period=TUNE_CONFIG["grace_period"],
    )
    analysis = tune.run(
        tune.with_parameters(train_model, data=data),
        resources_per_trial={"cpu": 1},
        config=TUNE_SEARCH_SPACE,
        num_samples=TUNE_CONFIG["num_trials"],
        scheduler=scheduler,
        progress_reporter=reporter,
        name="tune_xgboost",
        log_to_file=True,  # <-- thêm dòng này

    )
    
    print(f"Best hyperparameters found: {analysis.best_config}")
    return analysis.best_config

In [12]:
# Set MLflow tracking URI
mlflow.set_tracking_uri(TUNE_CONFIG["mlflow_tracking_uri"])

# Load and preprocess data
data, label_encoders = load_and_preprocess_data()
print("Data loaded and preprocessed successfully.")

Categories in brand: ['a-case' 'a-derma' 'a-elita' ... 'zvezda' 'zwillingjahenckels' 'zyxel']
Categories in event_weekday: [3]
Categories in category_code_level1: ['accessories' 'apparel' 'appliances' 'auto' 'computers' 'construction'
 'country_yard' 'electronics' 'furniture' 'kids' 'medicine' 'sport'
 'stationery' 'unknown']
Categories in category_code_level2: ['accessories.alarm' 'accessories.anti_freeze' 'accessories.compressor'
 'accessories.parktronic' 'accessories.player' 'accessories.radar'
 'accessories.videoregister' 'accessories.winch' 'audio.acoustic'
 'audio.headphone' 'audio.microphone' 'audio.music_tools.piano'
 'audio.subwoofer' 'bag' 'bathroom.bath' 'bathroom.toilet' 'bedroom.bed'
 'bedroom.blanket' 'bedroom.pillow' 'belt' 'bicycle' 'camera.photo'
 'camera.video' 'carriage' 'cartrige' 'clocks' 'components.cooler'
 'components.cpu' 'components.faucet' 'components.hdd' 'components.memory'
 'components.motherboard' 'components.power_supply'
 'components.videocards' 'costum

In [13]:
# %pip -q install pyarrow==13.0.0 --force-reinstall
# %pip -q install cloudpickle==2.2.1

In [14]:
# %pip install "numpy<2" --force-reinstall

In [16]:
from ray.air import session
from xgboost.callback import EarlyStopping

def train_model(config, data):
    try:
        X = data.drop("is_purchased", axis=1)
        y = data["is_purchased"]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        model = xgb.XGBClassifier(**config)
        model.fit(
            X_train, y_train,
            eval_set=[(X_test, y_test)],
            callbacks=[EarlyStopping(rounds=10)],
            verbose=False
        )

        accuracy = model.score(X_test, y_test)
        session.report({"accuracy": accuracy})

        with mlflow.start_run():
            mlflow.log_params(config)
            mlflow.xgboost.log_model(model, "model")
            mlflow.log_metric("accuracy", accuracy)

    except Exception as e:
        import traceback
        print("❌ Error in train_model:", traceback.format_exc())
        session.report({"error": str(e)})


In [17]:
from ray import tune
from sklearn.metrics import accuracy_score

def train_model(config, data):
    # Tách dữ liệu
    X_train, X_test, y_train, y_test = train_test_split(...)

    # Khởi tạo model từ config
    model = XGBClassifier(**config)

    # Huấn luyện
    model.fit(X_train, y_train)

    # Dự đoán
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)

    # Báo cáo về Ray Tune
    tune.report(accuracy=acc)

In [18]:
# Model Tuning
best_params = tune_model(data.copy())
print("Model tuning completed.")

2025-04-18 00:10:56,455	INFO tune.py:539 -- [output] This uses the legacy output and progress reporter, as Ray client is not supported by the new engine. For more information, see https://github.com/ray-project/ray/issues/36949


RayTaskError(TuneError): [36mray::run()[39m (pid=29115, ip=172.20.0.13)
  File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/tune/tune.py", line 1035, in run
    raise TuneError("Trials did not complete", incomplete_trials)
ray.tune.error.TuneError: ('Trials did not complete', [train_model_eea72_00000])

In [None]:
# Train and Register Final Model
final_model = train_final_model(data.copy(), best_params)
print("Final model training and registration complete.")

print("Pipeline finished successfully!")

In [None]:
from ray.air import session
from xgboost.callback import EarlyStopping
import traceback

def train_model(config, data):
    """Trains an XGBoost model with given config and logs metrics to Tune and MLflow."""
    try:
        # Separate features and target
        X = data.drop("is_purchased", axis=1)
        y = data["is_purchased"]

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Initialize the model with the current config
        model = xgb.XGBClassifier(**config)

        # Add EarlyStopping callback to avoid overfitting and unnecessary training
        early_stopping = EarlyStopping(rounds=10, metric='logloss', data_name='eval_set', save_best=True)

        # Train the model
        model.fit(
            X_train, y_train,
            eval_set=[(X_test, y_test)],
            eval_metric="logloss",  # Add relevant evaluation metric
            early_stopping_rounds=10,
            verbose=False,
            callbacks=[early_stopping]
        )

        # Evaluate the model (using accuracy as the metric)
        accuracy = model.score(X_test, y_test)

        # Log metrics for Ray Tune
        tune.report(accuracy=accuracy)

        # Start MLflow run for tracking
        with mlflow.start_run():
            # Log hyperparameters
            mlflow.log_params(config)

            # Log model
            mlflow.xgboost.log_model(model, "model")

            # Log metrics
            mlflow.log_metric("accuracy", accuracy)
            
    except Exception as e:
        # Log error and trace for debugging
        print("❌ Error in train_model:", e)
        print(traceback.format_exc())
        tune.report(error=str(e))  # Report error to Ray Tune

# Hyperparameter tuning with Ray Tune
def tune_model(data):
    """Performs hyperparameter tuning using Ray Tune."""
    reporter = CLIReporter(
        parameter_columns=list(TUNE_SEARCH_SPACE.keys()), metric_columns=["accuracy"]
    )
    scheduler = ASHAScheduler(
        metric="accuracy",
        mode="max",
        max_t=TUNE_CONFIG["max_epochs"],
        grace_period=TUNE_CONFIG["grace_period"],
    )

    try:
        # Run Ray Tune with the configuration
        analysis = tune.run(
            tune.with_parameters(train_model, data=data),
            resources_per_trial={"cpu": 1},
            config=TUNE_SEARCH_SPACE,
            num_samples=TUNE_CONFIG["num_trials"],
            scheduler=scheduler,
            progress_reporter=reporter,
            name="tune_xgboost",
            log_to_file=True,  # Log to file to help in debugging
        )
        
        print(f"Best hyperparameters found: {analysis.best_config}")
        return analysis.best_config
    except Exception as e:
        print("❌ Error in Ray Tune:", e)
        return None


Categories in brand: ['a-case' 'a-derma' 'a-elita' ... 'zvezda' 'zwillingjahenckels' 'zyxel']
Categories in event_weekday: [3]
Categories in category_code_level1: ['accessories' 'apparel' 'appliances' 'auto' 'computers' 'construction'
 'country_yard' 'electronics' 'furniture' 'kids' 'medicine' 'sport'
 'stationery' 'unknown']
Categories in category_code_level2: ['accessories.alarm' 'accessories.anti_freeze' 'accessories.compressor'
 'accessories.parktronic' 'accessories.player' 'accessories.radar'
 'accessories.videoregister' 'accessories.winch' 'audio.acoustic'
 'audio.headphone' 'audio.microphone' 'audio.music_tools.piano'
 'audio.subwoofer' 'bag' 'bathroom.bath' 'bathroom.toilet' 'bedroom.bed'
 'bedroom.blanket' 'bedroom.pillow' 'belt' 'bicycle' 'camera.photo'
 'camera.video' 'carriage' 'cartrige' 'clocks' 'components.cooler'
 'components.cpu' 'components.faucet' 'components.hdd' 'components.memory'
 'components.motherboard' 'components.power_supply'
 'components.videocards' 'costum

2025-04-17 11:04:46,003	INFO tune.py:539 -- [output] This uses the legacy output and progress reporter, as Ray client is not supported by the new engine. For more information, see https://github.com/ray-project/ray/issues/36949


❌ Error in Ray Tune: [36mray::run()[39m (pid=18581, ip=172.20.0.13)
  File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/tune/tune.py", line 1035, in run
    raise TuneError("Trials did not complete", incomplete_trials)
ray.tune.error.TuneError: ('Trials did not complete', [train_model_195cc_00000, train_model_195cc_00001])
Best hyperparameters: None


In [None]:
# Run the model tuning
mlflow.set_tracking_uri(TUNE_CONFIG["mlflow_tracking_uri"])
data, label_encoders = load_and_preprocess_data()

In [None]:
best_params = tune_model(data)
print("Best hyperparameters:", best_params)

2025-04-17 11:05:03,764	INFO tune.py:539 -- [output] This uses the legacy output and progress reporter, as Ray client is not supported by the new engine. For more information, see https://github.com/ray-project/ray/issues/36949


❌ Error in Ray Tune: [36mray::run()[39m (pid=19477, ip=172.20.0.13)
  File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/tune/tune.py", line 1035, in run
    raise TuneError("Trials did not complete", incomplete_trials)
ray.tune.error.TuneError: ('Trials did not complete', [train_model_25f7e_00000, train_model_25f7e_00001])
Best hyperparameters: None
