In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import mlflow
import os
from datetime import timedelta
import pendulum

# --- Data Loading and Preprocessing ---
def load_and_preprocess_data(file_path="train_small.csv", random_state=None):
    """Load the event CSV and turn it into a fully numeric, model-ready frame.

    Steps performed:
      1. Parse ``event_time`` and derive ``event_weekday`` (0 = Monday).
      2. Split ``category_code`` on the first "." into two level columns.
      3. Add placeholder ``is_purchased`` / ``activity_count`` columns
         (random values -- replace with real data before drawing conclusions).
      4. Fill missing values and label-encode the categorical columns.

    Args:
        file_path: Path of the CSV file to read. It must provide the
            ``event_time``, ``category_code``, ``brand`` and ``price`` columns.
        random_state: Optional seed for the placeholder random columns. With
            the default ``None`` the global NumPy random state is used, so
            results differ between runs unless the caller seeded it.

    Returns:
        A ``(data, label_encoders)`` tuple: the selected feature/target
        columns as a new DataFrame, and a dict mapping each encoded column
        name to its fitted ``LabelEncoder``.
    """
    data = pd.read_csv(file_path)

    # Feature Engineering & Transformations
    data["event_time"] = pd.to_datetime(data["event_time"])
    # A missing timestamp yields a NaN weekday. Use a numeric sentinel (-1)
    # rather than the string "unknown": mixing str and numbers in one column
    # makes LabelEncoder raise a TypeError.
    data["event_weekday"] = data["event_time"].dt.dayofweek.fillna(-1).astype(int)

    # If no category_code contains a ".", the split yields a single column;
    # reindex guarantees both level columns exist (the second as all-NaN).
    level_parts = (
        data["category_code"].str.split(".", n=1, expand=True).reindex(columns=[0, 1])
    )
    data["category_code_level1"] = level_parts[0]
    data["category_code_level2"] = level_parts[1]

    # Placeholder columns. Seedable so that a run can be reproduced.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    data["is_purchased"] = rng.randint(0, 2, size=len(data))  # Replace with actual target
    data["activity_count"] = rng.randint(1, 10, size=len(data))  # Example feature

    # Fill missing values (string columns only; event_weekday was handled above)
    for col in ["brand", "category_code_level1", "category_code_level2"]:
        if col in data.columns:
            data[col] = data[col].fillna("unknown")
    data["price"] = data["price"].fillna(data["price"].median())

    # Label Encoding
    label_encoders = {}
    for col in ["brand", "event_weekday", "category_code_level1", "category_code_level2"]:
        if col in data.columns:
            le = LabelEncoder()
            data[col] = le.fit_transform(data[col])
            label_encoders[col] = le
            print(f"Categories in {col}: {le.classes_}")

    # Select Features
    data = data[["brand", "price", "event_weekday", "category_code_level1", 
                 "category_code_level2", "activity_count", "is_purchased"]].copy()

    return data, label_encoders

# --- Model Training ---
def train_model(data, num_boost_round=100, early_stopping_rounds=5):
    """Train an XGBoost model, save checkpoint, and log to MLflow.

    Args:
        data: DataFrame holding the feature columns plus the binary
            ``is_purchased`` target column.
        num_boost_round: Maximum number of boosting rounds.
        early_stopping_rounds: Stop when the monitored validation metric has
            not improved for this many consecutive rounds.

    Returns:
        None. The model is written to
        ``model-checkpoints/final-model/xgb_model/xgboost_model.ubj`` and the
        parameters/metrics are sent to the active MLflow tracking backend.
        MLflow failures are reported on stdout and do not abort the function.
    """

    # Separate features and target
    X = data.drop("is_purchased", axis=1)
    y = data["is_purchased"]

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Prepare DMatrix for XGBoost
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    # Set parameters for training
    params = {
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error", "rmse", "mae", "auc"]
    }

    # XGBoost monitors the LAST entry of `evals` (and the last eval_metric)
    # for early stopping, so the held-out set must come last. Listing the
    # training set last would stop on the training metric instead.
    evals = [(dtrain, "train"), (dtest, "eval")]
    evals_result = {}  # Dictionary to store evaluation metrics
    model = xgb.train(params, dtrain, num_boost_round=num_boost_round, evals=evals,
                      early_stopping_rounds=early_stopping_rounds, verbose_eval=True,
                      evals_result=evals_result)

    # Save model checkpoint under model-checkpoints/final-model/xgb_model
    checkpoint_dir = "model-checkpoints/final-model/xgb_model"
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_path = os.path.join(checkpoint_dir, "xgboost_model.ubj")
    model.save_model(checkpoint_path)
    print(f"Model checkpoint saved to {checkpoint_path}")

    # Evaluate the model on the test set
    y_pred_proba = model.predict(dtest)
    y_pred = (y_pred_proba > 0.5).astype(int)
    # Compare plain arrays and convert to a built-in float for logging.
    accuracy = float(np.mean(y_pred == y_test.to_numpy()))

    # Start MLflow run for tracking
    try:
        with mlflow.start_run(run_name="model_training"):
            # Log the hyperparameters that were actually used for training
            # (taken from the live values so the record cannot drift).
            mlflow.log_params({
                "objective": params["objective"],
                "eval_metric": params["eval_metric"],
                "num_boost_round": num_boost_round,
                "early_stopping_rounds": early_stopping_rounds
            })

            # Log evaluation metrics for each boosting round
            for metric in params["eval_metric"]:
                for dataset in ["eval", "train"]:
                    metric_name = f"{dataset}-{metric}"
                    metric_values = evals_result[dataset][metric]
                    for step, value in enumerate(metric_values):
                        mlflow.log_metric(metric_name, value, step=step)

            # Log accuracy
            mlflow.log_metric("accuracy", accuracy)
            print(f"Metric logged: accuracy = {accuracy}")

    except Exception as e:
        # Best effort: tracking problems must not discard a trained model.
        print(f"MLflow logging failed: {e}")
        print("Model training completed, but metrics may not have been logged.")

    print(f"Model training completed with accuracy: {accuracy}")

In [2]:
# Set MLflow tracking URI and experiment
EXPERIMENT_NAME = "xgboost-training"
# The remote server can be overridden through the standard MLflow env variable.
REMOTE_TRACKING_URI = os.environ.get("MLFLOW_TRACKING_URI", "http://10.200.2.51:5001")
LOCAL_TRACKING_URI = "file:./mlruns"  # Local file store used as fallback

mlflow.set_tracking_uri(REMOTE_TRACKING_URI)  # Remote server
# Fallback to local tracking if server fails
try:
    mlflow.set_experiment(EXPERIMENT_NAME)
except Exception as e:
    print(f"Failed to connect to remote MLflow server: {e}")
    print("Switching to local tracking...")
    # Point MLflow at the local store first; retrying set_experiment alone
    # would contact the same unreachable server again.
    mlflow.set_tracking_uri(LOCAL_TRACKING_URI)
    mlflow.set_experiment(EXPERIMENT_NAME)

# Load and preprocess data
data, label_encoders = load_and_preprocess_data()
print("Data loaded and preprocessed successfully.")

# Train and log model
train_model(data)

print("Pipeline finished successfully!")

2025/04/18 01:03:45 INFO mlflow.tracking.fluent: Experiment with name 'xgboost-training' does not exist. Creating a new experiment.


Categories in brand: ['a-case' 'a-derma' 'a-elita' ... 'zvezda' 'zwillingjahenckels' 'zyxel']
Categories in event_weekday: [3]
Categories in category_code_level1: ['accessories' 'apparel' 'appliances' 'auto' 'computers' 'construction'
 'country_yard' 'electronics' 'furniture' 'kids' 'medicine' 'sport'
 'stationery' 'unknown']
Categories in category_code_level2: ['accessories.alarm' 'accessories.anti_freeze' 'accessories.compressor'
 'accessories.parktronic' 'accessories.player' 'accessories.radar'
 'accessories.videoregister' 'accessories.winch' 'audio.acoustic'
 'audio.headphone' 'audio.microphone' 'audio.music_tools.piano'
 'audio.subwoofer' 'bag' 'bathroom.bath' 'bathroom.toilet' 'bedroom.bed'
 'bedroom.blanket' 'bedroom.pillow' 'belt' 'bicycle' 'camera.photo'
 'camera.video' 'carriage' 'cartrige' 'clocks' 'components.cooler'
 'components.cpu' 'components.faucet' 'components.hdd' 'components.memory'
 'components.motherboard' 'components.power_supply'
 'components.videocards' 'costum