In [1]:
# !pip show scikit-learn
%pip -q install xgboost==1.7.6 scikit-learn==1.3.2


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
from xgboost import XGBClassifier
import mlflow
import mlflow.sklearn
import os

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
# --- Data Loading and Preprocessing ---
def load_and_preprocess_data(file_path="train_small.csv"):
    data = pd.read_csv(file_path)

    # Feature Engineering
    data["event_time"] = pd.to_datetime(data["event_time"])
    data["event_weekday"] = data["event_time"].dt.dayofweek
    data[["category_code_level1", "category_code_level2"]] = (
        data["category_code"].str.split(".", n=1, expand=True)
    )
    data["is_purchased"] = np.random.randint(0, 2, size=len(data))
    data["activity_count"] = np.random.randint(1, 10, size=len(data))

    # Fill missing
    for col in ["brand", "event_weekday", "category_code_level1", "category_code_level2"]:
        data[col] = data[col].fillna("unknown")
    data["price"] = data["price"].fillna(data["price"].median())

    # Label Encoding
    for col in ["brand", "event_weekday", "category_code_level1", "category_code_level2"]:
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])

    # Select features
    data = data[["brand", "price", "event_weekday", "category_code_level1", 
                 "category_code_level2", "activity_count", "is_purchased"]].copy()
    return data

In [None]:
# --- GridSearch + MLflow ---
def grid_search_with_mlflow(data):
    X = data.drop("is_purchased", axis=1)
    y = data["is_purchased"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    param_grid = {
        "max_depth": [3, 4, 5],
        "learning_rate": [0.05, 0.1],
        "subsample": [0.8, 1.0],
        "n_estimators": [50, 100]
    }

    xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric="logloss", verbosity=0)

    grid_search = GridSearchCV(
        estimator=xgb_clf,
        param_grid=param_grid,
        scoring="accuracy",
        cv=3,
        verbose=1,
        n_jobs=-1,
        return_train_score=True,
    )

    grid_search.fit(X_train, y_train)

    # Log best model to MLflow
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)
    logloss = log_loss(y_test, y_proba)

    with mlflow.start_run(run_name="grid_search_xgb"):
        mlflow.log_params(grid_search.best_params_)
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("auc", auc)
        mlflow.log_metric("logloss", logloss)

        import mlflow
        import mlflow.sklearn
        import mlflow.models


        mlflow.sklearn.log_model(
            sk_model=best_model,
            artifact_path="xgb_grid_model",
            input_example=X_test.head(1),
            signature=mlflow.models.infer_signature(X_test, y_pred)
        )
        print(f"✅ Best model logged with accuracy: {acc:.4f} | AUC: {auc:.4f}")


In [4]:
# --- Setup and Run ---
mlflow.set_tracking_uri("http://10.200.2.51:5000")  # MLflow server
try:
    mlflow.set_experiment("xgboost_grid_search")
except:
    mlflow.set_experiment("local_xgboost_grid")


2025/04/18 08:58:39 INFO mlflow.tracking.fluent: Experiment with name 'xgboost_grid_search' does not exist. Creating a new experiment.


In [5]:
data = load_and_preprocess_data()

In [8]:
grid_search_with_mlflow(data)

Fitting 3 folds for each of 24 candidates, totalling 72 fits




UnboundLocalError: local variable 'mlflow' referenced before assignment