In [None]:
# Notebook bootstrap: configure MLflow tracking, expose the project helpers,
# then import the modelling stack. Statement order matters in this cell.
import mlflow
# Use a local file-based tracking store at ../mlruns (relative path).
mlflow.set_tracking_uri("file:../mlruns")

import sys
# Put ../src on the module search path so the `functions` import below resolves.
sys.path.append('../src')
# Star import: every public name exported by `functions` lands in the notebook
# namespace; the imports further down rebind any name that happens to collide.
from functions import *

# MLflow's CatBoost flavor (used for model logging).
import mlflow.catboost
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
# NOTE(review): f1_score and time are not referenced in the cells visible here.
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import time

In [None]:
# Experiment setup. `file_name` names the MLflow experiment and is reused by the
# training cell (logged as "dataset_name" and as the notebook artifact stem).
file_name = "ryan_CatBoost"
mlflow.set_experiment(file_name)
data_path = "../data/processed/train.csv"
df = pd.read_csv(data_path)

# Define Hyperparameters
params = {
    'iterations': 500,
    'learning_rate': 0.1,
    # NOTE(review): 16 is the upper limit CatBoost documents for CPU training
    # and makes every tree very expensive -- confirm this is intended.
    'depth': 16,
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'verbose': 100,
    'early_stopping_rounds': 50
}

# Dataset Preparation

# Shuffle the rows with a fixed seed so the row order is reproducible.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Train Test Split: drop the identifier and the target from the feature matrix.
X = df.drop(columns=['id', 'Heart Disease'])
y = df['Heart Disease']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create CatBoost Pools.
# This has to happen AFTER the split: the pools wrap X_train/X_test/y_train/y_test,
# so building them earlier raises NameError on a fresh kernel (or silently reuses
# a stale split left over from a previous run).
cat_features = ['Sex']
train_pool = Pool(X_train, y_train, cat_features=cat_features)
test_pool = Pool(X_test, y_test, cat_features=cat_features)

display(X_train.head())

In [None]:


with mlflow.start_run() as run:

    # ---- Run metadata ------------------------------------------------------
    # The notebook/experiment name is what gets recorded under "dataset_name".
    mlflow.log_param("dataset_name", file_name)
    # mlflow.log_artifact(data_path, artifact_path="datasets")

    # Snapshot the helper module and this notebook next to the run.
    for source_path in ("../src/functions.py", f"{file_name}.ipynb"):
        mlflow.log_artifact(source_path, artifact_path="source_code")

    # Record the hyperparameter dict exactly as it is handed to CatBoost.
    mlflow.log_params(params)

    # ---- Training ----------------------------------------------------------
    model = CatBoostClassifier(**params)

    # NOTE(review): test_pool serves as the eval set here (with use_best_model),
    # and the metrics below are computed on the same test split, so they are
    # optimistic; a separate validation split would give an unbiased estimate.
    model.fit(train_pool, eval_set=test_pool, use_best_model=True)

    # ---- Evaluation --------------------------------------------------------
    preds = model.predict(X_test)
    # Second column of the probability matrix is used as the ROC AUC score.
    probs = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, preds)
    roc_auc = roc_auc_score(y_test, probs)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")

    # ---- Tracking ----------------------------------------------------------
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("roc_auc", roc_auc)

    # Iteration chosen on the eval set; stored as a run parameter.
    best_iteration = model.get_best_iteration()
    mlflow.log_param("best_iteration", best_iteration)

    # Persist the fitted model through MLflow's CatBoost flavor.
    mlflow.catboost.log_model(model, artifact_path="model")

    print("Done")