In [2]:
# Imports
from pathlib import Path

import joblib
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [3]:
# Filesystem locations, given relative to the notebook's working directory.
# MODEL_PATH: file the fitted classifier is serialized to.
MODEL_PATH: str = "../models/return_model.joblib"
# DATA_PATH: directory prefix for the parquet splits. Keep the trailing slash:
# file names are appended to it with plain string concatenation.
DATA_PATH: str = "../data/processed/"

In [4]:
# Read the four parquet splits (train/test features and targets) from DATA_PATH.
# The generator is consumed in order by the tuple unpacking, so the files are
# read in the same sequence as the names on the left-hand side.
X_train, y_train, X_test, y_test = (
    pd.read_parquet(DATA_PATH + parquet_name)
    for parquet_name in (
        "X_train.parquet",
        "y_train.parquet",
        "X_test.parquet",
        "y_test.parquet",
    )
)

In [5]:
# Sanity-check the loaded splits before spending time on training.
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

# Fail fast, with a readable message, if the splits are misaligned rather than
# erroring later inside fit()/predict() or the metric functions.
assert len(X_train) == len(y_train), "X_train and y_train have different row counts"
assert len(X_test) == len(y_test), "X_test and y_test have different row counts"
assert list(X_train.columns) == list(X_test.columns), "train/test feature columns differ"

Train shape: (91259, 6), Test shape: (22815, 6)


In [9]:
# Train a random-forest classifier, evaluate it on the test split, and record
# parameters, metrics and the fitted model in a single MLflow run.

# Single source of truth for the hyperparameters: the same dict configures the
# estimator and is logged, so the recorded values cannot drift from those used.
rf_params = {"n_estimators": 100, "max_depth": 10, "random_state": 42}

# pd.read_parquet returns DataFrames, so the targets arrive as column vectors;
# passing one to fit() emits sklearn's DataConversionWarning. Flatten to 1-D
# locally (the original y_train / y_test objects are left untouched).
y_train_1d = y_train.to_numpy().ravel()
y_test_1d = y_test.to_numpy().ravel()

with mlflow.start_run():
    # Log parameters first so they are recorded even if a later step fails.
    mlflow.log_params(rf_params)

    # Train
    clf = RandomForestClassifier(**rf_params)
    clf.fit(X_train, y_train_1d)

    # Predict
    y_pred = clf.predict(X_test)

    # Metrics (f1_score uses its default binary averaging on the positive class)
    acc = accuracy_score(y_test_1d, y_pred)
    f1 = f1_score(y_test_1d, y_pred)

    print(f"Accuracy: {acc:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(classification_report(y_test_1d, y_pred))

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("f1_score", f1)

    # Save model: make sure the target directory exists so joblib.dump cannot
    # fail with FileNotFoundError after the training work is already done.
    Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(clf, MODEL_PATH)
    mlflow.sklearn.log_model(clf, "return_model")

print("✅ Training complete and logged to MLflow.")

  return fit_method(estimator, *args, **kwargs)


Accuracy: 0.8769
F1 Score: 0.3641
              precision    recall  f1-score   support

           0       0.88      0.98      0.93     19498
           1       0.73      0.24      0.36      3317

    accuracy                           0.88     22815
   macro avg       0.81      0.61      0.65     22815
weighted avg       0.86      0.88      0.85     22815





✅ Training complete and logged to MLflow.
