In [71]:
import os

import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from mlflow.models.signature import infer_signature
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from zenml import step, pipeline, log_metadata


In [72]:
# Location of the heart-disease CSV. Set the HEART_DATA_PATH environment
# variable to run the notebook on another machine; when it is unset the
# original hard-coded location is used, so existing setups keep working.
DATA_PATH = os.environ.get(
    'HEART_DATA_PATH',
    '/home/sumair/Desktop/mlflow/Mlops/data/heart.csv',
)
data = pd.read_csv(DATA_PATH)

In [73]:
# Preview the first two rows to check the column names and raw value formats.
data.head(n=2)

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2


In [74]:
def _coerce_numeric_columns(frame):
    """Return a copy of ``frame`` whose number-like columns are numeric.

    A column is converted only when every non-missing value parses as a
    number. Columns holding genuine text (for example 'Male' or
    'typical angina') are left untouched; coercing them would turn every
    value into NaN and silently destroy the categorical features.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to convert. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns in the same order.
    """
    converted = frame.copy()
    for column in converted.columns:
        as_number = pd.to_numeric(converted[column], errors='coerce')
        # errors='coerce' maps unparseable values to NaN, so an unchanged
        # count of non-missing values proves the conversion lost nothing.
        if as_number.notna().sum() == converted[column].notna().sum():
            converted[column] = as_number
    return converted


# '?' is treated as a missing-value marker; map it to NaN before converting.
data = _coerce_numeric_columns(data.replace('?', np.nan))

In [75]:
# Discard rows whose 'num' value is missing; the label is derived from it.
data = data.dropna(subset=['num'])

In [76]:
# Binarise 'num': any value above zero becomes 1, everything else 0.
# The new 'target' column is appended and 'num' is removed afterwards.
data = (
    data
    .assign(target=(data['num'] > 0).astype('int64'))
    .drop(columns='num')
)

In [77]:
# 'Unnamed: 0' and 'id' are row identifiers, not measurements. Leaving them
# in the feature matrix lets models key on row order instead of the clinical
# variables, so they are removed when present.
ID_COLUMNS = ['Unnamed: 0', 'id']

# Features: everything except the label and the identifier columns.
X = data.drop(columns='target').drop(columns=ID_COLUMNS, errors='ignore')
# Label: the binary 'target' column.
y = data['target']

In [78]:
# Send every feature to exactly one preprocessing branch. Listing only
# int64/float64/object would leave columns of any other dtype (bool, int32,
# category, string, ...) in neither list, and they would then be dropped.
# NOTE(review): a datetime column would land in `categorical` here; none is
# expected in this dataset, but confirm if the schema changes.
NUMERIC_DTYPES = ['number', 'bool']
numerical = X.select_dtypes(include=NUMERIC_DTYPES).columns.tolist()
categorical = X.select_dtypes(exclude=NUMERIC_DTYPES).columns.tolist()

In [79]:
# Preprocessing pipelines

# Numeric features: fill gaps with the column mean, then standardise.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' encodes categories unseen during fit as
# all zeros instead of raising at predict time.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Apply each pipeline to its column group and concatenate the results.
# sparse_threshold=0 forces a dense output: the one-hot block is sparse, and
# estimators such as GaussianNB reject sparse input, so the output format
# must not depend on how dense the combined matrix happens to be.
preprocessor = ColumnTransformer(
    [
        ('num', numeric_pipeline, numerical),
        ('cat', categorical_pipeline, categorical)
    ],
    sparse_threshold=0
)


In [87]:
# One seed for every stochastic component so repeated runs of the notebook
# log the same metrics for the same model.
RANDOM_STATE = 42

# Candidate classifiers, keyed by the display name used for the MLflow run.
# Estimators that draw random numbers during fit (tree, forest, MLP) are
# seeded; the others are left at their defaults.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC(),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "Naive Bayes": GaussianNB(),
    "Neural Network (MLP)": MLPClassifier(max_iter=1000, random_state=RANDOM_STATE)
}

# Hold out 20% of the rows for evaluation; stratify on the label so both
# splits keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

In [88]:
# Point MLflow at the local tracking server and select the experiment
# (MLflow creates the experiment on first use if it does not exist).
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("Heart Disease Classification")

# Close a run left open by an earlier, interrupted execution so that
# start_run() below does not fail on an already-active run.
if mlflow.active_run():
    mlflow.end_run()

# Train, evaluate and log every candidate model in its own MLflow run.
for name, model in models.items():
    with mlflow.start_run(run_name=name):
        # Full pipeline: shared preprocessing followed by the classifier.
        # Deliberately NOT named `pipeline`: this notebook imports a
        # `pipeline` decorator from zenml, and rebinding that name here
        # would replace the decorator with a fitted sklearn Pipeline.
        model_pipeline = Pipeline([
            ('preprocessing', preprocessor),
            ('classifier', model)
        ])

        # Train and predict
        model_pipeline.fit(X_train, y_train)
        y_pred = model_pipeline.predict(X_test)

        # Evaluate. Macro averaging weights both classes equally;
        # zero_division=0 scores a class with no predictions as 0 instead
        # of emitting a warning.
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, average='macro', zero_division=0)
        rec = recall_score(y_test, y_pred, average='macro', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

        # Log to MLflow (all metrics in a single call)
        mlflow.log_param("model_name", name)
        mlflow.log_metrics({
            "accuracy": acc,
            "precision": prec,
            "recall": rec,
            "f1_score": f1,
        })

        # Log model with input signature. The first test row is used as the
        # input example so the logged artifact is identical across re-runs
        # (a random sample would change it on every execution).
        input_example = X_test.head(1)
        signature = infer_signature(X_test, y_pred)

        mlflow.sklearn.log_model(
            sk_model=model_pipeline,
            artifact_path="model",
            input_example=input_example,
            signature=signature
        )

        # Console output
        print(f"\n📊 {name}")
        print(f"Accuracy :  {acc:.3f}")
        print(f"Precision:  {prec:.3f}")
        print(f"Recall   :  {rec:.3f}")
        print(f"F1 Score :  {f1:.3f}")

        # Confusion matrix on an explicit figure; the figure is closed after
        # display so the loop does not accumulate open figures.
        fig, ax = plt.subplots()
        ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax)
        ax.set_title(name)
        plt.show()
        plt.close(fig)


2025/05/07 00:26:53 INFO mlflow.tracking.fluent: Experiment with name 'Heart Disease Classification' does not exist. Creating a new experiment.


🏃 View run Logistic Regression at: http://localhost:5000/#/experiments/1/runs/6ed3a12b851d411d9f190682ef92d80d
🧪 View experiment at: http://localhost:5000/#/experiments/1
