In [3]:
! pip install mlflow



In [4]:
from pathlib import Path

import joblib
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder


# Pipeline: chains preprocessing steps and a final estimator into a single object.
# SimpleImputer: fills in missing values.
# StandardScaler: standardizes numeric features to zero mean and unit variance.
# OrdinalEncoder: encodes each category of a categorical feature as an integer.

In [5]:
# Point MLflow at the tracking server.
# 127.0.0.1 is the loopback address, so this only reaches a tracking server
# running on the same machine as the notebook.
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

In [8]:
# Load the training set and preview five random rows.
# The fixed seed keeps the preview identical across re-runs of the notebook.
train_data = pd.read_csv('../data/train_set.csv')
train_data.sample(5, random_state=42)

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
34508,58,30605,62185,844,105,4,9.33,36,0.36,High School,Part-time,Divorced,No,Yes,Business,Yes,0
149874,39,54221,28068,741,44,2,15.23,36,0.76,Bachelor's,Full-time,Divorced,Yes,No,Home,No,0
177380,40,43388,169921,482,60,3,7.84,12,0.76,Master's,Self-employed,Married,No,No,Auto,Yes,0
98813,42,79475,201613,675,16,3,4.38,36,0.26,Master's,Full-time,Single,No,Yes,Business,No,0
101804,58,117815,232784,766,2,2,4.03,24,0.81,Bachelor's,Unemployed,Married,No,Yes,Home,Yes,0


In [9]:
# Split the frame into the target vector y (an independent copy of the
# 'Default' column) and the feature matrix X (every other column).
y = train_data['Default'].copy()
X = train_data.drop(columns='Default')


In [10]:
# Hold out 20% of the rows for validation and train on the remaining 80%.
# The fixed random_state makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [25]:
# Partition the feature columns by dtype so that each group can be given its
# own preprocessing: object-dtype columns are treated as categorical and every
# numeric-dtype column as numeric.
categorical_columns = X_train.select_dtypes(include=['object']).columns
numeric_columns = X_train.select_dtypes(include=['number']).columns

numeric_columns

Index(['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed',
       'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio'],
      dtype='object')

In [26]:
# Show the labels of the object-dtype (categorical) feature columns.
categorical_columns

Index(['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage',
       'HasDependents', 'LoanPurpose', 'HasCoSigner'],
      dtype='object')

In [27]:
# Numeric branch of the preprocessing:
#   1. replace missing values with the column mean,
#   2. standardize each feature.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

In [None]:
# Categorical branch of the preprocessing:
#   1. replace missing values with the most frequent category,
#   2. map each category to an integer code; categories not seen during fit
#      are encoded as -1 instead of raising an error.
# NOTE(review): integer codes impose an ordering on the categories; for
# nominal features such as LoanPurpose a one-hot encoding may suit the linear
# and MLP models better - confirm this choice is intentional.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(unknown_value=-1,
                               handle_unknown='use_encoded_value')),
])



In [None]:
# Route each column group through its own pipeline and concatenate the
# results into a single preprocessing step.
pre_processing_pipeline = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numeric_columns),
    ('cat', categorical_pipeline, categorical_columns),
])

In [28]:
# Start MLflow experiment
mlflow.set_experiment("default_prediction_experiment")


def train_and_evaluate_with_mlflow(pipeline, model_name, param_grid=None):
    """Fit a model inside an MLflow run, score it on the validation set and save it.

    The training and validation data are not passed in: they are read from the
    notebook-level variables ``X_train``, ``y_train``, ``X_val`` and ``y_val``,
    which must be defined before this function is called.

    Parameters
    ----------
    pipeline : estimator
        It includes data preprocessing steps and a model.
    model_name : str
        Identifies the model being trained. Used as the MLflow run name, as
        the name the model is logged under, and as the stem of the pickle
        file written to ``../models/``.
    param_grid : dict, optional
        Parameter grid for hyperparameter tuning using ``GridSearchCV``
        (3-fold CV, scored by negative RMSE). When omitted or empty,
        ``pipeline`` is fitted directly.

    Returns
    -------
    estimator
        The fitted model: the grid search's best estimator when ``param_grid``
        is given, otherwise the fitted ``pipeline``.
    """
    with mlflow.start_run(run_name=model_name):

        if param_grid:
            search = GridSearchCV(pipeline,
                                  param_grid,
                                  cv=3,
                                  scoring='neg_root_mean_squared_error',
                                  n_jobs=-1)

            search.fit(X_train, y_train)
            # Only the winning hyperparameter combination is recorded.
            mlflow.log_params(search.best_params_)
            model = search.best_estimator_
        else:
            model = pipeline.fit(X_train, y_train)

        # Model Evaluation
        y_pred = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))

        # Logging Metrics and Models with MLflow
        mlflow.log_metric("rmse", rmse)
        mlflow.sklearn.log_model(model, model_name)

        print(f"{model_name} RMSE: {rmse:.4f}")

        # Create the output directory first: joblib.dump raises
        # FileNotFoundError when '../models' does not exist yet.
        model_path = Path(f'../models/{model_name}_pipeline.pkl')
        model_path.parent.mkdir(parents=True, exist_ok=True)
        joblib.dump(model, model_path)

        return model

In [29]:
# Baseline model: linear regression on the preprocessed features.
linear_pipeline = Pipeline(steps=[
    ('pre_processing', pre_processing_pipeline),
    ('model', LinearRegression()),
])
# No parameter grid is given, so the helper fits the pipeline directly.
linear_model = train_and_evaluate_with_mlflow(
    pipeline=linear_pipeline,
    model_name='linear_model',
)

linear_pipeline




linear_model RMSE: 0.3073
🏃 View run linear_model at: http://127.0.0.1:5000/#/experiments/938305917412530820/runs/aa63faf67095409ba4aa4a70cef81467
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/938305917412530820


In [31]:
# Random forest, tuned with a small grid search.
rf_pipeline = Pipeline(steps=[
    ('pre_processing', pre_processing_pipeline),
    ('model', RandomForestRegressor(random_state=42)),
])

# The 'model__' prefix routes each parameter to the pipeline step named 'model'.
param_grid = dict(
    model__n_estimators=[100, 200],
    model__max_depth=[10, 20],
)

# Passing a grid makes the helper run GridSearchCV and return the best estimator.
rf_model = train_and_evaluate_with_mlflow(
    pipeline=rf_pipeline,
    model_name='random_forest_model',
    param_grid=param_grid,
)

rf_pipeline



random_forest_model RMSE: 0.3039
🏃 View run random_forest_model at: http://127.0.0.1:5000/#/experiments/938305917412530820/runs/08897644914246a3a0fe5e43f523f576
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/938305917412530820


In [32]:
# Multi-layer perceptron regressor:
#   - two hidden layers, with 100 neurons in the first and 50 in the second;
#   - ReLU activation for non-linearity;
#   - Adam optimizer for the weight updates;
#   - at most 500 training iterations (max_iter is an upper bound);
#   - fixed random_state for reproducible results.
mlp_pipeline = Pipeline(steps=[
    ('pre_processing', pre_processing_pipeline),
    ('model', MLPRegressor(
        hidden_layer_sizes=(100, 50),
        activation='relu',
        solver='adam',
        max_iter=500,
        random_state=42,
    )),
])
mlp_model = train_and_evaluate_with_mlflow(
    pipeline=mlp_pipeline,
    model_name='mlp_model',
)

mlp_pipeline



mlp_model RMSE: 0.3088
🏃 View run mlp_model at: http://127.0.0.1:5000/#/experiments/938305917412530820/runs/00e6a4c6211c425ebb6a6609f3f33b9b
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/938305917412530820


In [33]:
# Ensemble: a voting regressor that combines the linear, random-forest and MLP
# models trained above. Each member is a full pipeline, so it carries its own
# preprocessing step.
ensemble_pipeline = VotingRegressor(
    estimators=[('linear', linear_model), ('rf', rf_model), ('mlp', mlp_model)]
)

# Fit, evaluate and log the ensemble exactly like the individual models.
ensemble_model = train_and_evaluate_with_mlflow(
    pipeline=ensemble_pipeline,
    model_name='ensemble_model',
)

ensemble_pipeline



ensemble_model RMSE: 0.3033
🏃 View run ensemble_model at: http://127.0.0.1:5000/#/experiments/938305917412530820/runs/523985624e454e62a525c776430a85c0
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/938305917412530820


In [34]:
# Persist the fitted ensemble under its final file name.
joblib.dump(value=ensemble_model, filename='../models/final_ensemble_model.pkl')

['../models/final_ensemble_model.pkl']