## Notebook with Mlflow Tracking


In [1]:
#import libraries
import pickle
import mlflow

import numpy as np
import pandas as pd
import xgboost as xgb
import seaborn as sns

from xgboost import DMatrix
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score

In [2]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")

mlflow.set_experiment("mlops-project")

<Experiment: artifact_location='/home/ubuntu/mlops-project/mlops_customer_churn_prediction/notebooks/artifacts/1', creation_time=1689392749595, experiment_id='1', last_update_time=1689392749595, lifecycle_stage='active', name='mlops-project', tags={}>

In [3]:
#read data
def read_data(file_path):
    """
    create dataframe from csv file
    params:csv file 
    returns: dataframe 
    rtype: 
    """
    df = pd.read_csv(file_path)
    return df 

In [4]:
#read data
data = read_data("../data/bank-customers/Churn Modeling.csv")

#create copy of data for analysis
df = data.copy()

### Data Preparation and Exploratory Data Analysis


In [5]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [7]:
#numerical information about the dataset
df.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [8]:
#encoding categorical column
df.Gender.replace(["Female", "Male"], [0, 1], inplace=True)

In [9]:
#removing redundant columns 
df.drop(columns=["RowNumber", "CustomerId", "Surname", "Geography"], inplace=True)

In [10]:
def get_data_splits(df):
    """
    split data for modelling
    returns: data splits
    rtype: pandas dataframe
    """
    #specifying data splits
    df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
    df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

    return df_full_train, df_train, df_val, df_test

In [11]:
df_full_train, df_train, df_val, df_test = get_data_splits(df)

#specifying target variable
y_full_train = df_full_train['Exited'].values
y_train = df_train['Exited'].values
y_val = df_val['Exited'].values
y_test = df_test['Exited'].values

del df_full_train['Exited']
del df_train['Exited']
del df_val['Exited']
del df_test['Exited']

#converting data splits into arrays 
X_full_train = df_full_train.to_numpy()
X_train = df_train.to_numpy()
X_val = df_val.to_numpy()
X_test = df_test.to_numpy()

In [12]:
"""
Splitting data 
"""
#specifying data splits
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

#specifying target variable
y_full_train = df_full_train['Exited'].values
y_train = df_train['Exited'].values
y_val = df_val['Exited'].values
y_test = df_test['Exited'].values

del df_full_train['Exited']
del df_train['Exited']
del df_val['Exited']
del df_test['Exited']

#converting data splits into arrays 
X_full_train = df_full_train.to_numpy()
X_train = df_train.to_numpy()
X_val = df_val.to_numpy()
X_test = df_test.to_numpy()

### Model Building 

### Logistic Regression

In [13]:
mlflow.sklearn.autolog()

In [14]:
#Evaluating model on test data
with mlflow.start_run():
    
    mlflow.log_param("data", "../data/bank-customers/Churn Modeling.csv")

    lrf = LogisticRegression()

    lrf.fit(X_full_train, y_full_train)
    y_pred_lr = lrf.predict(X_test)

    lr_score = roc_auc_score(y_test, y_pred_lr)

    mlflow.log_metric("roc_auc_score", lr_score)

    print(lr_score)

    mlflow.end_run()



0.5250446915609349


### Random Forest 

In [15]:
#Evaluating model on test data
with mlflow.start_run():
    
    mlflow.log_param("data", "../data/bank-customers/Churn Modeling.csv")
    
    rfr = RandomForestClassifier()

    rfr.fit(X_full_train, y_full_train)

    y_pred_rfr = rfr.predict(X_test)

    rf_score = roc_auc_score(y_test, y_pred_rfr)
    mlflow.log_metric("roc_auc_score", rf_score)

    print(rf_score)

    mlflow.end_run()

0.7001611904660116


### XGBoost

In [16]:
#specifying parameters for training the model
xgb_params = {
    'eta': 0.3,
    'max_depth': 10,
    'min_child_weight': 1,

    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'nthread': 8,
    'seed': 1
}

In [17]:
#Evaluating the model on test data
with mlflow.start_run():

    dmatrix = DMatrix

    mlflow.log_param("data", "../data/bank-customers/Churn Modeling.csv")
    mlflow.log_params(xgb_params)


    d_full_train = dmatrix(X_full_train, label=y_full_train, feature_names=df_train.columns)
    dtest = dmatrix(X_test, label=y_test, feature_names=df_test.columns)
    
    xgb_clf = xgb.train(xgb_params, d_full_train)
    y_pred_xgb = xgb_clf.predict(dtest)

    xgb_score = roc_auc_score(y_test, y_pred_xgb)

    mlflow.log_metric("roc_auc_score", xgb_score)

    print(xgb_score)

    with open("../models/preprocessor.b", "wb") as f_out:
        pickle.dump(dmatrix, f_out)
    
    mlflow.log_artifact("../models/preprocessor.b", artifact_path="preprocessor")

    mlflow.xgboost.log_model(xgb_clf, artifact_path="model_artifact")
    mlflow.end_run()

0.8311957387447728


In [18]:
#Evaluating the model on test data
mlflow.xgboost.autolog()
version = "v2"

with mlflow.start_run():

    mlflow.log_param("data", "../data/bank-customers/Churn Modeling.csv")

    d_full_train = dmatrix(X_full_train, label=y_full_train, feature_names=df_train.columns)
    dtest = dmatrix(X_test, label=y_test, feature_names=df_test.columns)
    
    xgb_clf = xgb.train(xgb_params, d_full_train)
    y_pred_xgb = xgb_clf.predict(dtest)

    xgb_score = roc_auc_score(y_test, y_pred_xgb)

    mlflow.log_metric("roc_auc_score", xgb_score)

    print(xgb_score)

    with open(f"../models/preprocessor{version}.b", "wb") as f_out:
        pickle.dump(dmatrix, f_out)
    
    mlflow.log_artifact("../models/preprocessor.b", artifact_path="preprocessor")

    mlflow.end_run()



0.8311957387447728
