### **Model Development and Experiment Tracking with MLflow**

In [None]:
# Import Libraries

import pandas as pd
import numpy as np

import math
from scipy.stats import randint, uniform
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn import metrics

import seaborn as sns

from pickle import dump
import os

import warnings
# NOTE(review): this suppresses every warning for the rest of the notebook,
# deprecation warnings included — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

: 

In [None]:
#from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
#from hyperopt.pyll import scope

: 

In [None]:
import mlflow
import mlflow.sklearn
import mlflow.pyfunc

: 

In [None]:
# Keep run metadata in a local SQLite file and group all runs under one experiment.
mlflow.set_tracking_uri(uri="sqlite:///mlflow.db")
mlflow.set_experiment(experiment_name="Laptop Price Prediction")

: 

In [None]:
# Load data

# The path is assembled from its components: the former '..\data\...' literal
# relied on unrecognized backslash escapes (\d, \c) and only resolved on Windows.
df = pd.read_csv(os.path.join('..', 'data', 'cleaned_data2.csv'))
df.head()

: 

In [None]:
# Show the (rows, columns) dimensions of the loaded frame
df.shape

: 

In [None]:
# Generate dictionaries of the categorical columns

def generate_dicts(df):
    """Build an integer encoding for every object-dtype column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object-dtype columns should be encoded.

    Returns
    -------
    dict
        ``{column: {value: code}}`` where each code is the 0-based position of
        the value among the column's distinct non-missing values in sorted order.
    """
    dicts_cols = {}
    for col in df.select_dtypes(include=['object']).columns:
        # Missing values are excluded: they cannot be ordered against strings
        # (sorted() would raise TypeError) and should stay missing rather than
        # receive a code of their own.
        distinct_values = sorted(df[col].dropna().unique())
        dicts_cols[col] = {value: code for code, value in enumerate(distinct_values)}
    return dicts_cols

dicts_cols = generate_dicts(df)
print(dicts_cols)

# Replace the string values with the corresponding integer values

# Each column is mapped through its own dictionary instead of using a nested-dict
# DataFrame.replace: map() returns numeric columns directly, whereas replace()
# depends on object-dtype downcasting that pandas 2.2 deprecated.
df = df.copy()
for col, mapping in dicts_cols.items():
    df[col] = df[col].map(mapping)

: 

In [None]:
# Target: natural log of the price column; features: every remaining column
y = np.log(df['MRP'])
X = df.drop(columns=['MRP'])

: 

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

: 

In [None]:
# Rescale RAM Size and Display columns only
cols_to_scale = ['Display', 'RAM_Size']
scaler = MinMaxScaler()

# The scaler is fitted on the training split and then reused unchanged on the test split
X_train_scaled = X_train.copy()
X_train_scaled[cols_to_scale] = scaler.fit_transform(X_train[cols_to_scale])

X_test_scaled = X_test.copy()
X_test_scaled[cols_to_scale] = scaler.transform(X_test[cols_to_scale])

# Create new dataframes with the scaled data and original column names
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns)

: 

### **Experiment Tracking**

**XGBoost Regressor**

In [None]:
with mlflow.start_run():

    mlflow.set_tag("algorithm", "XGBoost")

    # Record the data source and the categorical encoding used for this run.
    # The path is built from components to avoid the unrecognized '\d' / '\c' escapes.
    mlflow.log_param("data-path", os.path.join("..", "data", "cleaned_data2.csv"))
    # log_artifact() expects the path of a local file; an in-memory dict is
    # logged with log_dict(), which serialises it to the given artifact file.
    mlflow.log_dict(dicts_cols, "dicts_cols.json")

    # NOTE(review): `xgb` rebinds the `import xgboost as xgb` module alias from the
    # import cell; the name is kept so any later cell using it keeps working.
    xgb = XGBRegressor(random_state=42)
    xgb.fit(X_train, y_train)
    y_pred = xgb.predict(X_test)

    # y is log(MRP), so these errors are measured on the log-price scale
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.sklearn.log_model(xgb, artifact_path="models")
    # No explicit mlflow.end_run(): leaving the `with` block closes the run.

: 

**GradientBoosting Regressor**

In [None]:
with mlflow.start_run():

    mlflow.set_tag("algorithm", "Gradient Boost")

    # Record the data source and the categorical encoding used for this run.
    # The path is built from components to avoid the unrecognized '\d' / '\c' escapes.
    mlflow.log_param("data-path", os.path.join("..", "data", "cleaned_data2.csv"))
    # log_artifact() expects the path of a local file; an in-memory dict is
    # logged with log_dict(), which serialises it to the given artifact file.
    mlflow.log_dict(dicts_cols, "dicts_cols.json")

    # Seeded so that repeated runs of this cell log comparable metrics
    gbr = GradientBoostingRegressor(random_state=42)
    gbr.fit(X_train, y_train)
    y_pred = gbr.predict(X_test)

    # y is log(MRP), so these errors are measured on the log-price scale
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.sklearn.log_model(gbr, artifact_path="models")
    # No explicit mlflow.end_run(): leaving the `with` block closes the run.

: 

**Random Forest Regressor**

In [None]:
with mlflow.start_run():

    mlflow.set_tag("algorithm", "Random Forest")

    # Record the data source and the categorical encoding used for this run.
    # The path is built from components to avoid the unrecognized '\d' / '\c' escapes.
    mlflow.log_param("data-path", os.path.join("..", "data", "cleaned_data2.csv"))
    # log_artifact() expects the path of a local file; an in-memory dict is
    # logged with log_dict(), which serialises it to the given artifact file.
    mlflow.log_dict(dicts_cols, "dicts_cols.json")

    # Seeded so that repeated runs of this cell log comparable metrics
    rf = RandomForestRegressor(random_state=42)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    # y is log(MRP), so these errors are measured on the log-price scale
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.sklearn.log_model(rf, artifact_path="models")
    # No explicit mlflow.end_run(): leaving the `with` block closes the run.

: 

In [None]:
# Create a Gradient Boosting Regressor object (seeded so candidate scores are repeatable)
gbr = GradientBoostingRegressor(random_state=42)

# Define the hyperparameter space
# max_depth candidates are listed explicitly (None plus 1..10). Drawing them with an
# unseeded randint(1, 11).rvs(10) gave a different, possibly duplicated, candidate
# list on every execution, which the search's own random_state could not control.
# NOTE(review): max_features samples 1..9 — assumes X_train has at least 9 columns; confirm.
param_dist = {"n_estimators": randint(100, 500),
              "max_features": randint(1, 10),
              "max_depth": [None] + list(range(1, 11)),
              "min_samples_leaf": randint(1, 10)}


# Define the Halving Random Search cross-validation object
hrscv = HalvingRandomSearchCV(gbr, param_distributions=param_dist, n_candidates=100,
                              factor=2, min_resources='exhaust', 
                              random_state=42, n_jobs=-1)

# Fit the Halving Random Search cross-validation object to the data 
hrscv.fit(X_train, y_train)

# Print the best hyperparameters
print(f"Best hyperparameters for Gradient Boosting Regressor: {hrscv.best_params_}")

Best hyperparameters for Gradient Boosting Regressor: {'max_depth': 4, 'max_features': 3, 'min_samples_leaf': 1, 'n_estimators': 478}


In [None]:
# Create a XGBoost Regressor object (seeded so candidate scores are repeatable)
xgbr = XGBRegressor(random_state=42)

# Define the hyperparameter space
# max_depth candidates are listed explicitly (None plus 1..10). Drawing them with an
# unseeded randint(1, 11).rvs(10) gave a different, possibly duplicated, candidate
# list on every execution, which the search's own random_state could not control.
# scipy's uniform(loc, scale) spans [loc, loc + scale]: learning_rate is sampled
# from 0.01..0.31 and gamma from 0..10.
param_dist = {"n_estimators": randint(100, 500),
              "max_depth": [None] + list(range(1, 11)),
              "learning_rate": uniform(0.01, 0.3),
              "gamma": uniform(0, 10)}

# Define the Halving Random Search cross-validation object
hrscv = HalvingRandomSearchCV(xgbr, param_distributions=param_dist, n_candidates=100,
                              factor=2, min_resources='exhaust', 
                              random_state=42, n_jobs=-1)

# Fit the Halving Random Search cross-validation object to the data 
hrscv.fit(X_train, y_train)

# Print the best hyperparameters
print(f"Best hyperparameters for XGBoost Regressor: {hrscv.best_params_}")

Best hyperparameters for XGBoost Regressor: {'gamma': 0.15966252220214194, 'learning_rate': 0.0792681476866447, 'max_depth': 8, 'n_estimators': 363}
