# **Modeling Development and Tracking Experiments with mlflow**

In [1]:
# Import Libraries

import pandas as pd
import numpy as np

import math
from scipy.stats import randint, uniform
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn import metrics

import seaborn as sns

from pickle import dump
import os

import warnings
warnings.filterwarnings('ignore')

In [2]:
import mlflow
import mlflow.sklearn

In [17]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment(experiment_id='1')

<Experiment: artifact_location='file:///c:/Users/ADMIN/Desktop/Innomatics/DataAnalysis/Predict-Laptop-Prices-Streamlit/notebooks/mlruns/1', creation_time=1683204315728, experiment_id='1', last_update_time=1683204315728, lifecycle_stage='active', name='Laptop Prices Prediction', tags={}>

In [6]:
# Load data

df = pd.read_csv(r'C:\Users\ADMIN\Desktop\Innomatics\DataAnalysis\Predict-Laptop-Prices-Streamlit\data\cleaned_data2.csv')
df.head()

Unnamed: 0,MRP,RAM_Size,RAM_Type,Display,Processor,Storage,OS,Brand
0,36990,8,DDR4,14.0,Intel Core i3,256 GB SSD,Windows,Lenovo
1,39990,8,DDR4,15.6,Intel Core i3,512 GB SSD,Windows,Lenovo
2,32990,8,DDR4,15.6,Intel Core i3,512 GB SSD,Windows,ASUS
3,49990,8,DDR4,15.6,AMD Ryzen 5 Hexa Core,512 GB SSD,Windows,HP
4,49990,8,DDR4,15.6,Intel Core i5,512 GB SSD,Windows,ASUS


In [5]:
df.shape

(423, 8)

In [8]:
# Generate dictionaries of the categorical columns

def generate_dicts(df):
    categorical_cols = df.select_dtypes(include=['object']).columns
    dicts_cols = {}
    for col in categorical_cols:
        unique_values = df[col].unique()
        dict_col = {}
        for i, value in enumerate(sorted(unique_values)):
            dict_col[value] = i
        dicts_cols[col] = dict_col
    return dicts_cols

dicts_cols = generate_dicts(df)
print(dicts_cols)

{'RAM_Type': {'DDR4': 0, 'DDR5': 1, 'LPDDR3': 2, 'LPDDR4': 3, 'LPDDR4X': 4, 'LPDDR5': 5, 'Unified Memory': 6}, 'Processor': {'AMD Athlon Dual Core': 0, 'AMD Ryzen 3': 1, 'AMD Ryzen 3 Dual Core': 2, 'AMD Ryzen 3 Hexa Core': 3, 'AMD Ryzen 3 Quad Core': 4, 'AMD Ryzen 5': 5, 'AMD Ryzen 5 Dual Core': 6, 'AMD Ryzen 5 Hexa Core': 7, 'AMD Ryzen 5 Quad Core': 8, 'AMD Ryzen 7 Octa Core': 9, 'AMD Ryzen 7 Quad Core': 10, 'AMD Ryzen 9 Octa Core': 11, 'Intel Celeron Dual Core': 12, 'Intel Celeron Quad Core': 13, 'Intel Core i3': 14, 'Intel Core i5': 15, 'Intel Core i7': 16, 'Intel Core i9': 17, 'Intel Evo Core i5': 18, 'Intel Pentium Quad Core': 19, 'Intel Pentium Silver': 20, 'M1': 21, 'M1 Max': 22, 'M1 Pro': 23, 'M2': 24, 'Qualcomm Snapdragon 7c Gen 2': 25}, 'Storage': {'1 TB HDD': 0, '1 TB HDD, 128 GB SSD': 1, '1 TB HDD, 256 GB SSD': 2, '1 TB HDD, 512 GB SSD': 3, '128 GB SSD': 4, '256 GB SSD': 5, '512 GB SSD': 6}, 'OS': {'Chrome': 0, 'DOS': 1, 'Mac OS': 2, 'Windows': 3}, 'Brand': {'ALIENWARE': 0,

In [9]:
# Replace the string values with the corresponding integer values

df = df.replace(dicts_cols)
df.head(3)

Unnamed: 0,MRP,RAM_Size,RAM_Type,Display,Processor,Storage,OS,Brand
0,36990,8,0,14.0,14,5,3,7
1,39990,8,0,15.6,14,6,3,7
2,32990,8,0,15.6,14,6,3,2


In [10]:
# Separate the target variable from the features

X = df.drop(columns=['MRP'], axis=1)
y = np.log(df['MRP'])

In [11]:
# Split data into train and test 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((296, 7), (296,), (127, 7), (127,))

In [12]:
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

In [13]:
# Rescale RAM Size and Display columns only

scaler = MinMaxScaler()

X_train_scaled[['Display', 'RAM_Size']] = scaler.fit_transform(X_train_scaled[['Display', 'RAM_Size']])
X_test_scaled[['Display', 'RAM_Size']] = scaler.transform(X_test_scaled[['Display', 'RAM_Size']])

In [14]:
# Create new dataframes with the scaled data and original column names

X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns)

## **Experiment Tracking**

In [16]:
mlflow.sklearn.autolog()

# Define a list of model classes
model_classes = [RandomForestRegressor, GradientBoostingRegressor, XGBRegressor, Ridge]

for model_class in model_classes:
    with mlflow.start_run():
        mlmodel = model_class()
        mlmodel.fit(X_train, y_train)

        y_pred = mlmodel.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        rmse = mean_squared_error(y_test, y_pred, squared=False)

        # Log the metrics using MLflow's autologging feature
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)

#### **1. XGBoost Regressor**

In [13]:
with mlflow.start_run():
    mlflow.set_tag("dev", "Peris")
    mlflow.set_tag("algo", "XGB")
    # log the data for each run using log_param, log_metric, log_model
    mlflow.log_param("data-path", "Data\cleaned_data2.csv")
    xgb = XGBRegressor()
    xgb.fit(X_train, y_train)
    y_pred = xgb.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.sklearn.log_model(xgb, artifact_path="models")
    mlflow.end_run()

## **Base Models**

In [14]:
# Putting models into a dictionary

models = {
          "Random Forest" : RandomForestRegressor(),
          "XGBoost": XGBRegressor(),
          "Gradient Boost" : GradientBoostingRegressor(),
          "Ridge" : Ridge ()
          }

# Function that will fit and score models
def fit_and_score(models, X_train, X_test, y_train, y_test):
    random_state=42
    # Making a dictionary to keep model scores
    model_scores = {}
    # Looping through models
    for name, model in models.items():
        model.fit(X_train, y_train)
        model_scores[name] = model.score(X_test, y_test)
    return model_scores   
    
scores = fit_and_score(models = models,
                      X_train = X_train,
                      X_test = X_test,
                      y_train = y_train,
                      y_test = y_test)
scores

{'Random Forest': 0.827337825280219,
 'XGBoost': 0.879105033027367,
 'Gradient Boost': 0.8394545540141549,
 'Ridge': 0.6059601827548748}

XGBoost Regressor has the highest R-Squared score with 0.8791

In [16]:
mlflow.sklearn.autolog()

for model_class in (RandomForestRegressor, GradientBoostingRegressor, XGBRegressor, Ridge):

    with mlflow.start_run():
        mlmodel = model_class()
        mlmodel.fit(X_train, y_train)

        y_pred = mlmodel.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        mlflow.log_metric("rmse", rmse)

**XGBoost**

In [None]:
XGB = XGBRegressor()

XGB.fit(X_train, y_train)
prediction = XGB.predict(X_test)

# evaluate the model
mae = mean_absolute_error(y_test, prediction)
mse = mean_squared_error(y_test, prediction)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, prediction)

print("MAE: ", mae)
print("MSE: ", mse)
print("RMSE: ", rmse)
print("R-squared: ", r2)

MAE:  0.1657141464665079
MSE:  0.048516012131371966
RMSE:  0.2202635061270295
R-squared:  0.8580264975283868


**Gradient Boosting**

In [None]:
GBR = GradientBoostingRegressor()

GBR.fit(X_train, y_train)
prediction = GBR.predict(X_test)

# evaluate the model
mse = mean_squared_error(y_test, prediction)
mae = mean_absolute_error(y_test, prediction)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, prediction)

print("MAE: ", mae)
print("MSE: ", mse)
print("RMSE: ", rmse)
print("R-squared: ", r2)

MAE:  0.17381410331965777
MSE:  0.04676302889593545
RMSE:  0.2162476101508071
R-squared:  0.8631562919771768


### Hyperparameter Tuning

In [None]:
# Create a Gradient Boosting Regressor object
gbr = GradientBoostingRegressor()

# Define the hyperparameter space
param_dist = {"n_estimators": randint(100, 500),
              "max_features": randint(1, 10),
              "max_depth": [None] + list(randint(1, 11).rvs(10)),
              "min_samples_leaf": randint(1, 10)}


# Define the Halving Random Search cross-validation object
hrscv = HalvingRandomSearchCV(gbr, param_distributions=param_dist, n_candidates=100,
                              factor=2, min_resources='exhaust', 
                              random_state=42, n_jobs=-1)

# Fit the Halving Random Search cross-validation object to the data 
hrscv.fit(X_train, y_train)

# Print the best hyperparameters
print(f"Best hyperparameters for Gradient Boosting Regressor: {hrscv.best_params_}")

Best hyperparameters for Gradient Boosting Regressor: {'max_depth': 4, 'max_features': 3, 'min_samples_leaf': 1, 'n_estimators': 478}


In [None]:
# Create and train the model with the tuned hyperparameters
gbr = GradientBoostingRegressor(max_features = 4, min_samples_leaf = 2, n_estimators = 478, random_state=42)
gbr.fit(X_train, y_train)

# Predictions
gbr_pred = gbr.predict(X_test)

# Back-transform predicted values to original scale
y_pred_orig = np.exp(gbr_pred)

# Evaluate the model 
mae = mean_absolute_error(y_test,gbr_pred)
mse= mean_squared_error(y_test, gbr_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, gbr_pred)

print("Mean Absolute Error: {:.4f}".format(mae))
print("Mean Squared Error: {:.4f}".format(mse))
print("Root Mean Squared Error :{:.4f}".format(rmse))
print("R-Squared :{:.4f}".format(r2))

Mean Absolute Error: 0.1534
Mean Squared Error: 0.0427
Root Mean Squared Error :0.2067
R-Squared :0.8750


In [None]:
print(y_pred_orig)

[111878.02474918  85966.45982977  71936.35002333  75873.74347313
  78610.7744282   38089.60994035  53556.63305777  57971.99209666
 156629.3700875   59810.73502655  42062.14907589  38623.45106546
  68100.55897944  51255.98179883  53556.63305777 142532.38589242
  75243.74326224  73648.46122934 243539.99502153 208074.06970458
  34124.18969853  61600.05942806  91368.45717021  37438.47993027
  57971.99209666 175338.44946748  54795.94656096  33076.46116498
  66601.92919762 111878.02474918  58558.54361363  21720.45233006
  57525.03230027  41571.20637763  66900.25502172  40696.41816961
  69961.58300074  57059.39202462  56721.179356    68920.66855983
  81075.85316442 130805.68883236 101679.27946159 181426.0727614
  78005.32056007  44883.54934649  44128.50488455  59810.73502655
 195896.3540418  181426.0727614   47053.10130373  74813.25080525
  40489.67122597 122566.29156132  67397.86790506 146855.78944435
  51827.71925217  38236.44658775 117777.02317244  38623.45106546
 145267.49886913  44086.86

In [None]:
# Create a XGBoost Regressor object
xgbr = XGBRegressor()

# Define the hyperparameter space
param_dist = {"n_estimators": randint(100, 500),
              "max_depth": [None] + list(randint(1, 11).rvs(10)),
              "learning_rate": uniform(0.01, 0.3),
              "gamma": uniform(0, 10)}

# Define the Halving Random Search cross-validation object
hrscv = HalvingRandomSearchCV(xgbr, param_distributions=param_dist, n_candidates=100,
                              factor=2, min_resources='exhaust', 
                              random_state=42, n_jobs=-1)

# Fit the Halving Random Search cross-validation object to the data 
hrscv.fit(X_train, y_train)

# Print the best hyperparameters
print(f"Best hyperparameters for XGBoost Regressor: {hrscv.best_params_}")

Best hyperparameters for XGBoost Regressor: {'gamma': 0.15966252220214194, 'learning_rate': 0.0792681476866447, 'max_depth': 8, 'n_estimators': 363}


In [None]:
# Create and train the model with the tuned hyperparameters
xgbr = XGBRegressor(gamma= 0.15966252220214194, learning_rate = 0.0792681476866447 , max_depth = 8, n_estimators = 363)
xgbr.fit(X_train, y_train)

# Predictions
xgbr_pred = xgbr.predict(X_test)

# Back-transform predicted values to original scale
y_pred_orig = np.exp(xgbr_pred)

# Evaluate the model 
mae = mean_absolute_error(y_test,xgbr_pred)
mse= mean_squared_error(y_test, xgbr_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, xgbr_pred)

print("Mean Absolute Error: {:.4f}".format(mae))
print("Mean Squared Error: {:.4f}".format(mse))
print("Root Mean Squared Error :{:.4f}".format(rmse))
print("R-Squared :{:.4f}".format(r2))

Mean Absolute Error: 0.1815
Mean Squared Error: 0.0507
Root Mean Squared Error :0.2251
R-Squared :0.8517


In [None]:
print(y_pred_orig)

[105573.484 104733.23   67688.53   79563.85   87154.26   48621.055
  48735.953  51951.78  109772.664  61963.56   46112.227  46992.75
  81719.01   56031.918  48735.953 139825.88   83239.43   84159.2
 175871.02  171359.4    50630.95   51791.26   64538.61   36849.98
  51951.78  159431.58   54277.027  36739.688  54050.473 105573.484
  54609.066  29748.621  52682.26   47263.043  85497.56   52107.74
  59099.383  50878.92   48279.363  77687.555  88311.97  128958.91
  95161.    136966.64   95477.07   48127.707  44824.293  61963.56
 140539.62  136966.64   53858.188  53050.     45680.22  109772.664
  51940.586 131017.71   49952.25   47234.297 135163.12   46992.75
 168303.4    51184.31   81719.01   52107.74   70591.11   53726.293
  60006.65   50713.87   82634.27  103093.76   30449.559  36849.98
  51387.523  17123.78   33132.152  53911.48  110595.12   87154.26
  24327.56  136966.64   47558.52  106460.484  50713.87   51387.523
  50760.715  81719.01   49666.574 105758.91   31389.357  82634.27
  5086

In [None]:
df.to_csv('cleaned_data3.csv', index=False)    

In [None]:
# Save the models

dump(dicts_cols, open('dicts.pkl', 'wb'))
dump(scaler, open('scaler.pkl', 'wb'))
dump(gbr, open('gbr_model.pkl', 'wb'))
dump(xgbr, open('xgbr_model.pkl', 'wb'))