In [1]:
import os

# Move up one level so the relative paths used later in this notebook
# ("artifacts/...", "config.yml") resolve against the project root.
# NOTE(review): presumably the notebook lives one folder below the root - confirm.
# The guard makes the cell idempotent: re-running it, or starting the kernel
# directly in the root, no longer climbs one directory too far.
if not os.path.exists("config.yml"):
    os.chdir("../")

In [2]:
# Display the current working directory so the effect of the chdir above can be checked.
%pwd

'e:\\project\\Customer Purchase Prediction'

In [3]:
import pandas as pd

In [4]:
# Read the train/test splits stored under artifacts/data_transformation.
train_data = pd.read_csv("artifacts/data_transformation/train.csv")
test_data = pd.read_csv("artifacts/data_transformation/test.csv")

# The regression target is the 'Sleep efficiency' column; every other column is a feature.
y_train = train_data['Sleep efficiency']
y_test = test_data['Sleep efficiency']
X_train = train_data.drop(columns=['Sleep efficiency'])
X_test = test_data.drop(columns=['Sleep efficiency'])

In [5]:
import numpy as np
import pandas as pd
import time
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import warnings

In [6]:
def evaluate_model(true, predicted):
    """Compute regression error metrics for a set of predictions.

    Args:
        true: Ground-truth target values.
        predicted: Predicted values aligned with ``true``.

    Returns:
        tuple: ``(mae, rmse, r2_square)`` - mean absolute error, root mean
        squared error, and R2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # MSE is only needed as the input to RMSE, so compute it once and take the root.
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [7]:
# Baseline candidates: each regressor is built with its constructor defaults
# (CatBoost only has its training log silenced).
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "Support Vector Regression": SVR(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(),
    "CatBoost Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}

# Test-set results per model (name, MAE, RMSE, R2), appended in the iteration order of `models`.
model_list, mae_list, rmse_list, r2_list = [], [], [], []

# Fit every candidate, score it on both splits, print a report and record the test metrics.
for name, model in models.items():
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    # Record the held-out metrics for later comparison.
    model_list.append(name)
    mae_list.append(model_test_mae)
    rmse_list.append(model_test_rmse)
    r2_list.append(model_test_r2)

    # Report for the training split.
    print(name)
    print('Model performance for Training set')
    print(f"- Root Mean Squared Error: {model_train_rmse:.4f}")
    print(f"- Mean Absolute Error: {model_train_mae:.4f}")
    print(f"- R2 Score: {model_train_r2:.4f}")

    print('----------------------------------')

    # Report for the test split.
    print('Model performance for Test set')
    print(f"- Root Mean Squared Error: {model_test_rmse:.4f}")
    print(f"- Mean Absolute Error: {model_test_mae:.4f}")
    print(f"- R2 Score: {model_test_r2:.4f}")

    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 0.0943
- Mean Absolute Error: 0.0755
- R2 Score: 0.5079
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.1001
- Mean Absolute Error: 0.0774
- R2 Score: 0.4638


Lasso
Model performance for Training set
- Root Mean Squared Error: 0.1345
- Mean Absolute Error: 0.1133
- R2 Score: 0.0000
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.1369
- Mean Absolute Error: 0.1176
- R2 Score: -0.0021


Ridge
Model performance for Training set
- Root Mean Squared Error: 0.0943
- Mean Absolute Error: 0.0755
- R2 Score: 0.5078
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.1002
- Mean Absolute Error: 0.0773
- R2 Score: 0.4636


Support Vector Regression
Model performance for Training set
- Root Mean Squared Error: 0.1050
- Mean Absolute Error: 0.0888
- R2 Score: 0.3903
------------------

In [8]:
# Candidate regressors for hyperparameter tuning. Only XGBRegressor is given an
# explicit random_state here; the other estimators use their constructor defaults.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "Support Vector Regression": SVR(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(objective='reg:squarederror', random_state=42),
    "LGBMRegressor": LGBMRegressor(verbose=-1),
    "CatBoost Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}

# Hyperparameter grids keyed by the names used in `models`. A model with an empty
# grid (Linear Regression) or with no entry at all (LGBMRegressor) is still
# cross-validated as a single candidate, because the loop below falls back to an
# empty grid via `params.get(name, {})`.
params = {
    "Linear Regression": {},
    "Lasso": {
        'alpha': [0.1, 1, 10, 100]
    },
    "Ridge": {
        'alpha': [0.1, 1, 10, 100]
    },
    "Support Vector Regression": {
        'C': [0.1, 1, 10, 100],
        'epsilon': [0.1, 0.2, 0.5],
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
    },
    "K-Neighbors Regressor": {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    },
    "Decision Tree": {
        'criterion': ['squared_error', 'friedman_mse', 'absolute_error'],
        'splitter': ['best', 'random'],
        'max_depth': [None, 10, 20, 30]
    },
    "Random Forest Regressor": {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False],
        'criterion': ['squared_error', 'absolute_error']
    },
    # NOTE(review): the None entries presumably fall back to the XGBoost
    # defaults - confirm they do not duplicate another candidate in the grid.
    "XGBRegressor": {
        'n_estimators': [None, 50, 100],
        'learning_rate': [0.05, 0.1, 0.2],
        'max_depth': [None, 3, 5],
        'subsample': [0.7, 0.8, 0.9],
        'colsample_bytree': [0.2, 0.4, 0.6, 0.8]
    },
    "CatBoost Regressor": {
        'iterations': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'depth': [4, 6, 10]
    },
    "AdaBoost Regressor": {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'loss': ['linear', 'square', 'exponential']
    }
}

# Per-model test metrics (MAE, RMSE, R2) and wall-clock timings, appended in the
# iteration order of `models`.
model_tuning_list = []
mae_tuning_list = []
rmse_tuning_list = []
r2_tuning_list = []
training_time_list = []
predicting_time_list = []

# For each model: grid-search with 5-fold CV, evaluate the best estimator on both
# splits, print a report and record the test metrics and timings.
for name, model in models.items():
    param_grid = params.get(name, {})
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)

    # time.perf_counter() is the high-resolution clock intended for measuring
    # short intervals; time.time() can be too coarse and yielded a predicting
    # time of exactly 0.0 for some models.
    # Training time covers the whole grid search, not a single fit.
    start_training_time = time.perf_counter()
    grid_search.fit(X_train, y_train)
    end_training_time = time.perf_counter()
    training_time = end_training_time - start_training_time

    best_model = grid_search.best_estimator_

    # Predicting time covers inference on both the training and the test split.
    start_predicting_time = time.perf_counter()
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)
    end_predicting_time = time.perf_counter()
    predicting_time = end_predicting_time - start_predicting_time

    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)


    print(name)
    print("Best parameters found: ", grid_search.best_params_)
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))

    print('='*35)
    print('\n')

    # Append results to lists
    model_tuning_list.append(name)
    mae_tuning_list.append(model_test_mae)
    rmse_tuning_list.append(model_test_rmse)
    r2_tuning_list.append(model_test_r2)
    training_time_list.append(training_time)
    predicting_time_list.append(predicting_time)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Linear Regression
Best parameters found:  {}
Model performance for Training set
- Root Mean Squared Error: 0.0943
- Mean Absolute Error: 0.0755
- R2 Score: 0.5079
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.1001
- Mean Absolute Error: 0.0774
- R2 Score: 0.4638


Fitting 5 folds for each of 4 candidates, totalling 20 fits
Lasso
Best parameters found:  {'alpha': 0.1}
Model performance for Training set
- Root Mean Squared Error: 0.1330
- Mean Absolute Error: 0.1126
- R2 Score: 0.0224
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.1360
- Mean Absolute Error: 0.1169
- R2 Score: 0.0110


Fitting 5 folds for each of 4 candidates, totalling 20 fits
Ridge
Best parameters found:  {'alpha': 10}
Model performance for Training set
- Root Mean Squared Error: 0.0946
- Mean Absolute Error: 0.0759
- R2 Score: 0.5055
-------------------------------

In [9]:
# Assemble the tuned models' test metrics and timings into one comparison table,
# one row per model in the order the tuning loop processed them.
Models_comparing_table = pd.DataFrame(
    data={
        'Model': model_tuning_list,
        'Model MAE Score': mae_tuning_list,
        'Model RMSE Score': rmse_tuning_list,
        'Model R2 Score': r2_tuning_list,
        'Training Time': training_time_list,
        'Predicting Time': predicting_time_list,
    }
)

In [10]:
# Rank models by test R2 (highest first); ties are broken by shorter training time.
# The sorted frame is reassigned rather than sorted with inplace=True, so the
# step stays chainable and no existing object is mutated behind the reader's back.
Models_comparing_table = Models_comparing_table.sort_values(
    by=['Model R2 Score', 'Training Time'],
    ascending=[False, True],
)
Models_comparing_table

Unnamed: 0,Model,Model MAE Score,Model RMSE Score,Model R2 Score,Training Time,Predicting Time
7,XGBRegressor,0.064569,0.085864,0.605838,10.031756,0.009764
9,CatBoost Regressor,0.065087,0.085883,0.605659,5.861225,0.008733
6,Random Forest Regressor,0.063787,0.086971,0.595612,180.665912,0.024274
8,LGBMRegressor,0.063736,0.08718,0.593666,0.710521,0.000508
10,AdaBoost Regressor,0.074919,0.09338,0.53381,3.913133,0.019303
5,Decision Tree,0.065841,0.096972,0.497259,0.191163,0.007304
3,Support Vector Regression,0.078488,0.09895,0.476538,13.1757,0.004474
0,Linear Regression,0.077388,0.100148,0.463786,2.367519,0.004013
2,Ridge,0.077235,0.100523,0.45976,0.057826,0.0
4,K-Neighbors Regressor,0.096424,0.125405,0.159224,0.395833,0.008269


In [7]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings for the model-training stage.

    The annotations are documentation only: the dataclass neither validates
    nor converts the values it is given.
    """
    # Directory the fitted model file is written into.
    root_dir: Path
    # CSV files holding the train / test splits.
    train_data_path: Path
    test_data_path: Path
    # File name of the saved model, joined onto root_dir.
    model_name: str
    # Hyperparameters forwarded to the XGBoost regressor.
    n_estimators: int
    learning_rate: float
    # Tree depth is an integer count of levels (it was previously annotated as float).
    max_depth: int
    subsample: float
    colsample_bytree: float
    # Name of the column to predict; it is removed from the feature matrix.
    target_column: str

In [8]:
from src.utils import read_yaml, create_directories

In [14]:
class ConfigurationManager:
    """Load the project's YAML settings and build per-stage configuration objects.

    Args:
        config_filepath: YAML file with the stage sections (``artifacts_root``
            and ``model_trainer`` are read here). Defaults to ``config.yml``.
        params_filepath: YAML file with hyperparameters (the ``XGBoost``
            section is read here). Defaults to ``params.yml``.
        schema_filepath: YAML file describing the data (``TARGET_COLUMN.name``
            is read here). Defaults to ``schema.yml``.

    The default paths are relative, so they resolve against the current
    working directory. Creating an instance also creates the artifacts root
    directory.
    """

    def __init__(
        self,
        config_filepath: Path = Path("config.yml"),
        params_filepath: Path = Path("params.yml"),
        schema_filepath: Path = Path("schema.yml"),
    ):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Build the model-training configuration from the loaded YAML files.

        Creates the trainer's ``root_dir`` as a side effect.

        Returns:
            ModelTrainerConfig: paths from the ``model_trainer`` section,
            hyperparameters from the ``XGBoost`` section, and the target
            column name from the schema. Values are passed through exactly
            as read; no type conversion is applied.
        """
        config = self.config.model_trainer
        params = self.params.XGBoost
        target = self.schema.TARGET_COLUMN

        create_directories([config.root_dir])

        return ModelTrainerConfig(
            root_dir=config.root_dir,
            train_data_path=config.train_data_path,
            test_data_path=config.test_data_path,
            model_name=config.model_name,
            n_estimators=params.n_estimators,
            learning_rate=params.learning_rate,
            max_depth=params.max_depth,
            subsample=params.subsample,
            colsample_bytree=params.colsample_bytree,
            target_column=target.name,
        )

In [11]:
import pandas as pd
import os
import sys
from src.logger import logging
from src.exception import CustomException
import xgboost as xgb
import joblib

In [12]:
class ModelTrainer:
    """Fit an XGBoost regressor on the training split and save it to disk."""

    def __init__(self, config: ModelTrainerConfig):
        """Store the training configuration.

        Args:
            config: Paths, hyperparameters and target column name for training.
        """
        self.config = config

    def train(self):
        """Train the regressor and persist it with joblib.

        Reads the training CSV, separates the target column from the
        features, fits an ``XGBRegressor`` with the configured
        hyperparameters and writes the fitted model to
        ``<root_dir>/<model_name>``.

        The test split is not read here: the previous version loaded it and
        built ``X_test`` / ``y_test`` but never used them.
        """
        train_data = pd.read_csv(self.config.train_data_path)

        X_train = train_data.drop(columns=[self.config.target_column])
        y_train = train_data[self.config.target_column]

        model = xgb.XGBRegressor(
            objective='reg:squarederror',
            random_state=42,  # fixed seed so repeated runs fit the same model
            n_estimators=self.config.n_estimators,
            learning_rate=self.config.learning_rate,
            max_depth=self.config.max_depth,
            subsample=self.config.subsample,
            colsample_bytree=self.config.colsample_bytree
        )
        model.fit(X_train, y_train)

        joblib.dump(model, os.path.join(self.config.root_dir, self.config.model_name))

In [15]:
# Run the model-training stage end to end: load the configuration, build the
# trainer, then fit and save the model.
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    # Keep the trainer under its own name; previously `model_trainer_config`
    # was rebound from the config object to the trainer instance.
    model_trainer = ModelTrainer(config=model_trainer_config)
    model_trainer.train()
except Exception as e:
    # Any failure is re-raised as the project's CustomException, built from the
    # original error and the `sys` module.
    raise CustomException(e, sys)