In [2]:
import os 
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings

In [3]:
# changing the directory
os.chdir("../")
%pwd

'C:\\Users\\hp\\Pictures\\Big_mart_sales'

In [4]:
train_data=pd.read_csv('notebooks/pickle and trained data/preprocessed_train_data.csv')
test_data=pd.read_csv('notebooks/pickle and trained data/preprocessed_test_data.csv')

In [5]:
train_x=train_data.drop(['Item_Outlet_Sales'],axis=1)
train_y=train_data['Item_Outlet_Sales']
test_x=test_data.drop(['Item_Outlet_Sales'],axis=1)
test_y=test_data['Item_Outlet_Sales']

## Model Training

In [7]:
from sklearn.model_selection import  cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor , HistGradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error,r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import AdaBoostRegressor


## Model_Training



In [9]:
def rmse(y_true, y_pred):
    """
    Calculate the Root Mean Squared Error (RMSE).
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [10]:


def train_and_predict(model, X_train, y_train, X_test, y_test):
    """
    Train the model on training data, make predictions on both training and test datasets,
    and calculate the error metrics (RMSE and R²).

    Args:
        model: The machine learning model to be trained.
        X_train: Features of the training set.
        y_train: Target variable of the training set.
        X_test: Features of the test set.
        y_test: Target variable of the test set.

    Returns:
        y_train_pred: Predictions for the training set.
        y_test_pred: Predictions for the test set.
        model: The trained model.
        train_rmse: RMSE for the training set.
        test_rmse: RMSE for the test set.
        train_r2: R² for the training set.
        test_r2: R² for the test set.
    """
    # Train the model
    model.fit(X_train, y_train)

    # Predict on training and test data
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Calculate RMSE (Root Mean Squared Error)
    train_rmse = rmse(y_train, y_train_pred)
    test_rmse = rmse(y_test, y_test_pred)

    # Calculate R² score
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    # Print error metrics
    print(f"Train RMSE: {train_rmse:.4f}, Test RMSE: {test_rmse:.4f}")
    print(f"Train R²: {train_r2:.4f}, Test R²: {test_r2:.4f}")

    # Return predictions, error metrics, and trained model
    return y_train_pred, y_test_pred, model, train_rmse, test_rmse, train_r2, test_r2


In [11]:
train_and_predict(LinearRegression(), train_x, train_y, test_x, test_y)

Train RMSE: 1141.4831, Test RMSE: 1069.8566
Train R²: 0.5595, Test R²: 0.5789


(array([2915.30213996, 2664.7137757 , 1784.63942699, ..., 3741.07278809,
        1938.53858874, 1534.80940478]),
 array([1377.22583145,  698.24683611,  890.61037116, ...,  859.72309159,
         574.13183474, 1712.60659923]),
 LinearRegression(),
 1141.4830894243353,
 1069.8566182690477,
 0.5595126723576831,
 0.5788794005819018)

In [12]:
train_and_predict(DecisionTreeRegressor(), train_x, train_y, test_x, test_y)

Train RMSE: 0.0000, Test RMSE: 1521.1544
Train R²: 1.0000, Test R²: 0.1487


(array([2386.2272, 3103.9596, 1125.202 , ..., 6145.334 , 1649.8524,
         965.41  ]),
 array([ 402.809 ,  479.376 , 1216.4166, ..., 1216.4166,  377.5086,
        1310.2944]),
 DecisionTreeRegressor(),
 0.0,
 1521.1544363962473,
 1.0,
 0.1486621402537014)

In [13]:

train_and_predict(RandomForestRegressor(), train_x, train_y, test_x, test_y)

Train RMSE: 425.9158, Test RMSE: 1072.6443
Train R²: 0.9387, Test R²: 0.5767


(array([2607.07306 , 2981.339214, 1382.247406, ..., 5444.326496,
        1659.286786, 1186.795158]),
 array([ 959.610882,  787.74127 ,  822.709086, ...,  801.676464,
         648.595728, 1441.543554]),
 RandomForestRegressor(),
 425.91580300693914,
 1072.6442579348127,
 0.9386743777878829,
 0.5766819810812974)

In [14]:
train_and_predict(GradientBoostingRegressor(), train_x, train_y, test_x, test_y)

Train RMSE: 1039.6103, Test RMSE: 1037.7474
Train R²: 0.6346, Test R²: 0.6038


(array([2949.18867694, 2841.29234574, 1661.25824279, ..., 3944.53067851,
        1910.18316175, 1533.01836422]),
 array([1343.96538953,  691.79510098,  765.37105818, ...,  749.72991684,
         673.92624331, 1643.55905564]),
 GradientBoostingRegressor(),
 1039.6103175225023,
 1037.7473737701903,
 0.6346277019965068,
 0.6037779720262129)

In [15]:
train_and_predict(AdaBoostRegressor(), train_x, train_y, test_x, test_y)

Train RMSE: 1251.4736, Test RMSE: 1212.6141
Train R²: 0.4705, Test R²: 0.4590


(array([3429.77864852, 3215.6378298 , 1982.60245967, ..., 4346.37466204,
        2281.27757662, 1760.94703754]),
 array([1447.48700183, 1165.65458428, 1165.65458428, ..., 1165.65458428,
        1056.3068431 , 2145.61856422]),
 AdaBoostRegressor(),
 1251.473605045952,
 1212.6141219545245,
 0.4705342846714349,
 0.45899587827546984)

In [16]:
train_and_predict(XGBRegressor(), train_x, train_y, test_x, test_y)

Train RMSE: 553.3589, Test RMSE: 1111.7636
Train R²: 0.8965, Test R²: 0.5452


(array([2736.5752, 3134.9272, 1414.7827, ..., 5176.3335, 1529.4888,
        1664.8395], dtype=float32),
 array([1214.5975 ,  965.4787 ,  945.63153, ...,  656.35474, 1196.7532 ,
        1424.6592 ], dtype=float32),
 XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...),
 553.3589116715312,
 1111.7

In [17]:

train_and_predict(HistGradientBoostingRegressor(), train_x, train_y, test_x, test_y)

Train RMSE: 869.1222, Test RMSE: 1058.6668
Train R²: 0.7446, Test R²: 0.5876


(array([2758.99564753, 2724.45234126, 1705.57635672, ..., 4909.43097167,
        1833.85219885, 1537.44444293]),
 array([1248.91897069,  683.43599501,  694.3247244 , ...,  786.04042542,
         603.133507  , 1608.23357731]),
 HistGradientBoostingRegressor(),
 869.1221711443889,
 1058.6668326626077,
 0.7446380932926133,
 0.5876424555421988)

## Summary
- we can say for above results Gradient boosting  is performing well compare to another models so we tune the model for better results


In [21]:
# Define the GradientBoostingRegressor model
gbr = GradientBoostingRegressor()

# Define the hyperparameter grid to tune
param_grid = {
    'n_estimators': [50, 100, 150, 200, 250],
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'max_depth': [3, 5, 7, 9, 11],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'max_features': ['auto', 'sqrt', 'log2', None],
    'loss': ['ls', 'huber', 'absolute_error']
}
# Set up randomSearchCV
random_search = RandomizedSearchCV(estimator=gbr, param_distributions=param_grid,n_iter=10, cv=5, verbose=2, n_jobs=-1, scoring='neg_mean_squared_error')

# Fit random search
random_search.fit(train_x, train_y)

# Get the best parameters and the best model
print("Best Hyperparameters:", random_search.best_params_)
best_rf = random_search.best_estimator_

# Evaluate on test data
test_rmse = np.sqrt(-random_search.score(test_x,test_y))  # RMSE from neg_mean_squared_error scoring
print(f'Test RMSE: {test_rmse}')

Fitting 5 folds for each of 10 candidates, totalling 50 fits


25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\hp\anaconda3.x\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\hp\anaconda3.x\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\Users\hp\anaconda3.x\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\hp\anaconda3.x\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterErro

Best Hyperparameters: {'subsample': 0.8, 'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 10, 'max_features': None, 'max_depth': 5, 'loss': 'huber', 'learning_rate': 0.2}
Test RMSE: 1045.9663190881909


random forest:Best Hyperparameters: {'bootstrap': True, 'max_depth': 30, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Test RMSE: 1049.931056226055

gradient boosting=Best Hyperparameters: {'subsample': 0.8, 'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'max_depth': 9, 'loss': 'absolute_error', 'learning_rate': 0.2}
Test RMSE: 1064.0094990492717

In [27]:


# Initialize the GradientBoostingRegressor with the specified hyperparameters
gbr = GradientBoostingRegressor(
    'bootstrap'=True,
    'max_depth'= 30,
    'max_features'= 'log2',
    'min_samples_leaf'= 2,
    'min_samples_split'= 2,
    'n_estimators'= 100
) 

# Fit the model on the training data
gbr.fit(train_x, train_y)

# Predict on training data
predict_value_train = gbr.predict(train_x)

# Calculate RMSE and R² on the training data
train_rmse = np.sqrt(mean_squared_error(train_y, predict_value_train))
train_r2 = r2_score(train_y, predict_value_train)
print(f'Train RMSE: {train_rmse}')
print(f'Train R²: {train_r2}')

# Predict on test data
test_predict = gbr.predict(test_x)

# Calculate RMSE and R² on the test data (assuming you have test_y as the true values)
test_rmse = np.sqrt(mean_squared_error(test_y, test_predict))
test_r2 = r2_score(test_y, test_predict)
print(f'Test RMSE: {test_rmse}')
print(f'Test R²: {test_r2}')


Train RMSE: 969.5338911471888
Train R²: 0.6822244752046414
Test RMSE: 1057.6249115445685
Test R²: 0.5884537260882967


In [31]:


# Initialize the GradientBoostingRegressor with the specified hyperparameters
rf = RandomForestRegressor(
    n_estimators=100,                # Number of trees in the forest
    min_samples_split=2,            # Minimum number of samples required to split an internal node
    min_samples_leaf=2,            # Minimum number of samples required at each leaf node
    max_features='log2',            # Number of features to consider for the best split
    max_depth=30,                    # Maximum depth of the tree
    bootstrap=True                  # Whether to use bootstrap samples
)

# Fit the model on the training data
rf.fit(train_x, train_y)

# Predict on training data
predict_value_train =rf.predict(train_x)

# Calculate RMSE and R² on the training data
train_rmse = np.sqrt(mean_squared_error(train_y, predict_value_train))
train_r2 = r2_score(train_y, predict_value_train)
print(f'Train RMSE: {train_rmse}')
print(f'Train R²: {train_r2}')

# Predict on test data
test_predict = rf.predict(test_x)

# Calculate RMSE and R² on the test data (assuming you have test_y as the true values)
test_rmse = np.sqrt(mean_squared_error(test_y, test_predict))
test_r2 = r2_score(test_y, test_predict)
print(f'Test RMSE: {test_rmse}')
print(f'Test R²: {test_r2}')


Train RMSE: 763.4573659491014
Train R²: 0.8029556284917818
Test RMSE: 1046.9612959680212
Test R²: 0.5967108068193415


In [None]:
n_estimators=659, max_depth=5, min_samples_split=7, min_samples_leaf=5, 
                                       oob_score=True, n_jobs=-1)

In [35]:
# Initialize the GradientBoostingRegressor with the specified hyperparameters
rf = RandomForestRegressor(
    n_estimators=659, max_depth=5, min_samples_split=7, min_samples_leaf=5, 
                                       oob_score=True, n_jobs=-1)                 


# Fit the model on the training data
rf.fit(train_x, train_y)

# Predict on training data
predict_value_train =rf.predict(train_x)

# Calculate RMSE and R² on the training data
train_rmse = np.sqrt(mean_squared_error(train_y, predict_value_train))
train_r2 = r2_score(train_y, predict_value_train)
print(f'Train RMSE: {train_rmse}')
print(f'Train R²: {train_r2}')

# Predict on test data
test_predict = rf.predict(test_x)

# Calculate RMSE and R² on the test data (assuming you have test_y as the true values)
test_rmse = np.sqrt(mean_squared_error(test_y, test_predict))
test_r2 = r2_score(test_y, test_predict)
print(f'Test RMSE: {test_rmse}')
print(f'Test R²: {test_r2}')

Train RMSE: 1064.8398556270176
Train R²: 0.6166786133744738
Test RMSE: 1019.1116866000187
Test R²: 0.6178807697254921


In [53]:

# Initialize the GradientBoostingRegressor with the specified hyperparameters
gbr = GradientBoostingRegressor(n_estimators=533, learning_rate=.0214, max_depth=4, min_samples_leaf=4,min_samples_split=3)

# Fit the model on the training data
gbr.fit(train_x, train_y)

# Predict on training data
predict_value_train = gbr.predict(train_x)

# Calculate RMSE and R² on the training data
train_rmse = np.sqrt(mean_squared_error(train_y, predict_value_train))
train_r2 = r2_score(train_y, predict_value_train)
print(f'Train RMSE: {train_rmse}')
print(f'Train R²: {train_r2}')

# Predict on test data
test_predict = gbr.predict(test_x)

# Calculate RMSE and R² on the test data (assuming you have test_y as the true values)
test_rmse = np.sqrt(mean_squared_error(test_y, test_predict))
test_r2 = r2_score(test_y, test_predict)
print(f'Test RMSE: {test_rmse}')
print(f'Test R²: {test_r2}')


Train RMSE: 987.6750148843141
Train R²: 0.6702213088916251
Test RMSE: 1044.0149504806225
Test R²: 0.5989774757954465
