In [1]:

# XGBoost Modeling for Big Mart Sales

import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# For scaling, splitting, metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# XGBoost
!pip install xgboost
import xgboost as xgb


# 1. LOAD DATA

train = pd.read_csv("C:/Users/somas/Documents/BigMart Sales Prediction/train_v9rqX0R.csv")
test_original = pd.read_csv("C:/Users/somas/Documents/BigMart Sales Prediction/test_AbJTz2l.csv")
test = test_original.copy()


# 2. DATA CLEANING

train['Item_Weight'].fillna(train['Item_Weight'].median(), inplace=True)
test['Item_Weight'].fillna(test['Item_Weight'].median(), inplace=True)

train['Outlet_Size'].fillna('Unknown', inplace=True)
test['Outlet_Size'].fillna('Unknown', inplace=True)

train['Item_Fat_Content'].replace({'LF':'Low Fat','low fat':'Low Fat','reg':'Regular'}, inplace=True)
test['Item_Fat_Content'].replace({'LF':'Low Fat','low fat':'Low Fat','reg':'Regular'}, inplace=True)

# Zero visibility => median
visibility_median = train.groupby('Item_Identifier')['Item_Visibility'].median()
train.loc[train['Item_Visibility']==0,'Item_Visibility'] = \
    train['Item_Identifier'].map(visibility_median)
test.loc[test['Item_Visibility']==0,'Item_Visibility'] = \
    test['Item_Identifier'].map(visibility_median)


# 3. FEATURE ENGINEERING

# Outlet_Age
train['Outlet_Age'] = 2023 - train['Outlet_Establishment_Year']
test['Outlet_Age'] = 2023 - test['Outlet_Establishment_Year']
train.drop('Outlet_Establishment_Year', axis=1, inplace=True)
test.drop('Outlet_Establishment_Year', axis=1, inplace=True)

# Aggregator features
item_mean_sales = train.groupby('Item_Identifier')['Item_Outlet_Sales'].mean().reset_index()
item_mean_sales.columns = ['Item_Identifier','Item_Mean_Sales']

outlet_mean_sales = train.groupby('Outlet_Identifier')['Item_Outlet_Sales'].mean().reset_index()
outlet_mean_sales.columns = ['Outlet_Identifier','Outlet_Mean_Sales']

train = pd.merge(train, item_mean_sales, on='Item_Identifier', how='left')
train = pd.merge(train, outlet_mean_sales, on='Outlet_Identifier', how='left')

test = pd.merge(test, item_mean_sales, on='Item_Identifier', how='left')
test = pd.merge(test, outlet_mean_sales, on='Outlet_Identifier', how='left')

global_mean = train['Item_Outlet_Sales'].mean()
test['Item_Mean_Sales'].fillna(global_mean, inplace=True)
test['Outlet_Mean_Sales'].fillna(global_mean, inplace=True)


# 4. LOG TRANSFORM THE TARGET

train['Log_Sales'] = np.log1p(train['Item_Outlet_Sales'])


# 5. ENCODE CATEGORICAL

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
cat_cols = [
    'Item_Identifier','Item_Fat_Content','Item_Type',
    'Outlet_Identifier','Outlet_Size','Outlet_Location_Type','Outlet_Type'
]

for col in cat_cols:
    train[col] = encoder.fit_transform(train[col])
    test[col]  = encoder.transform(test[col])


# 6. FINAL FEATURE SETS

feature_cols = [
    'Item_Identifier','Item_Weight','Item_Fat_Content','Item_Visibility',
    'Item_Type','Item_MRP','Outlet_Identifier','Outlet_Size',
    'Outlet_Location_Type','Outlet_Type','Outlet_Age',
    'Item_Mean_Sales','Outlet_Mean_Sales'
]

X = train[feature_cols]
y = train['Log_Sales']


# 7. TRAIN-VALID SPLIT

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)


# 8. SCALING

scaler = StandardScaler()
num_cols = [
    'Item_Weight','Item_Visibility','Item_MRP',
    'Outlet_Age','Item_Mean_Sales','Outlet_Mean_Sales'
]

X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_valid[num_cols] = scaler.transform(X_valid[num_cols])
test[num_cols] = scaler.transform(test[num_cols])


# 9. XGBOOST MODELLING

# Basic param (tune as needed)
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
    n_estimators=300,   # tune
    max_depth=8,        # tune
    learning_rate=0.05, # tune
    subsample=0.8,      # tune
    colsample_bytree=0.8
)

# Train on train split
xgb_model.fit(X_train, y_train)

# Predict in log space
valid_preds_log = xgb_model.predict(X_valid)
valid_preds = np.expm1(valid_preds_log)
valid_actuals = np.expm1(y_valid)

rmse = np.sqrt(mean_squared_error(valid_actuals, valid_preds))
print(f'[XGBoost] Validation RMSE: {rmse:.4f}')


# 10. FINAL TRAINING ON FULL DATA

# Re-scale entire X
X[num_cols] = scaler.fit_transform(X[num_cols])
xgb_model.fit(X, y)

# Predict on test
test_preds_log = xgb_model.predict(test[feature_cols])
test_preds = np.expm1(test_preds_log)


# 11. SUBMISSION

submission = test_original[['Item_Identifier','Outlet_Identifier']].copy()
submission['Item_Outlet_Sales'] = test_preds

sub_path = os.path.join(os.getcwd(), 'XGB_BigMart_Sales.csv')
submission.to_csv(sub_path, index=False)
print(f'[XGBoost] Submission file saved: {sub_path}')


Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.4-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 1.0/124.9 MB 7.2 MB/s eta 0:00:18
    --------------------------------------- 2.1/124.9 MB 6.5 MB/s eta 0:00:19
   - -------------------------------------- 3.9/124.9 MB 7.1 MB/s eta 0:00:17
   -- ------------------------------------- 6.3/124.9 MB 8.2 MB/s eta 0:00:15
   -- ------------------------------------- 7.6/124.9 MB 8.5 MB/s eta 0:00:14
   --- ------------------------------------ 9.7/124.9 MB 8.3 MB/s eta 0:00:14
   --- ------------------------------------ 10.7/124.9 MB 7.7 MB/s eta 0:00:15
   --- ------------------------------------ 11.0/124.9 MB 7.2 MB/s eta 0:00:16
   --- ------------------------------------ 11.0/124.9 MB 7.2 MB/s eta 0:00:16
   --- ------------------------------------ 11.3/124.9 MB 6.2 MB/s eta 0:00:

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['Item_Weight'].fillna(train['Item_Weight'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['Item_Weight'].fillna(test['Item_Weight'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the 

[XGBoost] Validation RMSE: 1078.4308


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[num_cols] = scaler.fit_transform(X[num_cols])


[XGBoost] Submission file saved: C:\Users\somas\Documents\BigMart Sales Prediction\XGB_BigMart_Sales.csv
