<a href="https://colab.research.google.com/github/Pk22012003/SalesForecaster/blob/main/bigmartXGRF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Read the BigMart training table into a DataFrame. The path sits under
# /content, so this cell expects Train.csv to have been uploaded there first.
big_mart_data = pd.read_csv(filepath_or_buffer='/content/Train.csv')

In [None]:
# Handle missing values
# Item_Weight: replace gaps with the column mean.
big_mart_data['Item_Weight'] = big_mart_data['Item_Weight'].fillna(big_mart_data['Item_Weight'].mean())


def _most_frequent(values):
    """Return the most frequent non-null value of `values`, or NaN if there is none."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# Outlet_Size: fill each gap with the most common size observed for the same
# Outlet_Type. The lookup is a plain Series indexed by Outlet_Type, so
# Series.map yields one scalar per row (no per-row Series / DataFrame alignment).
mode_outlet_size = big_mart_data.groupby('Outlet_Type')['Outlet_Size'].agg(_most_frequent)
missing = big_mart_data['Outlet_Size'].isnull()
big_mart_data.loc[missing, 'Outlet_Size'] = big_mart_data.loc[missing, 'Outlet_Type'].map(mode_outlet_size)

# An outlet type with no observed size at all would still be NaN here; fall
# back to the overall most common size so later encoding sees no missing values.
if big_mart_data['Outlet_Size'].isnull().any():
    big_mart_data['Outlet_Size'] = big_mart_data['Outlet_Size'].fillna(_most_frequent(big_mart_data['Outlet_Size']))


In [None]:
# Collapse the spelling variants of Item_Fat_Content onto the two canonical
# labels 'Low Fat' and 'Regular'; values not listed here are left unchanged.
big_mart_data['Item_Fat_Content'] = big_mart_data['Item_Fat_Content'].replace(
    {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
)

In [None]:
# Derive four extra columns in a single pass:
#   Item_Category         - first two characters of Item_Identifier
#   MRP_Cluster           - Item_MRP bucketed into (0,70], (70,140], (140,200], (200,300]
#   Outlet_Years          - 2025 minus the establishment year
#   Item_Outlet_Sales_Log - log1p of the sales column
# NOTE: Item_Outlet_Sales_Log is computed from the sales column that is later
# used as the prediction target, so it must never be used as a model input.
big_mart_data = big_mart_data.assign(
    Item_Category=lambda frame: frame['Item_Identifier'].map(lambda identifier: identifier[:2]),
    MRP_Cluster=lambda frame: pd.cut(frame['Item_MRP'], bins=[0, 70, 140, 200, 300], labels=[1, 2, 3, 4]),
    Outlet_Years=lambda frame: 2025 - frame['Outlet_Establishment_Year'],
    Item_Outlet_Sales_Log=lambda frame: np.log1p(frame['Item_Outlet_Sales']),
)

In [None]:
# Replace every categorical column with integer codes. A single LabelEncoder
# is re-fitted for each column in turn, so after this cell `le` only holds the
# classes of the last column it saw (MRP_Cluster).
le = LabelEncoder()
cols = [
    'Item_Identifier',
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
    'Item_Category',
]
for col in cols:
    big_mart_data[col] = le.fit_transform(big_mart_data[col])

# MRP_Cluster is a pandas Categorical (from pd.cut); cast it to strings before
# encoding so the encoder receives plain labels.
mrp_cluster_labels = big_mart_data['MRP_Cluster'].astype(str)
big_mart_data['MRP_Cluster'] = le.fit_transform(mrp_cluster_labels)

In [None]:
# Put the target on the log1p scale, overwriting the raw sales column.
# Caution: this cell is not idempotent - running it a second time applies
# log1p again to the already-transformed values.
big_mart_data = big_mart_data.assign(
    Item_Outlet_Sales=lambda frame: np.log1p(frame['Item_Outlet_Sales'])
)

In [None]:
# Features and target
# The feature matrix must exclude the row identifier, the target itself, and
# every column computed from the target. Item_Outlet_Sales_Log is log1p of the
# sales column, i.e. the same quantity as `y`; leaving it in X lets the models
# read the answer directly and produces meaninglessly near-perfect scores.
X = (
    big_mart_data
    .drop(columns=['Item_Identifier', 'Item_Outlet_Sales'])
    # errors='ignore': the leak column may already have been removed upstream.
    .drop(columns=['Item_Outlet_Sales_Log'], errors='ignore')
)
y = big_mart_data['Item_Outlet_Sales']

# Guard against target leakage being reintroduced by a future derived column.
leaky_columns = [name for name in X.columns if str(name).startswith('Item_Outlet_Sales')]
assert not leaky_columns, f"target-derived columns present in features: {leaky_columns}"

In [None]:
# Hold out 20% of the rows for final evaluation. Plain NumPy arrays are passed
# so that the later fold indices can address rows positionally.
X_train, X_test, y_train, y_test = train_test_split(
    X.to_numpy(),
    y.to_numpy(),
    test_size=0.2,
    random_state=42,
)

In [None]:
# Level-0 learners for the stack: one gradient-boosted ensemble and one
# random forest, both seeded for repeatability.
xgb_model = XGBRegressor(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42,
)

In [None]:
# Five shuffled folds, plus zero-filled holders for the stacked features:
# one column per base model (two models -> two columns).
kf = KFold(n_splits=5, shuffle=True, random_state=42)
n_train = len(X_train)
n_test = len(X_test)
blend_train = np.zeros(shape=(n_train, 2))
blend_test = np.zeros(shape=(n_test, 2))

In [None]:
# Build the meta-features. For each base model:
#   * every training row gets a prediction from a fit that did not see it
#     (out-of-fold), stored in blend_train;
#   * the held-out test set is predicted once per fold and the per-fold
#     predictions are averaged into blend_test.
for model_idx, base_model in enumerate((xgb_model, rf_model)):
    fold_test_preds = np.zeros((n_test, kf.n_splits))
    for fold_idx, (fit_rows, holdout_rows) in enumerate(kf.split(X_train)):
        base_model.fit(X_train[fit_rows], y_train[fit_rows])
        blend_train[holdout_rows, model_idx] = base_model.predict(X_train[holdout_rows])
        fold_test_preds[:, fold_idx] = base_model.predict(X_test)
    blend_test[:, model_idx] = fold_test_preds.mean(axis=1)

In [None]:
# Level-1 learner: a ridge regression over the two base-model prediction
# columns. Predictions come out on the log1p scale, so both they and the
# held-out targets are mapped back with expm1 before scoring.
meta_model = Ridge(alpha=1.0)
meta_model.fit(blend_train, y_train)

final_pred_log = meta_model.predict(blend_test)
final_pred, y_test_actual = np.expm1(final_pred_log), np.expm1(y_test)

In [None]:
# Score the stacked predictions on the original (un-logged) sales scale.
mse = mean_squared_error(y_true=y_test_actual, y_pred=final_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_true=y_test_actual, y_pred=final_pred)
r2 = r2_score(y_true=y_test_actual, y_pred=final_pred)

# MAPE divides by the actual values, so it is undefined if any actual is zero.
abs_pct_error = np.abs((y_test_actual - final_pred) / y_test_actual)
mape = np.mean(abs_pct_error) * 100

In [None]:
# Emit the metric report, one line per metric; MAPE is rounded to two decimals.
print(
    "Model Performance Metrics:",
    f"Mean Squared Error (MSE): {mse}",
    f"Root Mean Squared Error (RMSE): {rmse}",
    f"Mean Absolute Error (MAE): {mae}",
    f"R2 Score: {r2}",
    f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%",
    sep="\n",
)

Model Performance Metrics:
Mean Squared Error (MSE): 2237.7683737691495
Root Mean Squared Error (RMSE): 47.30505653489011
Mean Absolute Error (MAE): 15.877124383847619
R2 Score: 0.9991766765935893
Mean Absolute Percentage Error (MAPE): 0.93%
