In [127]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor,StackingRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import root_mean_squared_error,r2_score
import optuna
import warnings
warnings.filterwarnings("ignore")

In [128]:
# Read both CSVs from one project-relative folder instead of a machine-specific
# absolute path, so the notebook is not tied to a single user's disk layout.
# Forward slashes work on Windows and POSIX alike.
# NOTE(review): assumes the notebook is started from the directory containing
# `Big_Mart_Sales_Prediction/` (the original test path already required this)
# and that the train file sits next to the test file — confirm.
DATA_DIR = "Big_Mart_Sales_Prediction"
train = pd.read_csv(f"{DATA_DIR}/train_v9rqX0R.csv")
test = pd.read_csv(f"{DATA_DIR}/test_AbJTz2l.csv")

In [129]:
# Keep an independent copy of the raw test data; later cells read
# Item_Identifier / Outlet_Identifier from it when building submissions.
df = test.copy(deep=True)

# **Pre-processing**

In [130]:
def pre_processing(df):
    """Impute, encode and trim a raw frame so it can be fed to the models.

    Steps, in order:
      1. Fill missing ``Outlet_Size`` with that column's mode and missing
         ``Item_Weight`` with that column's median (both computed on ``df``).
      2. One-hot encode the nominal columns (first level dropped, int dummies).
      3. Map ``Outlet_Size`` to ordinal codes Small=1, Medium=2, High=3;
         any other label becomes NaN.
      4. Drop ``Item_Identifier``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing the columns referenced above.

    Returns
    -------
    pandas.DataFrame
        A new, processed frame. The input frame is left untouched.
    """
    one_hot_columns = [
        'Item_Fat_Content',
        'Item_Type',
        'Outlet_Type',
        'Outlet_Identifier',
        'Outlet_Location_Type',
    ]
    outlet_size_codes = {'Small': 1, 'Medium': 2, 'High': 3}

    # Filling missing values.
    # `assign` builds a new frame, so the caller's DataFrame is never written to
    # (plain `df[col] = ...` would modify the object that was passed in).
    prepared = df.assign(
        Outlet_Size=df['Outlet_Size'].fillna(df['Outlet_Size'].mode()[0]),
        Item_Weight=df['Item_Weight'].fillna(df['Item_Weight'].median()),
    )

    # Feature encoding: one-hot for nominal columns ...
    prepared = pd.get_dummies(
        prepared,
        columns=one_hot_columns,
        drop_first=True,
        dtype='int',
    )

    # ... and an ordinal mapping for the outlet size.
    prepared['Outlet_Size'] = prepared['Outlet_Size'].map(outlet_size_codes)

    # Dropping unnecessary columns
    return prepared.drop(columns=['Item_Identifier'])


In [131]:
train = pre_processing(train)

# Train and test are one-hot encoded independently, so a category present in
# only one of them would give the two frames different columns. Force the test
# matrix onto the training feature columns (same names, same order): dummy
# columns absent from test are filled with 0, columns never seen in training
# are dropped. When the columns already match this is a no-op.
feature_columns = train.columns.drop("Item_Outlet_Sales")
test = pre_processing(test).reindex(columns=feature_columns, fill_value=0)

In [132]:
# Separate the target column from the predictors.
y = train['Item_Outlet_Sales']
X = train.drop(columns=['Item_Outlet_Sales'])


# **OPTUNA with Gradient Boosting**

In [133]:
# Hold out 20% of the rows for validation (fixed seed keeps the split repeatable)
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Hyperparameter tuning using Optuna
def objective(trial):
    """Optuna objective: mean 5-fold CV RMSE of a GradientBoostingRegressor.

    Draws one hyperparameter configuration from ``trial``, cross-validates it
    on the module-level ``X_train`` / ``y_train`` and returns the mean RMSE
    across folds (lower is better).
    """
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 500),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.2),
        subsample=trial.suggest_float('subsample', 0.6, 1.0),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 20),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2', 0.5, 0.7, 0.9]),
        random_state=42,  # fixed, not tuned
    )

    candidate = GradientBoostingRegressor(**search_space)

    # The scorer yields negated RMSE per fold, so flip the sign of the mean
    # to hand Optuna a value it can minimise.
    neg_rmse_per_fold = cross_val_score(
        candidate,
        X_train,
        y_train,
        scoring='neg_root_mean_squared_error',
        cv=5,
    )
    return -neg_rmse_per_fold.mean()

# Optimize with Optuna. The sampler is seeded so that re-running the cell
# explores the same sequence of trials.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=70)

# Get Best Parameters
best_params_gb = study.best_params
print("Best Parameters:", best_params_gb)

# Train Final Model with Optimized Parameters.
# `best_params` only contains the values drawn through `trial.suggest_*`; the
# fixed random_state=42 used inside `objective` is not part of it and has to be
# passed again, otherwise this model (subsample < 1 is stochastic) is unseeded
# and differs from the configuration that was actually tuned.
final_model = GradientBoostingRegressor(**best_params_gb, random_state=42)
final_model.fit(X_train, y_train)

# Predict on Validation & Test Set
y_valid_pred = final_model.predict(X_valid)
y_test_pred = final_model.predict(test)

# Evaluate RMSE on the hold-out split
rmse = np.sqrt(mean_squared_error(y_valid, y_valid_pred))
print(f"Final RMSE: {rmse:.4f}")


[I 2025-02-17 00:03:40,491] A new study created in memory with name: no-name-6c12a70c-951a-405e-96c9-79b87a8a23e4


[I 2025-02-17 00:03:42,373] Trial 0 finished with value: 1108.1718890312284 and parameters: {'n_estimators': 118, 'max_depth': 5, 'learning_rate': 0.06377015004170335, 'subsample': 0.9155804220616784, 'min_samples_split': 5, 'min_samples_leaf': 10, 'max_features': 'sqrt'}. Best is trial 0 with value: 1108.1718890312284.
[I 2025-02-17 00:03:48,873] Trial 1 finished with value: 1188.9411339341327 and parameters: {'n_estimators': 460, 'max_depth': 6, 'learning_rate': 0.13392044173329234, 'subsample': 0.6670745071554474, 'min_samples_split': 15, 'min_samples_leaf': 5, 'max_features': 'sqrt'}. Best is trial 0 with value: 1108.1718890312284.
[I 2025-02-17 00:03:54,116] Trial 2 finished with value: 1187.9670230978968 and parameters: {'n_estimators': 259, 'max_depth': 9, 'learning_rate': 0.10937004648994128, 'subsample': 0.762623393335776, 'min_samples_split': 16, 'min_samples_leaf': 4, 'max_features': 'sqrt'}. Best is trial 0 with value: 1108.1718890312284.
[I 2025-02-17 00:03:57,495] Trial 3

Best Parameters: {'n_estimators': 110, 'max_depth': 4, 'learning_rate': 0.04766533065971283, 'subsample': 0.768458972526026, 'min_samples_split': 8, 'min_samples_leaf': 8, 'max_features': 0.7}
Final RMSE: 1023.9385


In [134]:
# Refit the tuned gradient-boosting model on the full training set
gbr = GradientBoostingRegressor(random_state=42, **best_params_gb)
gbr.fit(X, y)

In [135]:
# Predict on the processed test set
y_pred_gb_2 = gbr.predict(test)

# Overwrite negative predictions with 0
y_pred_gb_2[y_pred_gb_2 < 0] = 0

In [136]:
# Build the submission from the identifiers kept in the raw test copy `df`.
# `.assign` returns a new frame (nothing is written into a slice of `df`) and
# takes the prediction array positionally, so the result does not depend on
# `df` having a default 0..n-1 index.
submission = df[['Item_Identifier', 'Outlet_Identifier']].assign(
    Item_Outlet_Sales=y_pred_gb_2
)

submission.to_csv("Trial_gb_01.csv", index=False)

# **OPTUNA with XGBRegressor**

In [138]:
# Hold out 20% of the rows for validation (fixed seed keeps the split repeatable)
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 📌 **Hyperparameter Tuning Using Optuna**
def objective(trial):
    """Optuna objective: mean 5-fold CV RMSE of an XGBRegressor.

    Draws one hyperparameter configuration from ``trial``, cross-validates it
    on the module-level ``X_train`` / ``y_train`` and returns the mean RMSE
    across folds (lower is better).
    """
    draw_int = trial.suggest_int
    draw_float = trial.suggest_float

    search_space = {
        'n_estimators': draw_int('n_estimators', 100, 500),
        'max_depth': draw_int('max_depth', 3, 10),
        'learning_rate': draw_float('learning_rate', 0.01, 0.2),
        'subsample': draw_float('subsample', 0.6, 1.0),
        'colsample_bytree': draw_float('colsample_bytree', 0.6, 1.0),
        'alpha': draw_float('alpha', 0, 10),
        'lambda': draw_float('lambda', 0, 10),
        'random_state': 42,  # fixed, not tuned
    }

    candidate = XGBRegressor(**search_space)

    # The scorer yields negated RMSE per fold, so flip the sign of the mean
    # to hand Optuna a value it can minimise.
    neg_rmse_per_fold = cross_val_score(
        candidate,
        X_train,
        y_train,
        scoring='neg_root_mean_squared_error',
        cv=5,
    )
    return -neg_rmse_per_fold.mean()

# Optimize with Optuna. The sampler is seeded so that re-running the cell
# explores the same sequence of trials.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=70)

# Get Best Parameters
best_params = study.best_params
print("Best Parameters:", best_params)

# Train Final Model with Optimized Parameters.
# `best_params` only contains the values drawn through `trial.suggest_*`; the
# fixed random_state=42 used inside `objective` is not part of it and has to be
# passed again so this model matches the configuration that was tuned (and the
# full-data refit in the next cell).
final_model = XGBRegressor(**best_params, random_state=42)
final_model.fit(X_train, y_train)

# Predict on the validation set
y_valid_pred = final_model.predict(X_valid)

# Evaluate RMSE on the hold-out split
rmse = np.sqrt(mean_squared_error(y_valid, y_valid_pred))
print(f"Final RMSE: {rmse:.4f}")

[I 2025-02-17 00:11:55,861] A new study created in memory with name: no-name-18db2ed0-437b-4a74-8681-57b6b6ae4dca


[I 2025-02-17 00:11:57,092] Trial 0 finished with value: 1161.7562705431387 and parameters: {'n_estimators': 311, 'max_depth': 4, 'learning_rate': 0.15126054454754112, 'subsample': 0.757275498974215, 'colsample_bytree': 0.8465990147214, 'alpha': 9.860738587806232, 'lambda': 0.6169991672658326}. Best is trial 0 with value: 1161.7562705431387.
[I 2025-02-17 00:11:58,329] Trial 1 finished with value: 1170.2479134528808 and parameters: {'n_estimators': 153, 'max_depth': 6, 'learning_rate': 0.19445449348325108, 'subsample': 0.8031688772097925, 'colsample_bytree': 0.8251667823926416, 'alpha': 6.034813185422197, 'lambda': 6.492222528127841}. Best is trial 0 with value: 1161.7562705431387.
[I 2025-02-17 00:12:00,609] Trial 2 finished with value: 1141.4808219248728 and parameters: {'n_estimators': 114, 'max_depth': 10, 'learning_rate': 0.05229883457154081, 'subsample': 0.95503783895043, 'colsample_bytree': 0.9216438783238936, 'alpha': 8.126702190117586, 'lambda': 7.252918978210705}. Best is tri

Best Parameters: {'n_estimators': 244, 'max_depth': 3, 'learning_rate': 0.04310739876703656, 'subsample': 0.9375273130501537, 'colsample_bytree': 0.9504044362953034, 'alpha': 3.643944919771124, 'lambda': 2.4539677700365248}
Final RMSE: 1031.0284


In [139]:
# Refit the tuned XGBoost model on the full training set
xgb = XGBRegressor(random_state=42, **best_params)
xgb.fit(X, y)

In [140]:
# Predict on the processed test set
y_pred_xgb = xgb.predict(test)

# Keep every prediction that is not negative; negative ones become 0
y_pred_xgb = np.where(~(y_pred_xgb < 0), y_pred_xgb, 0)

In [141]:
# Build the submission from the identifiers kept in the raw test copy `df`.
# `.assign` returns a new frame (nothing is written into a slice of `df`) and
# takes the prediction array positionally, so the result does not depend on
# `df` having a default 0..n-1 index.
submission = df[['Item_Identifier', 'Outlet_Identifier']].assign(
    Item_Outlet_Sales=y_pred_xgb
)

submission.to_csv("Trial_xgb_02.csv", index=False)

# **Stacking the tuned models to improve further**

In [144]:
# Stack the two tuned regressors. No final_estimator is passed, so
# StackingRegressor falls back to its default meta-learner.
stacking_model = StackingRegressor(
    estimators=[
        ('XGB', xgb),
        ('GBR', gbr),
    ],
)

In [145]:
# Fit the stacked ensemble on the full training set
stacking_model.fit(X, y)

# Predict on the processed test set
y_pred_s = stacking_model.predict(test)

# Overwrite negative predictions with 0
y_pred_s[y_pred_s < 0] = 0

In [146]:
# Build the submission from the identifiers kept in the raw test copy `df`.
# `.assign` returns a new frame (nothing is written into a slice of `df`) and
# takes the prediction array positionally, so the result does not depend on
# `df` having a default 0..n-1 index.
submission = df[['Item_Identifier', 'Outlet_Identifier']].assign(
    Item_Outlet_Sales=y_pred_s
)

submission.to_csv("Trial_03_stack.csv", index=False)

In [150]:
from joblib import dump

# Persist the gradient-boosting model that was refit on the full training set
dump(gbr, filename='model.joblib')

['model.joblib']

In [151]:
# Reload the raw training data from the project-relative data folder rather
# than a machine-specific absolute path.
# NOTE(review): assumes the notebook runs from the directory containing
# `Big_Mart_Sales_Prediction/` — confirm.
# From here on `df` holds raw TRAINING rows; the submission cells above expect
# it to hold the raw test copy, so do not re-run them after this cell.
df = pd.read_csv("Big_Mart_Sales_Prediction/train_v9rqX0R.csv")

In [152]:
import joblib

# Encode the raw training features and store the resulting column names.
# The raw frame is handed straight to pre_processing instead of being bound to
# `X`, so the encoded training matrix `X` used by the model cells above is not
# replaced by an un-encoded frame.
X_encoded = pre_processing(df.drop(columns=["Item_Outlet_Sales"]))
joblib.dump(list(X_encoded.columns), "expected_features.pkl")  # Save column names


['expected_features.pkl']

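**Scores after submitting to BIG MART SALES PREDICTION**
- Trial 1 (`Trial_gb_01.csv`, Gradient Boosting): RMSE = 1152.4213965440
- Trial 2 (`Trial_xgb_02.csv`, XGBoost): RMSE = 1151.8486559269
- Trial 3 (`Trial_03_stack.csv`, stacking): RMSE = 1150.0762040430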