In [24]:
# 📦 Imports
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

In [25]:
# 📂 Read the raw train and test CSVs into DataFrames
train_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ("train_v9rqX0R.csv", "test_AbJTz2l.csv")
)

In [26]:
# 🧪 Add placeholder target in test set
# A float NaN is used instead of None: None yields an object-dtype column, and
# whether pd.concat then keeps the combined target as float64 depends on the
# pandas version (only older releases ignore all-NA columns when inferring the
# result dtype). An object-dtype target would be selected by
# select_dtypes(include='object') further down and treated as a categorical.
test_df['Item_Outlet_Sales'] = float('nan')

In [27]:

# 🏷️ Tag every row with the frame it came from, so the combined frame
# can be split back into train and test after preprocessing
train_df = train_df.assign(source='train')
test_df = test_df.assign(source='test')

In [28]:
# 🔀 Stack train on top of test so both receive identical preprocessing;
# ignore_index=True gives the result a fresh 0..n-1 index
combined_df = pd.concat(
    (train_df, test_df),
    ignore_index=True,
)

In [29]:
# 🧹 Drop the two identifier columns and Item_Visibility
# NOTE(review): Item_Visibility does not look like an identifier — confirm
# that excluding it from the features is intentional.
combined_df = combined_df.drop(
    columns=['Item_Identifier', 'Outlet_Identifier', 'Item_Visibility']
)


In [30]:
# 🩹 Collapse alternate spellings of Item_Fat_Content onto the labels
# 'Low Fat' and 'Regular'; values not listed in the mapping are left untouched
fat_content_aliases = {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
combined_df['Item_Fat_Content'] = combined_df['Item_Fat_Content'].replace(fat_content_aliases)

In [31]:
# ⚖️ Fill missing Item_Weight values with the mean weight of the row's Item_Type
def _fill_with_group_mean(weights):
    """Return one group's weights with missing entries replaced by the group mean."""
    return weights.fillna(weights.mean())


combined_df['Item_Weight'] = (
    combined_df
    .groupby('Item_Type')['Item_Weight']
    .transform(_fill_with_group_mean)
)

In [32]:
# 🏬 Fill missing Outlet_Size with the most common size among rows sharing
# the same (Outlet_Type, Outlet_Location_Type) pair
def _fill_with_group_mode(sizes):
    """Return one group's sizes with missing entries replaced by the group mode.

    Falls back to 'Small' when the group has no mode (every size is missing).
    """
    modes = sizes.mode()
    fill_value = 'Small' if modes.empty else modes[0]
    return sizes.fillna(fill_value)


outlet_groups = combined_df.groupby(['Outlet_Type', 'Outlet_Location_Type'])
combined_df['Outlet_Size'] = outlet_groups['Outlet_Size'].transform(_fill_with_group_mode)


In [33]:
# 🧼 Safety net: replace any remaining missing values in float64/int64
# columns with that column's mean over the combined frame
num_cols = combined_df.select_dtypes(include=['float64', 'int64']).columns
num_imputer = SimpleImputer(strategy='mean')
num_imputer.fit(combined_df[num_cols])
combined_df[num_cols] = num_imputer.transform(combined_df[num_cols])


In [34]:

# 🧼 Safety net: replace any remaining missing values in object-dtype
# columns with that column's most frequent value
cat_cols = combined_df.select_dtypes(include='object').columns
cat_imputer = SimpleImputer(strategy='most_frequent')
cat_imputer.fit(combined_df[cat_cols])
combined_df[cat_cols] = cat_imputer.transform(combined_df[cat_cols])


In [35]:
# 🔤 Label Encoding of the object-dtype columns listed in cat_cols
# 'source' is skipped because it is still needed as a string to split the
# frame; 'Item_Outlet_Sales' is skipped so the regression target is never
# turned into class labels should it end up with object dtype.
categorical_cols = [
    col for col in cat_cols if col not in ('source', 'Item_Outlet_Sales')
]

# One fitted encoder per column is kept so each integer code can be mapped
# back to its original label (label_encoders[col].inverse_transform / .classes_).
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    combined_df[col] = le.fit_transform(combined_df[col])
    label_encoders[col] = le


In [36]:
# 🔁 Separate the combined frame again using the 'source' tag.
# The tag is removed from both halves; the test half also loses the
# Item_Outlet_Sales column.
train_rows = combined_df['source'] == 'train'
test_rows = combined_df['source'] == 'test'

final_train = combined_df[train_rows].drop(columns='source')
final_test = combined_df[test_rows].drop(columns=['source', 'Item_Outlet_Sales'])


In [37]:
# 💾 Write both processed frames to CSV without the index column
final_train.to_csv(path_or_buf="processed_train.csv", index=False)
final_test.to_csv(path_or_buf="processed_test.csv", index=False)

print("✅ Preprocessing complete. Files saved as 'processed_train.csv' and 'processed_test.csv'")


✅ Preprocessing complete. Files saved as 'processed_train.csv' and 'processed_test.csv'


In [38]:
import pandas as pd
from pycaret.regression import setup, compare_models, tune_model, finalize_model, save_model


In [39]:
# Read the preprocessed training set back from disk.
# Note: this rebinds train_df, which previously held the raw training data.
train_df = pd.read_csv(filepath_or_buffer="processed_train.csv")


In [40]:
# Configure the PyCaret regression experiment on the processed training data.
# session_id pins the random seed; verbose=False suppresses the setup output.
# NOTE(review): the label-encoded columns arrive here as integers, so they are
# presumably handled as numeric features — confirm this is intended.
setup_options = dict(
    data=train_df,
    target='Item_Outlet_Sales',
    session_id=123,
    verbose=False,
)
s = setup(**setup_options)

In [42]:
# Cross-validate the candidate regressors and keep the top-ranked one.
# The ranking metric is spelled out (R2 is the regression default) so it
# visibly matches the metric used for tuning.
best_model = compare_models(sort='R2')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,757.2546,1166436.289,1078.7933,0.5916,0.5542,0.5631,0.291
lightgbm,Light Gradient Boosting Machine,779.0081,1240902.6535,1112.2834,0.5657,0.5541,0.5682,0.121
rf,Random Forest Regressor,805.6717,1348019.2991,1159.0041,0.5283,0.5597,0.5753,0.989
lr,Linear Regression,908.3328,1464613.6,1209.1848,0.4872,0.7498,0.9998,0.027
lasso,Lasso Regression,908.2335,1464614.225,1209.1866,0.4872,0.7479,0.9992,0.017
ridge,Ridge Regression,908.3195,1464612.3625,1209.1843,0.4872,0.7502,0.9997,0.017
llar,Lasso Least Angle Regression,908.2334,1464614.0,1209.1865,0.4872,0.7479,0.9992,0.016
br,Bayesian Ridge,908.1841,1464619.8125,1209.1872,0.4872,0.748,0.9991,0.018
lar,Least Angle Regression,911.1156,1470306.4625,1211.5346,0.4852,0.7546,1.0084,0.015
ada,AdaBoost Regressor,939.1972,1472254.662,1212.1507,0.4826,0.8311,1.2887,0.1


Processing:   0%|          | 0/77 [00:00<?, ?it/s]

In [43]:
# Hyperparameter search for the selected model: Bayesian search from
# scikit-optimize, scored on R2
tuning_options = {
    'optimize': 'R2',
    'search_library': 'scikit-optimize',
    'search_algorithm': 'bayesian',
}
tuned_model = tune_model(best_model, **tuning_options)

Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits


In [44]:
# Finalize the tuned model, then persist it under the name
# 'bigmart_best_model'
final_model = finalize_model(tuned_model)
save_model(final_model, model_name='bigmart_best_model')

print("✅ Model trained and saved as 'bigmart_best_model.pkl'")

Transformation Pipeline and Model Successfully Saved
✅ Model trained and saved as 'bigmart_best_model.pkl'
