In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

In [24]:
# Load data
train_df = pd.read_csv("train_v9rqX0R.csv")
test_df = pd.read_csv("test_AbJTz2l.csv")

In [25]:
train_df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [26]:
train_df.isnull().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [5]:
# Add placeholder for target in test set
test_df['Item_Outlet_Sales'] = None

In [6]:
# Add source flags
train_df['source'] = 'train'
test_df['source'] = 'test'

In [7]:
# Combine train and test for uniform preprocessing
combined_df = pd.concat([train_df, test_df], ignore_index=True)


  combined_df = pd.concat([train_df, test_df], ignore_index=True)


In [8]:
# Fix categorical inconsistencies
combined_df['Item_Fat_Content'] = combined_df['Item_Fat_Content'].replace({
    'low fat': 'Low Fat',
    'LF': 'Low Fat',
    'reg': 'Regular'
})


In [9]:
# Bivariate Imputation
combined_df['Item_Weight'] = combined_df.groupby('Item_Type')['Item_Weight'].transform(
    lambda x: x.fillna(x.mean())
)

combined_df['Outlet_Size'] = combined_df.groupby(
    ['Outlet_Type', 'Outlet_Location_Type']
)['Outlet_Size'].transform(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else 'Small'))


In [10]:
# Impute any remaining missing numeric values (safety)
num_cols = combined_df.select_dtypes(include=['float64', 'int64']).columns
num_imputer = SimpleImputer(strategy='mean')
combined_df[num_cols] = num_imputer.fit_transform(combined_df[num_cols])


In [11]:
# Impute any remaining missing categorical values (safety)
cat_cols = combined_df.select_dtypes(include='object').columns
cat_imputer = SimpleImputer(strategy='most_frequent')
combined_df[cat_cols] = cat_imputer.fit_transform(combined_df[cat_cols])


In [12]:
# Label Encoding (excluding ID/source)
categorical_cols = combined_df.select_dtypes(include='object').columns.tolist()
categorical_cols = [col for col in categorical_cols if col not in ['Item_Identifier', 'source']]
le = LabelEncoder()
for col in categorical_cols:
    combined_df[col] = le.fit_transform(combined_df[col])


In [13]:

# Split back to train and test
train_df = combined_df[combined_df['source'] == 'train'].drop('source', axis=1)
test_df = combined_df[combined_df['source'] == 'test'].drop(['source', 'Item_Outlet_Sales'], axis=1)


In [14]:
# Save processed data
train_df.to_csv("processed_train.csv", index=False)
test_df.to_csv("processed_test.csv", index=False)

print("✅ Preprocessing Complete!")

✅ Preprocessing Complete!


In [15]:
from pycaret.regression import setup, compare_models, tune_model, finalize_model, save_model, predict_model
import pandas as pd


In [16]:
df = pd.read_csv("processed_train.csv")
df.shape


(8523, 12)

In [17]:
s = setup(
    data=df,
    target='Item_Outlet_Sales',
    session_id=123,
    verbose=False  # optional: can still be used in v3 to reduce logs
)



In [18]:
best = compare_models()


In [19]:
tuned = tune_model(best, 
                   search_library='scikit-optimize', 
                   search_algorithm='bayesian', 
                   optimize='R2')


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,874.4609,1574126.9787,1254.6422,0.5334,0.617,0.6636
1,787.6962,1246728.8145,1116.5701,0.5498,0.6638,0.717
2,839.1641,1623746.1539,1274.263,0.4016,0.635,0.6487
3,883.665,1631786.5504,1277.414,0.4782,0.6157,0.6609
4,819.053,1424220.1182,1193.4069,0.4907,0.6412,0.6197
5,880.7474,1607881.5876,1268.0227,0.4283,0.6346,0.6845
6,861.6818,1507967.1578,1227.9931,0.4506,0.6246,0.6096
7,862.0253,1473455.3209,1213.8597,0.4515,0.624,0.6795
8,871.406,1406539.1447,1185.976,0.4986,0.6375,0.702
9,824.9558,1333968.992,1154.9758,0.5148,0.5887,0.6006


Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits


In [20]:
final_model = finalize_model(tuned)
save_model(final_model, 'bigmart_best_model')


Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['Item_Weight', 'Item_Fat_Content',
                                              'Item_Visibility', 'Item_Type',
                                              'Item_MRP', 'Outlet_Identifier',
                                              'Outlet_Establishment_Year',
                                              'Outlet_Size',
                                              'Outlet_Location_Type',
                                              'Outlet_Type'],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=['...
                  TransformerWrapper(include=['Item_Identifier'],
                                     transformer=TargetEncoder(cols=['Item_Identifier'],
                                                               handle_missing='return_nan'))),
     