In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# CatBoost

!pip install catboost
from catboost import CatBoostRegressor


# 1. LOAD DATA

train = pd.read_csv('C:/Users/somas/Documents/BigMart Sales Prediction/train_v9rqX0R.csv')
test_original = pd.read_csv('C:/Users/somas/Documents/BigMart Sales Prediction/test_AbJTz2l.csv')
test = test_original.copy()  # We'll transform 'test' but keep 'test_original' for final submission


# 2. DATA CLEANING

# Missing values
train['Item_Weight'].fillna(train['Item_Weight'].median(), inplace=True)
test['Item_Weight'].fillna(test['Item_Weight'].median(), inplace=True)

train['Outlet_Size'].fillna('Unknown', inplace=True)
test['Outlet_Size'].fillna('Unknown', inplace=True)

# Standardize Item_Fat_Content
train['Item_Fat_Content'].replace(
    {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}, 
    inplace=True
)
test['Item_Fat_Content'].replace(
    {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}, 
    inplace=True
)

# Zero Item_Visibility => median by Item_Identifier
visibility_median = train.groupby('Item_Identifier')['Item_Visibility'].median()
train.loc[train['Item_Visibility'] == 0, 'Item_Visibility'] = \
    train['Item_Identifier'].map(visibility_median)
test.loc[test['Item_Visibility'] == 0, 'Item_Visibility'] = \
    test['Item_Identifier'].map(visibility_median)


# 3. FEATURE ENGINEERING

# Outlet Age
train['Outlet_Age'] = 2023 - train['Outlet_Establishment_Year']
test['Outlet_Age'] = 2023 - test['Outlet_Establishment_Year']

train.drop('Outlet_Establishment_Year', axis=1, inplace=True)
test.drop('Outlet_Establishment_Year', axis=1, inplace=True)

# ============ Aggregator Features (mean sales) ============
item_mean_sales = train.groupby('Item_Identifier')['Item_Outlet_Sales'].mean().reset_index()
item_mean_sales.columns = ['Item_Identifier','Item_Mean_Sales']

outlet_mean_sales = train.groupby('Outlet_Identifier')['Item_Outlet_Sales'].mean().reset_index()
outlet_mean_sales.columns = ['Outlet_Identifier','Outlet_Mean_Sales']

train = pd.merge(train, item_mean_sales, on='Item_Identifier', how='left')
train = pd.merge(train, outlet_mean_sales, on='Outlet_Identifier', how='left')

test = pd.merge(test, item_mean_sales, on='Item_Identifier', how='left')
test = pd.merge(test, outlet_mean_sales, on='Outlet_Identifier', how='left')

# Fill aggregator nans with global mean if mismatch
global_mean = train['Item_Outlet_Sales'].mean()
test['Item_Mean_Sales'].fillna(global_mean, inplace=True)
test['Outlet_Mean_Sales'].fillna(global_mean, inplace=True)


# 4. LOG TRANSFORM THE TARGET

train['Log_Sales'] = np.log1p(train['Item_Outlet_Sales'])


# 5. DEFINE FEATURES

# We'll keep these categorical columns as strings for CatBoost
cat_cols = [
    'Item_Identifier','Item_Fat_Content','Item_Type',
    'Outlet_Identifier','Outlet_Size','Outlet_Location_Type','Outlet_Type'
]

# Drop the old target + ID columns not needed
# But keep aggregator features & numeric
feature_cols = [
    'Item_Identifier','Item_Weight','Item_Fat_Content','Item_Visibility',
    'Item_Type','Item_MRP','Outlet_Identifier','Outlet_Size',
    'Outlet_Location_Type','Outlet_Type','Outlet_Age',
    'Item_Mean_Sales','Outlet_Mean_Sales'
]

X = train[feature_cols]
y = train['Log_Sales']


# 6. TRAIN-VALID SPLIT

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)


# 7. SCALING SELECT NUMERIC FEATURES

num_cols = [
    'Item_Weight','Item_Visibility','Item_MRP','Outlet_Age',
    'Item_Mean_Sales','Outlet_Mean_Sales'
]

scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_valid[num_cols] = scaler.transform(X_valid[num_cols])


# 8. CATBOOST MODEL

# We'll define cat_features indices for the columns that are categorical in X
cat_features_indices = [X_train.columns.get_loc(c) for c in cat_cols]

cat_model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=8,
    random_seed=42,
    cat_features=cat_features_indices,
    verbose=100
)

cat_model.fit(X_train, y_train)

# Predict in log space
y_valid_log_preds = cat_model.predict(X_valid)

# Convert back to normal space
y_valid_preds = np.expm1(y_valid_log_preds)
y_valid_true  = np.expm1(y_valid)

rmse = np.sqrt(mean_squared_error(y_valid_true, y_valid_preds))
print(f'[CatBoost] Validation RMSE: {rmse:.4f}')


# 9. RETRAIN ON FULL DATA & GENERATE SUBMISSION

# Scale entire dataset's numeric features
X[num_cols] = scaler.fit_transform(X[num_cols])

cat_model_full = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=8,
    random_seed=42,
    cat_features=cat_features_indices,
    verbose=100
)
cat_model_full.fit(X, y)

# Prepare test data
test[num_cols] = scaler.transform(test[num_cols])  # scale numeric
test_preds_log = cat_model_full.predict(test[feature_cols])
test_preds = np.expm1(test_preds_log)


# 10. CREATE SUBMISSION

submission = test_original[['Item_Identifier', 'Outlet_Identifier']].copy()
submission['Item_Outlet_Sales'] = test_preds

# Save
project_dir = os.getcwd()
submission_file_path = os.path.join(project_dir, "CatBoost_BigMart_Sales.csv")

submission.to_csv(submission_file_path, index=False)
print(f'✅ Submission file saved at: {submission_file_path}')




Collecting catboost
  Downloading catboost-1.2.7-cp312-cp312-win_amd64.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Downloading catboost-1.2.7-cp312-cp312-win_amd64.whl (101.7 MB)
   ---------------------------------------- 0.0/101.7 MB ? eta -:--:--
   ---------------------------------------- 0.8/101.7 MB 6.6 MB/s eta 0:00:16
   - -------------------------------------- 2.9/101.7 MB 8.4 MB/s eta 0:00:12
   - -------------------------------------- 4.5/101.7 MB 8.4 MB/s eta 0:00:12
   -- ------------------------------------- 6.6/101.7 MB 8.9 MB/s eta 0:00:11
   -- ------------------------------------- 6.8/101.7 MB 7.1 MB/s eta 0:00:14
   -- ------------------------------------- 7.1/101.7 MB 6.1 MB/s eta 0:00:16
   -- ------------------------------------- 7.3/101.7 MB 5.3 MB/s eta 0:00:18
   --- ------------------------------------ 7.9/101.7 MB 5.0 MB/s eta 0:00:19
   --- ------------------------------------ 8.1/

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['Item_Weight'].fillna(train['Item_Weight'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['Item_Weight'].fillna(test['Item_Weight'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the 

0:	learn: 0.9758427	total: 235ms	remaining: 3m 55s
100:	learn: 0.4708302	total: 8.21s	remaining: 1m 13s
200:	learn: 0.4547424	total: 16.2s	remaining: 1m 4s
300:	learn: 0.4397046	total: 25.1s	remaining: 58.2s
400:	learn: 0.4257195	total: 33.7s	remaining: 50.4s
500:	learn: 0.4104213	total: 42.6s	remaining: 42.5s
600:	learn: 0.3965969	total: 51.3s	remaining: 34.1s
700:	learn: 0.3839023	total: 1m	remaining: 25.6s
800:	learn: 0.3727576	total: 1m 8s	remaining: 17.1s
900:	learn: 0.3624139	total: 1m 17s	remaining: 8.55s
999:	learn: 0.3518672	total: 1m 25s	remaining: 0us
[CatBoost] Validation RMSE: 969.5584


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[num_cols] = scaler.fit_transform(X[num_cols])


0:	learn: 0.9797500	total: 68.1ms	remaining: 1m 8s
100:	learn: 0.4718061	total: 7.31s	remaining: 1m 5s
200:	learn: 0.4548956	total: 14.4s	remaining: 57.2s
300:	learn: 0.4401419	total: 22s	remaining: 51.2s
400:	learn: 0.4257114	total: 29.6s	remaining: 44.1s
500:	learn: 0.4137041	total: 36.5s	remaining: 36.3s
600:	learn: 0.4021009	total: 43.4s	remaining: 28.8s
700:	learn: 0.3913208	total: 50.3s	remaining: 21.5s
800:	learn: 0.3810425	total: 57.2s	remaining: 14.2s
900:	learn: 0.3708054	total: 1m 4s	remaining: 7.04s
999:	learn: 0.3609914	total: 1m 10s	remaining: 0us
✅ Submission file saved at: C:\Users\somas\Documents\BigMart Sales Prediction\CatBoost_BigMart_Sales.csv


"\nfrom sklearn.model_selection import RepeatedKFold, cross_val_score\n\nrkf = RepeatedKFold(n_splits=5, n_repeats=2, random_state=42)\n\ncat_model_cv = CatBoostRegressor(\n    iterations=500,\n    learning_rate=0.05,\n    depth=8,\n    random_seed=42,\n    cat_features=cat_features_indices,\n    verbose=0\n)\n\n# cross_val_score expects negative rmse, so we do:\nscores = cross_val_score(\n    cat_model_cv, X, y,\n    scoring='neg_root_mean_squared_error',\n    cv=rkf,\n    n_jobs=-1\n)\nmean_rmse = -scores.mean()\nprint(f'[CatBoost CV] RepeatedKFold Mean RMSE: {mean_rmse:.4f}')\n"