In [1]:
!pip install catboost

import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from google.colab import drive

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [2]:
drive.mount('/content/drive')

train_df = pd.read_csv('/content/drive/MyDrive/ML_dataSets_Colab/Sales_Forcasting/Train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/ML_dataSets_Colab/Sales_Forcasting/Test.csv')

Mounted at /content/drive


In [3]:
# Replace empty strings with NaN
train_df.replace('', np.nan, inplace=True)
test_df.replace('', np.nan, inplace=True)

# Standardize Item_Fat_Content
fat_content_map = {'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'}
train_df['Item_Fat_Content'] = train_df['Item_Fat_Content'].replace(fat_content_map)
test_df['Item_Fat_Content'] = test_df['Item_Fat_Content'].replace(fat_content_map)

# Impute missing Item_Weight using mapping from Item_Identifier
item_weight_map = train_df.groupby('Item_Identifier')['Item_Weight'].mean().to_dict()
overall_mean_weight = train_df['Item_Weight'].mean()

def impute_item_weight(row):
    if pd.isna(row['Item_Weight']):
        return item_weight_map.get(row['Item_Identifier'], overall_mean_weight)
    else:
        return row['Item_Weight']

train_df['Item_Weight'] = train_df.apply(impute_item_weight, axis=1)
test_df['Item_Weight'] = test_df.apply(impute_item_weight, axis=1)

# Impute missing Outlet_Size with mode from training data
outlet_size_mode = train_df['Outlet_Size'].mode()[0]
train_df['Outlet_Size'].fillna(outlet_size_mode, inplace=True)
test_df['Outlet_Size'].fillna(outlet_size_mode, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Outlet_Size'].fillna(outlet_size_mode, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['Outlet_Size'].fillna(outlet_size_mode, inplace=True)


In [4]:
# Define features and target
features = [col for col in train_df.columns if col not in ['Item_Identifier', 'Item_Outlet_Sales']]
target = 'Item_Outlet_Sales'

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_df[features], train_df[target], test_size=0.2, random_state=42)

print(X_train.head())

      Item_Weight Item_Fat_Content  Item_Visibility              Item_Type  \
549         9.500          Regular         0.035206  Fruits and Vegetables   
7757       18.000          Low Fat         0.047473              Household   
764        17.600          Regular         0.076122                   Meat   
6867        8.325          Low Fat         0.029845  Fruits and Vegetables   
2716       12.850          Low Fat         0.137228            Snack Foods   

      Item_MRP Outlet_Identifier  Outlet_Establishment_Year Outlet_Size  \
549   171.3448            OUT049                       1999      Medium   
7757  170.5422            OUT045                       2002      Medium   
764   111.7202            OUT046                       1997       Small   
6867   41.6138            OUT045                       2002      Medium   
2716  155.5630            OUT046                       1997       Small   

     Outlet_Location_Type        Outlet_Type  
549                Tier 1  Superm

In [5]:
# Define categorical features
cat_features = ['Item_Fat_Content', 'Item_Type', 'Outlet_Identifier',
                'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']

# Initialize and train the model
model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.03,
    depth=6,
    cat_features=cat_features,
    verbose=0
)
model.fit(X_train, y_train)

# Make predictions on validation set
val_predictions = model.predict(X_val)

from sklearn.metrics import mean_squared_error, r2_score

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_val, val_predictions))
print(f'Validation RMSE: {rmse}')

r2 = r2_score(y_val, val_predictions)
print(f'Validation R² Score: {r2:.4f}')

Validation RMSE: 1029.2242794418664
Validation R² Score: 0.6103


In [15]:
# Prepare entire training data
X_full_train = train_df[features]
y_full_train = train_df[target]

# Train the model on entire training data
model.fit(X_full_train, y_full_train)

<catboost.core.CatBoostRegressor at 0x7e79aadaa890>

In [16]:
x_test = test_df[features]
test_predictions = model.predict(x_test)

In [17]:
# Create submission DataFrame
submission = test_df[['Item_Identifier', 'Outlet_Identifier']].copy()
submission['Item_Outlet_Sales'] = test_predictions

# Save to CSV
submission.to_csv('/content/drive/MyDrive/ML_dataSets_Colab/Sales_Forcasting/submission.csv', index=False)

# Optional: display first few rows
print(submission.head())

  Item_Identifier Outlet_Identifier  Item_Outlet_Sales
0           FDW58            OUT049        1775.349010
1           FDW14            OUT017        1416.955075
2           NCN55            OUT010         592.362227
3           FDQ58            OUT017        2563.852263
4           FDY38            OUT027        6167.378643


In [20]:
# Save your trained Keras model
model.save_model("catboost_sales_model.cbm")

# Download it to your machine
from google.colab import files
files.download("catboost_sales_model.cbm")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [22]:
import pickle
# Save tokenizer
with open("feature_names.pkl", "wb") as f:
    pickle.dump(features, f)

# Download tokenizer
files.download("feature_names.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>