In [2]:
# Import necessary libraries
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
# ### Read CSV Files
# Load the training and testing datasets
df = pd.read_csv('big_mart_sales.csv')

# Limpando os dados

In [4]:
# ### Handle Missing Values in 'Item_Weight'
# The 'Item_Weight' column has missing values that need to be addressed.
# Group by 'Item_Identifier' and calculate descriptive statistics to find a suitable replacement for missing values
weight_statistics = df.groupby('Item_Identifier').agg(
    mean=('Item_Weight', 'mean'),
    std=('Item_Weight', 'std'),
    min=('Item_Weight', 'min'),
    q25=('Item_Weight', lambda x: x.quantile(0.25)),
    q50=('Item_Weight', 'median'),
    q75=('Item_Weight', lambda x: x.quantile(0.75)),
    max=('Item_Weight', 'max')
)

# ### Remove Irrelevant Records
# Identify items with a single record that have a null weight
# Since there are only 4 such records in over 8k, they will be removed
records_to_remove = ['FDN52', 'FDK57', 'FDE52', 'FDQ60']
df = df[~df['Item_Identifier'].isin(records_to_remove)]

# ### Fill Missing Values with Means
# Replace null values in 'Item_Weight' with the mean weight of each item
mean_weight_per_item = df.groupby('Item_Identifier')['Item_Weight'].transform('mean')
df['Item_Weight'] = df['Item_Weight'].fillna(mean_weight_per_item)

In [5]:
# ### Standardize 'Item_Fat_Content' Categories
# Unify categories that represent the same concept
category_replacements = {
    'LF': 'Low Fat',
    'low fat': 'Low Fat',
    'reg': 'Regular'
}
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(category_replacements)

In [6]:
# ### Consolidate Rare Categories in 'Item_Type'
# Replace categories with fewer than 200 records as they may be underrepresented
df.loc[df['Item_Type'].isin(['Starchy Foods', 'Breakfast', 'Seafood']), 'Item_Type'] = 'Others'

In [7]:
# There are items with visibility equal to 0, which seems strange, but I will not modify them.
# The item may not be displayed in the store, and these items are unique per outlet, so there isn't a suitable metric to replace this value.

# Some outlets don't have a size value, and this will remain null since we have no way of determining the actual size of the outlet.

# Tratando os dados

In [9]:
# Copy the cleaned data to preserve the raw dataset
processed_train_data = df.copy()

In [10]:
# Standardizing numerical features. While XGBoost is robust to different scales, standardization is applied for potential category importance analysis and model tuning.
# Standardization is also chosen over normalization because it is less sensitive to outliers.
scaler = StandardScaler()
processed_train_data[['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Establishment_Year']] = scaler.fit_transform(
    processed_train_data[['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Establishment_Year']])

In [11]:
# Encoding categorical columns since XGBoost does not natively handle categorical data.
# Label Encoding is applied to binary columns like 'Item_Fat_Content' and 'Outlet_Size' to capture their binary nature efficiently.
le = LabelEncoder()
processed_train_data['Item_Fat_Content'] = le.fit_transform(processed_train_data['Item_Fat_Content'])
processed_train_data['Outlet_Size'] = le.fit_transform(processed_train_data['Outlet_Size'])

In [12]:
# One-hot encoding is applied to 'Item_Type' to handle multiple categories, avoiding imposing any ordinal relationship between them. XGBoost works well with binary inputs.
processed_train_data = pd.get_dummies(processed_train_data, columns=['Item_Type'], drop_first=True)

In [13]:
# Simplifying 'Outlet_Location_Type' by removing the "Tier" text and converting to integer.
processed_train_data['Outlet_Location_Type'] = processed_train_data['Outlet_Location_Type'].str.replace('Tier ', '', regex=False).astype(int)

In [14]:
# One-hot encoding is also applied to 'Outlet_Type' to handle multiple categories, ensuring no ordinal relationship is inferred between them.
processed_train_data = pd.get_dummies(processed_train_data, columns=['Outlet_Type'], drop_first=True)

# Model

In [27]:
# Separando as categoria previsoras da que será prevista
# Removendo Identificadores pois geralmente não tem relação direta com as vendas e removendo a coluna q será prevista
x = processed_train_data.drop(['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales'], axis = 1)
# Isolando a coluna q será prevista
y = processed_train_data['Item_Outlet_Sales']

In [28]:
# Dividindo os dados em 80% para treino e 20% para teste
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=321)

In [29]:
# Treino do Modelo XGB
model = xgb.XGBRegressor()
model.fit(x_train, y_train, eval_set = [(x_train, y_train)])

[0]	validation_0-rmse:1411.56420
[1]	validation_0-rmse:1237.48781
[2]	validation_0-rmse:1135.92002
[3]	validation_0-rmse:1080.56084
[4]	validation_0-rmse:1047.38316
[5]	validation_0-rmse:1026.19549
[6]	validation_0-rmse:1012.79838
[7]	validation_0-rmse:998.81379
[8]	validation_0-rmse:987.32075
[9]	validation_0-rmse:980.60007
[10]	validation_0-rmse:971.95163
[11]	validation_0-rmse:958.71601
[12]	validation_0-rmse:953.59379
[13]	validation_0-rmse:945.28447
[14]	validation_0-rmse:938.28884
[15]	validation_0-rmse:929.63055
[16]	validation_0-rmse:920.36660
[17]	validation_0-rmse:914.02620
[18]	validation_0-rmse:910.37550
[19]	validation_0-rmse:907.31251
[20]	validation_0-rmse:899.94975
[21]	validation_0-rmse:892.48725
[22]	validation_0-rmse:886.67234
[23]	validation_0-rmse:880.55072
[24]	validation_0-rmse:874.20424
[25]	validation_0-rmse:869.73386
[26]	validation_0-rmse:861.12661
[27]	validation_0-rmse:853.29038
[28]	validation_0-rmse:847.53957
[29]	validation_0-rmse:843.16199
[30]	validati

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)

In [30]:
# Predição dos valores 
prediction = model.predict(x_test)

In [31]:
# Métricas do RMSE do modelo
rmse = np.sqrt(np.mean((y_test - prediction) ** 2))
rmse

1185.604463598415