In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import polars as pl

# Apply matplotlib's 'dark_background' style sheet to the figures drawn in this notebook.
plt.style.use('dark_background')

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

## **Importing Data**

In [None]:
# Directory that holds the competition CSV files.
BASE = '/kaggle/input/store-sales-time-series-forecasting/'

# Read each CSV into its own DataFrame; the names on the left line up
# one-to-one with the file stems on the right.
(train, test, oil, stores,
 transactions, holidays_events, sample_submission) = (
    pd.read_csv(f"{BASE}{stem}.csv")
    for stem in (
        "train", "test", "oil", "stores",
        "transactions", "holidays_events", "sample_submission",
    )
)


In [None]:
# Mark every row with its origin (0 = train, 1 = test) before stacking, so the
# two sets can be told apart inside the combined frame.
train['test'], test['test'] = 0, 1

# Row-wise stack: train rows first, then test rows; original row labels are kept.
df = pd.concat([train, test])

In [None]:
# Display the combined train + test frame.
df

In [None]:
# Fraction of missing values in each column.
df.isna().mean()

In [None]:
# Data type of each column.
df.dtypes

## **Data Analysis and Visualization**

In [None]:
def datetime(df):
    """Parse the 'date' column and add calendar features derived from it.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'date' column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The same object, with 'date' converted to datetime and the columns
        'year', 'month', 'day', 'day_of_week', 'day_name', 'quarter' and
        'is_leap_year' added in that order.
    """
    df['date'] = pd.to_datetime(df['date'])
    stamps = df['date'].dt

    # Insertion order of this mapping fixes the order of the new columns.
    derived = {
        'year': stamps.year,
        'month': stamps.month,
        'day': stamps.day,
        'day_of_week': stamps.day_of_week,
        'day_name': stamps.day_name(),
        'quarter': stamps.quarter,
        'is_leap_year': stamps.is_leap_year,
    }
    for column_name, values in derived.items():
        df[column_name] = values
    return df

In [None]:
# Convert 'date' to datetime and add the calendar feature columns.
df = datetime(df)

In [None]:
# Preview the first rows, now including the calendar columns.
df.head()

In [None]:
# Calendar features whose total sales are plotted, one panel each.
grouping_columns = ['year', 'month', 'day', 'day_name', 'quarter', 'day_of_week']

# 3 x 2 grid, flattened so the panels can be paired with the columns directly.
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 10))
axes = axes.flatten()

for panel, column in zip(axes, grouping_columns):
    # Total sales for each distinct value of the current calendar feature.
    grouped_data = df.groupby(column)['sales'].sum().reset_index()
    sns.lineplot(data=grouped_data, x=column, y='sales', ax=panel)

plt.tight_layout()
plt.show()

In [None]:
# Total sales per product family.
family_sums = df.groupby('family')['sales'].sum()
grouped_data = family_sums.reset_index()

# lineplot draws on (and returns) the current axes.
ax = sns.lineplot(data=grouped_data, x='family', y='sales')

# Pin the tick positions first, then rotate the labels so the family names stay readable.
tick_positions = ax.get_xticks()
ax.set_xticks(tick_positions)
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=90)
plt.show()

In [None]:
# Yearly sales totals, largest first.
yearly_sales = df.groupby('year')['sales'].sum().reset_index()
yearly_sales.sort_values(by='sales', ascending=False)

In [None]:
# Total sales per store, as a two-column frame (store_nbr, sales).
store_sums = df.groupby('store_nbr')['sales'].sum()
best_store = store_sums.reset_index()

# Same table ordered from the highest-selling store down.
bs_sorted = best_store.sort_values('sales', ascending=False)

sns.lineplot(x='store_nbr', y='sales', data=bs_sorted)

In [None]:
# Attach the store attributes (city, state, ...) to the per-store sales totals.
best_city = pd.merge(bs_sorted, stores, on='store_nbr', how='left')

# 'ratio' = share of rows in best_city that belong to the row's city.
# Computed in one vectorised step so the column is float from the start;
# writing floats into an int-initialised column inside a Python loop
# triggers pandas' incompatible-dtype FutureWarning on recent versions.
city_share = best_city['city'].value_counts() / len(best_city)
# Rows with a missing city get 0.0, matching the previous default value.
best_city['ratio'] = best_city['city'].map(city_share).fillna(0.0)

best_city.head()

In [None]:
# The five rows of best_city with the smallest 'sales' values.
best_city.sort_values(by='sales', ascending=True).head()

In [None]:
# Total sales per state (Series indexed by state name).
data = best_city['sales'].groupby(best_city['state']).sum()

plt.figure(figsize=(15, 15))
# Each wedge is labelled with its state and its percentage share to one decimal.
plt.pie(x=data, labels=data.index, startangle=90, autopct='%1.1f%%')
plt.title('Sales Distribution by State')
plt.show()

In [None]:
# For each city, take the first 'ratio' entry of the group and divide it by the
# city's summed sales.
# NOTE(review): the left operand comes from apply() returning one-row slices, so
# it carries the original row label in addition to the city, while the right
# operand is indexed by city only - confirm the alignment gives the intended
# per-city values.
best_city.groupby('city')['ratio'].apply(lambda x: x[0:1])/best_city.groupby('city')['sales'].sum()

In [None]:
# Per-city store share divided by the city's total sales.
# Previously this quotient was computed and thrown away, and the chart below
# re-plotted whatever `data` an earlier cell had left behind (the per-state
# totals) under a per-city title. The result is now kept and plotted.
# 'ratio' is constant within a city, so first() yields that value on a flat
# city index that aligns directly with the per-city sales totals.
city_store_share = best_city.groupby('city')['ratio'].first()
city_total_sales = best_city.groupby('city')['sales'].sum()
city_ratio_per_sales = city_store_share / city_total_sales

plt.figure(figsize=(15, 15))
plt.pie(
    city_ratio_per_sales,
    labels=city_ratio_per_sales.index,
    autopct='%1.1f%%',
    startangle=90,
)

plt.title('Sales Distribution by City With Ratio')

plt.show()

In [None]:
# Total sales per city (Series indexed by city name).
data = best_city['sales'].groupby(best_city['city']).sum()

plt.figure(figsize=(15, 15))
# Each wedge is labelled with its city and its percentage share to one decimal.
plt.pie(x=data, labels=data.index, startangle=90, autopct='%1.1f%%')
plt.title('Sales Distribution by City')
plt.show()

In [None]:
# Add the store attributes to every row; the left join keeps all rows of df.
df = df.merge(stores, on='store_nbr', how='left')

In [None]:
# Preview the frame after the store columns were merged in.
df.head()

In [None]:
# Ten clusters with the lowest total sales, smallest first.
cluster_sales = df.groupby('cluster')['sales'].sum().reset_index()
cluster_sales.sort_values(by='sales', ascending=True).head(10)

In [None]:
# Display the holidays/events table before it is merged into df.
holidays_events

In [None]:
# Bring the join key to datetime so it matches df['date'].
holidays_events['date'] = pd.to_datetime(holidays_events["date"])

# If the holiday table lists more than one event for a date, a plain left join
# emits one output row per event and silently duplicates the sales rows of that
# day (inflating every later sum and the training set). Keep a single event per
# date so the join cannot change the number of rows in df.
# NOTE(review): keep='first' is an arbitrary tie-break; replace it with an
# explicit priority (e.g. by locale) if one kind of event should win.
holidays_by_date = holidays_events.drop_duplicates(subset='date', keep='first')

# validate= makes pandas raise if the right side is ever non-unique on 'date'.
df = pd.merge(df, holidays_by_date, how='left', on='date', validate='many_to_one')

In [None]:
# Keep only the rows that have a 'locale' value, i.e. rows that matched a holiday/event.
has_locale = df['locale'].notna()
df_non_nan = df[has_locale]
df_non_nan.head()

In [None]:
# Row count per value of 'type_y' (the `type` column contributed by holidays_events in the merge).
df_non_nan['type_y'].value_counts()

In [None]:
# Row count per 'locale' value.
df_non_nan['locale'].value_counts()

In [None]:
# Total sales for every ('type_y', 'family') combination.
df_non_nan.groupby(['type_y', 'family'])['sales'].sum()

In [None]:
# Total sales per 'type_y' value, as a two-column frame (type_y, sales).
grouped_data = df_non_nan.groupby(['type_y'])['sales'].sum().reset_index()

# One bar per group, placed at consecutive integer positions.
bar_positions = range(len(grouped_data))

plt.figure(figsize=(10, 10))
plt.bar(bar_positions, grouped_data['sales'])
plt.title('Total Sales by Group')
plt.xlabel('Group')
plt.ylabel('Total Sales')
# Label each bar with its group name, rotated so long names do not overlap.
plt.xticks(bar_positions, grouped_data['type_y'], rotation=90, fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
# Total sales per 'locale' value, as a two-column frame (locale, sales).
grouped_data = df_non_nan.groupby(['locale'])['sales'].sum().reset_index()

# One bar per group, placed at consecutive integer positions.
bar_positions = range(len(grouped_data))

plt.figure(figsize=(10, 10))
plt.bar(bar_positions, grouped_data['sales'])
plt.title('Total Sales by Group')
plt.xlabel('Group')
plt.ylabel('Total Sales')
# Label each bar with its group name, rotated so long names do not overlap.
plt.xticks(bar_positions, grouped_data['locale'], rotation=90, fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
# Total sales per 'locale', restricted to rows whose 'transferred' flag is True.
df_non_nan[df_non_nan['transferred'] == True].groupby('locale')['sales'].sum()

## **Feature Engineering**

In [None]:
# Rows without a holiday match have no value in these columns; use 0 as the
# placeholder so they form their own category when one-hot encoded.
for holiday_column in ('type_y', 'locale'):
    df[holiday_column] = df[holiday_column].fillna(0)

# Missing 'transferred' becomes 0.0 and the whole column is stored as float.
df['transferred'] = df['transferred'].fillna(0.0).astype(float)

# Columns replaced by one indicator column per distinct value.
categorical_features = [
    'family', 'day_name', 'city', 'state',
    'type_x', 'transferred', 'locale', 'type_y',
]
df = pd.get_dummies(df, columns=categorical_features)

# Store the leap-year flag as an integer.
df['is_leap_year'] = df['is_leap_year'].astype(int)

In [None]:
# Preview the frame after one-hot encoding.
df.head()

In [None]:
from sklearn.decomposition import PCA


# One-hot location columns (city and state indicators) that are compressed
# into a handful of principal components.
encoded_cities = [
    'city_Ambato', 'city_Babahoyo', 'city_Cayambe', 'city_Cuenca',
    'city_Daule', 'city_El Carmen', 'city_Esmeraldas', 'city_Guaranda', 'city_Guayaquil', 'city_Ibarra',
    'city_Latacunga', 'city_Libertad', 'city_Loja', 'city_Machala', 'city_Manta', 'city_Playas',
    'city_Puyo', 'city_Quevedo', 'city_Quito', 'city_Riobamba', 'city_Salinas', 'city_Santo Domingo',
    'state_Azuay', 'state_Bolivar', 'state_Chimborazo', 'state_Cotopaxi', 'state_El Oro', 'state_Esmeraldas',
    'state_Guayas', 'state_Imbabura', 'state_Loja', 'state_Los Rios', 'state_Manabi', 'state_Pastaza',
    'state_Pichincha', 'state_Santa Elena', 'state_Santo Domingo de los Tsachilas', 'state_Tungurahua'
]

num_components = 5
# A fixed seed keeps the projection reproducible when scikit-learn selects a
# randomized SVD solver for an input of this size.
pca_model = PCA(n_components=num_components, random_state=42)

# Fit the projection and project the same rows in a single pass.
transformed_data = pca_model.fit_transform(df[encoded_cities])

# Column names follow num_components instead of a hard-coded range(1, 6), so
# changing the component count cannot desynchronise names and data.
cols = [f'place_pca_{i}' for i in range(1, num_components + 1)]

# DataFrame.drop returns a new frame; without assigning the result back the
# one-hot location columns were never actually removed.
df = df.drop(columns=encoded_cities)

df[cols] = transformed_data

## **BaseModel**

In [None]:
# List the column names available for modelling.
df.columns

In [None]:
# Model inputs: store/promotion/calendar columns, the one-hot product-family
# indicators, the five location PCA components, and the one-hot indicators for
# type_x, transferred, locale and type_y. Every name must match a column of df
# exactly; the '_0' / '_0.0' entries are the placeholder categories created by
# filling missing values with 0 before one-hot encoding.
features = [
'store_nbr', 'onpromotion', 'year', 'month', 'day', 'day_of_week', 'quarter', 'is_leap_year', 
'cluster', 'family_AUTOMOTIVE', 'family_BABY CARE', 'family_BEAUTY',
'family_BEVERAGES', 'family_BOOKS', 'family_BREAD/BAKERY', 'family_CELEBRATION', 'family_CLEANING',
'family_DAIRY', 'family_DELI', 'family_EGGS', 'family_FROZEN FOODS', 'family_GROCERY I',
'family_GROCERY II', 'family_HARDWARE', 'family_HOME AND KITCHEN I', 'family_HOME AND KITCHEN II',
'family_HOME APPLIANCES', 'family_HOME CARE', 'family_LADIESWEAR', 'family_LAWN AND GARDEN',
'family_LINGERIE', 'family_LIQUOR,WINE,BEER', 'family_MAGAZINES', 'family_MEATS',
'family_PERSONAL CARE', 'family_PET SUPPLIES', 'family_PLAYERS AND ELECTRONICS', 'family_POULTRY',
'family_PREPARED FOODS', 'family_PRODUCE', 'family_SCHOOL AND OFFICE SUPPLIES', 'family_SEAFOOD',
'place_pca_1', 'place_pca_2', 'place_pca_3',
'place_pca_4', 'place_pca_5', 'type_x_A', 'type_x_B', 'type_x_C', 'type_x_D', 
'type_x_E', 'transferred_0.0', 'transferred_1.0','locale_0', 'locale_Local', 
'locale_National', 'locale_Regional', 'type_y_0', 'type_y_Additional',
'type_y_Bridge', 'type_y_Event', 'type_y_Holiday', 'type_y_Transfer', 'type_y_Work Day'
]

# Column the models are trained to predict.
target = 'sales'

In [None]:
from sklearn.model_selection import train_test_split

# Separate the combined frame again using the 0/1 'test' flag. Plain pandas
# boolean masks replace the former pandas -> polars -> pandas round-trip, which
# copied the whole frame several times just to filter on one column.
# reset_index gives both parts a fresh 0..n-1 index, as the round-trip did.
train = df.loc[df['test'] == 0].reset_index(drop=True)
test = df.loc[df['test'] == 1].reset_index(drop=True)

# Time-based hold-out: fit on every year except 2017, validate on 2017.
# The mask is built once instead of being recomputed for each of the four outputs.
is_holdout = train['year'] == 2017
X_train, y_train = train.loc[~is_holdout, features], train.loc[~is_holdout, target]
X_test, y_test = train.loc[is_holdout, features], train.loc[is_holdout, target]

In [None]:
from xgboost import XGBRegressor
import catboost as cb
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_log_error

model_cb = cb.CatBoostRegressor(verbose=False)
model_xgb = XGBRegressor(verbosity=0)

# Each pipeline owns its scaler. Pipeline does not clone its steps, so a single
# shared StandardScaler instance would be refit by whichever pipeline is fitted
# last and silently used by both.
pipeline_cb = Pipeline([
    ('scaler', StandardScaler()),
    ('model_cb', model_cb)
])

pipeline_xgb = Pipeline([
    ('scaler', StandardScaler()),
    ('model_xgb', model_xgb)
])

pipeline_cb.fit(X_train, y_train)
pipeline_xgb.fit(X_train, y_train)

y_pred_cb = pipeline_cb.predict(X_test)
y_pred_xgb = pipeline_xgb.predict(X_test)

# Equal-weight blend of the two models. Negative predictions are clipped to 0:
# the log error is undefined below zero, and abs() would turn a prediction of
# -50 into +50 instead of "no sales".
y_pred_combined = np.clip(0.5 * y_pred_cb + 0.5 * y_pred_xgb, 0, None)

# Take the square root explicitly: the `squared=False` argument of
# mean_squared_log_error is deprecated and removed in newer scikit-learn
# releases. The value reported is therefore the ROOT mean squared log error.
rmsle = np.sqrt(mean_squared_log_error(abs(y_test), y_pred_combined))
print("Root Mean Squared Log Error:", rmsle)

## **Submission**

In [None]:
# Equal-weight blend of both fitted pipelines on the competition test rows.
test_features = test[features]
test_pred = 0.5 * pipeline_cb.predict(test_features) + 0.5 * pipeline_xgb.predict(test_features)

# The predictions are written positionally, so the row counts have to agree.
assert len(test_pred) == len(sample_submission), (
    f"expected {len(sample_submission)} predictions, got {len(test_pred)}"
)

# Sales cannot be negative, and a log-based error metric is undefined for
# negative values, so clip the blend at zero before writing it out.
sample_submission['sales'] = np.clip(test_pred, 0, None)

In [None]:
# Display the submission frame with the predicted 'sales' column filled in.
sample_submission

In [None]:
# Write the submission to 'submission.csv', leaving out the DataFrame index.
sample_submission.to_csv('submission.csv', index=False)