In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/store-sales-time-series-forecasting/oil.csv
/kaggle/input/store-sales-time-series-forecasting/sample_submission.csv
/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv
/kaggle/input/store-sales-time-series-forecasting/stores.csv
/kaggle/input/store-sales-time-series-forecasting/train.csv
/kaggle/input/store-sales-time-series-forecasting/test.csv
/kaggle/input/store-sales-time-series-forecasting/transactions.csv


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

train = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/train.csv")
test = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/test.csv")
transactions = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/transactions.csv')
stores = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/stores.csv')
oil = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/oil.csv')
holidays_events = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv')

After importing all the required datasets it is time to focus on the preprocessing of the datasets.
Preprocessing of Datasets involves 2 major things:
1. Looking if it contains any null values
2. Conversion of Datatypes into forms that will be acceptible to the model for analysis.

In [3]:
train.isnull().sum()
test.isnull().sum()

id             0
date           0
store_nbr      0
family         0
onpromotion    0
dtype: int64

As of now there are no null values in the datasets so we are going to merge the datasets but we will have to convert categorical features into a form that the model accepts and convert data regarding dates to datetime.

In [4]:
train = pd.merge(train, stores, how='left', on='store_nbr')
train = pd.merge(train, transactions, how='left', on=['date', 'store_nbr'])
train = pd.merge(train, oil, how='left', on='date')
train = pd.merge(train, holidays_events, how='left', on='date')

test = pd.merge(test, stores, how='left', on='store_nbr')
test = pd.merge(test, transactions, how='left', on=['date', 'store_nbr'])
test = pd.merge(test, oil, how='left', on='date')
test = pd.merge(test, holidays_events, how='left', on='date')

Now we have merged the training and testing datasets with features of interests.
Now is time to feature engineer and convert date into day of week.

In [5]:
train['day_of_week'] = pd.to_datetime(train['date']).dt.dayofweek
test['day_of_week'] = pd.to_datetime(test['date']).dt.dayofweek

Now let's check if our data is skewed in any manner....

Now we see that a tail is getting formed in the datset....meaning that the data is skewed or concenterated on one timeframe...we are going to fix this by using a log(1+x) function...it provides weight to smaller values while at the dame time lowering the larger values.

In [6]:
train['log_sales'] = np.log1p(train['sales'])

Now that we have dealt with the skewness of the dataset we are going to do feature selection for training our model.
But as some of the features may have categorical values that the model won't accept we will use get dummies functions for converting these values.

In [7]:
features = ['store_nbr', 'onpromotion', 'day_of_week', 'cluster', 'dcoilwtico']

X_train = pd.get_dummies(train[features])
y_train = train['log_sales']

X_train = X_train.dropna()
y_train = y_train[X_train.index]  
X_test = pd.get_dummies(test[features])

X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

Now the model we are going to use is GradientBoostingRegressor model... And calculate the rmsle value as it is the criteria for this competition.

In [8]:
model = GradientBoostingRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_predict = model.predict(X_valid)

rmsle = np.sqrt(mean_squared_error(y_valid, y_predict))

Now a new problem that arises with encoding categorical data is that both the training and test dataset should have identical features and data count otherwise the encoding will be incorrect and leading to model rejecting the input.
So to counter this problem we will recreate each column missing in test dataset with all it's value set to zero...

In [9]:
features_train = ['onpromotion', 'cluster', 'dcoilwtico', 'store_nbr', 'day_of_week']

X_Test_encoded = pd.get_dummies(X_test)

missing_cols = set(X_train.columns) - set(X_Test_encoded.columns)
for col in missing_cols:
    X_Test_encoded[col] = 0

Now that there will be null values in test dataset we will use an imputer to fill those null values.

In [10]:
imputer = SimpleImputer()
X_Test_encoded = pd.DataFrame(imputer.fit_transform(X_Test_encoded), columns=X_Test_encoded.columns)
print(X_Test_encoded.isnull().sum())

store_nbr      0
onpromotion    0
day_of_week    0
cluster        0
dcoilwtico     0
dtype: int64


In [11]:
features_test = ['store_nbr', 'onpromotion', 'day_of_week', 'cluster', 'dcoilwtico']
X_Test = X_test[features]

# Predictions on the test set
y_predict_test = model.predict(X_Test_encoded)

In [12]:
submission = pd.DataFrame({
    'id': test['id'],
    'sales': y_predict_test
})
submission.to_csv('submission.csv', index=False)

In [13]:
display(submission.head())

Unnamed: 0,id,sales
0,3000888,2.396544
1,3000889,2.396544
2,3000890,4.970418
3,3000891,6.784168
4,3000892,2.396544


Reference: https://www.kaggle.com/code/bravo03/store-sales-time-series-forecasting?kernelSessionId=160694314