# Importing packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold, cross_validate, GridSearchCV, TimeSeriesSplit

# Importing the data

In [81]:
data_path = r"C:\Users\titou\Desktop\python_test_files\ML_classic\hackathon\store_sales_forecasting\data"

input_data_df = pd.read_csv(data_path + r'\train.csv')
#test_data_df = pd.read_csv(data_path + r'\test.csv')
holiday_df = pd.read_csv(data_path + r'\holidays_events.csv')
oil_df = pd.read_csv(data_path + r'\oil.csv')
stores_df = pd.read_csv(data_path + r'\stores.csv')
#transaction_df = pd.read_csv(data_path + r'\transactions.csv')

In [50]:
print("input data:", input_data_df.info(), "\n")
print("holiday:",holiday_df.info(), "\n")
print("oil:",oil_df.info(), "\n")
print("stores:",stores_df.info(), "\n")
#print("transaction:",transaction_df.info(), "\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000888 entries, 0 to 3000887
Data columns (total 6 columns):
 #   Column       Dtype  
---  ------       -----  
 0   id           int64  
 1   date         object 
 2   store_nbr    int64  
 3   family       object 
 4   sales        float64
 5   onpromotion  int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 137.4+ MB
input data: None 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   date         350 non-null    object
 1   type         350 non-null    object
 2   locale       350 non-null    object
 3   locale_name  350 non-null    object
 4   description  350 non-null    object
 5   transferred  350 non-null    bool  
dtypes: bool(1), object(5)
memory usage: 14.1+ KB
holiday: None 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1218 entries, 0 to 1217
Data columns (total 2 col

In [None]:
input_data_extended_df = pd.merge

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0
4,4,2013-01-01,1,BOOKS,0.0,0


In [52]:
input_data_df.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0
4,4,2013-01-01,1,BOOKS,0.0,0


In [53]:
holiday_df.head()

Unnamed: 0,date,type,locale,locale_name,description,transferred
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False


In [60]:
stores_df.head()

Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4


In [54]:
oil_df.head()

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


# Data Cleaning

### Cleaning holiday

In [82]:
holiday_df_clean = holiday_df[["date","type", "transferred"]].copy()
holiday_df_clean[["type", "transferred"]] = holiday_df[["type", "transferred"]].astype("category")
holiday_df_clean["type"] = holiday_df_clean["type"].cat.codes + 1
holiday_df_clean["transferred"] = holiday_df_clean["transferred"].cat.codes

In [98]:
holiday_df_clean = holiday_df_clean.rename(columns = {"type":"holiday_type","transferred": "holiday_transferred"})
holiday_df_clean

Unnamed: 0,date,holiday_type,holiday_transferred
0,2012-03-02,4,0
1,2012-04-01,4,0
2,2012-04-12,4,0
3,2012-04-14,4,0
4,2012-04-21,4,0
...,...,...,...
345,2017-12-22,1,0
346,2017-12-23,1,0
347,2017-12-24,1,0
348,2017-12-25,4,0


### Cleaning oil

In [None]:
oil_df_clean = oil_df.ffill().bfill()


date          False
dcoilwtico     True
dtype: bool

### Cleaning store

In [118]:
stores_df_clean = stores_df.copy()
stores_df_clean[["city", "state", "type", "cluster"]] = stores_df_clean[["city", "state", "type", "cluster"]].astype("category")
for col in ["city", "state", "type", "cluster"]:
    stores_df_clean[col] = stores_df_clean[col].cat.codes

# Data merging

In [125]:
print(input_data_df.isna().any(axis=0))
input_data_df.head()

id             False
date           False
store_nbr      False
family         False
sales          False
onpromotion    False
dtype: bool


Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0
4,4,2013-01-01,1,BOOKS,0.0,0


In [129]:
print(holiday_df_clean.isna().any(axis=0))
holiday_df_clean.head()

date                   False
holiday_type           False
holiday_transferred    False
dtype: bool


Unnamed: 0,date,holiday_type,holiday_transferred
0,2012-03-02,4,0
1,2012-04-01,4,0
2,2012-04-12,4,0
3,2012-04-14,4,0
4,2012-04-21,4,0


In [126]:
print(oil_df_clean.isna().any(axis=0))
oil_df_clean.head()

date          False
dcoilwtico    False
dtype: bool


Unnamed: 0,date,dcoilwtico
0,2013-01-01,93.14
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


In [130]:
print(stores_df_clean.isna().any(axis=0))
stores_df_clean.head()

store_nbr    False
city         False
state        False
type         False
cluster      False
dtype: bool


Unnamed: 0,store_nbr,city,state,type,cluster
0,1,18,12,3,12
1,2,18,12,3,12
2,3,18,12,3,7
3,4,18,12,3,8
4,5,21,14,3,3


In [144]:
input_data_merged_df = input_data_df.merge(holiday_df_clean, on="date", how="left")
input_data_merged_df = input_data_merged_df.fillna(0)
input_data_merged_df.isna().any(axis=0)

id                     False
date                   False
store_nbr              False
family                 False
sales                  False
onpromotion            False
holiday_type           False
holiday_transferred    False
dtype: bool

In [146]:
input_data_merged_df = input_data_merged_df.merge(oil_df_clean, on="date", how="left")
input_data_merged_df["dcoilwtico"] = input_data_merged_df["dcoilwtico"].ffill()
input_data_merged_df.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,holiday_type,holiday_transferred,dcoilwtico
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,4.0,0.0,93.14
1,1,2013-01-01,1,BABY CARE,0.0,0,4.0,0.0,93.14
2,2,2013-01-01,1,BEAUTY,0.0,0,4.0,0.0,93.14
3,3,2013-01-01,1,BEVERAGES,0.0,0,4.0,0.0,93.14
4,4,2013-01-01,1,BOOKS,0.0,0,4.0,0.0,93.14


In [147]:
input_data_merged_df.isna().any(axis=0)

id                     False
date                   False
store_nbr              False
family                 False
sales                  False
onpromotion            False
holiday_type           False
holiday_transferred    False
dcoilwtico             False
dtype: bool

In [151]:
input_data_merged_df = input_data_merged_df.merge(stores_df_clean, on="store_nbr", how="left")
input_data_merged_df.isna().any(axis=0)

id                     False
date                   False
store_nbr              False
family                 False
sales                  False
onpromotion            False
holiday_type           False
holiday_transferred    False
dcoilwtico             False
city                   False
state                  False
type                   False
cluster                False
dtype: bool

In [160]:
input_data_merged_df["family"] = input_data_merged_df["family"].astype("category")
input_data_merged_df["family"] = input_data_merged_df["family"].cat.codes
input_data_merged_df = input_data_merged_df.drop(columns=["date", "id"])

KeyError: "['date'] not found in axis"

In [165]:
input_data_merged_df.head()

Unnamed: 0,store_nbr,family,sales,onpromotion,holiday_type,holiday_transferred,dcoilwtico,city,state,type,cluster
0,1,0,0.0,0,4.0,0.0,93.14,18,12,3,12
1,1,1,0.0,0,4.0,0.0,93.14,18,12,3,12
2,1,2,0.0,0,4.0,0.0,93.14,18,12,3,12
3,1,3,0.0,0,4.0,0.0,93.14,18,12,3,12
4,1,4,0.0,0,4.0,0.0,93.14,18,12,3,12


# Building the model

In [172]:
X = input_data_merged_df.drop(columns="sales").values
y = input_data_merged_df["sales"].values

In [173]:
X.shape

(3054348, 10)

In [174]:
y.shape

(3054348,)