In [1]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/store-sales-time-series-forecasting/oil.csv
/kaggle/input/store-sales-time-series-forecasting/sample_submission.csv
/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv
/kaggle/input/store-sales-time-series-forecasting/stores.csv
/kaggle/input/store-sales-time-series-forecasting/train.csv
/kaggle/input/store-sales-time-series-forecasting/test.csv
/kaggle/input/store-sales-time-series-forecasting/transactions.csv


In [2]:
import pandas as pd
pd.options.plotting.backend = "plotly"
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from statsmodels.graphics import tsaplots
from sklearn.multioutput import RegressorChain, MultiOutputRegressor

# Model 1 (trend)
from pyearth import Earth
from sklearn.linear_model import ElasticNet, Lasso, Ridge

# Model 2
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

In [3]:
holidays_events = pd.read_csv(
    '/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv',
    dtype={
        'type': 'category',
        'locale': 'category',
        'locale_name': 'category',
        'description': 'category',
        'transferred': 'bool',
    },
    parse_dates=['date'],
    infer_datetime_format=True,
)
holidays_events = holidays_events.set_index('date').to_period('D')

store_sales = pd.read_csv(
   '/kaggle/input/store-sales-time-series-forecasting/train.csv',
    usecols=['store_nbr', 'family', 'date', 'sales', 'onpromotion'],
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'sales': 'float32',
    },
    parse_dates=['date'],
    infer_datetime_format=True,
)
store_sales['date'] = store_sales.date.dt.to_period('D')
store_sales = store_sales.set_index(['store_nbr', 'family', 'date']).sort_index()
sales = store_sales.copy()
store_sales

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sales,onpromotion
store_nbr,family,date,Unnamed: 3_level_1,Unnamed: 4_level_1
1,AUTOMOTIVE,2013-01-01,0.000000,0
1,AUTOMOTIVE,2013-01-02,2.000000,0
1,AUTOMOTIVE,2013-01-03,3.000000,0
1,AUTOMOTIVE,2013-01-04,3.000000,0
1,AUTOMOTIVE,2013-01-05,5.000000,0
...,...,...,...,...
9,SEAFOOD,2017-08-11,23.830999,0
9,SEAFOOD,2017-08-12,16.859001,4
9,SEAFOOD,2017-08-13,20.000000,0
9,SEAFOOD,2017-08-14,17.000000,0


In [4]:
sales.reset_index(inplace=True)
sales['dayofweek'] = sales['date'].dt.dayofweek
# sales

In [5]:
seasonal = sales.groupby(['family','dayofweek']).sum().reset_index()
# seasonal

In [6]:

dayOfWeekfig = go.Figure()
for fam in seasonal['family'].unique():
    x = seasonal[seasonal['family'] == fam]['dayofweek'].values.tolist()
    y = seasonal[seasonal['family'] == fam]['sales'].values.tolist()
    dayOfWeekfig.add_trace(go.Scatter(x=x, y=y, name= fam))
    
dayOfWeekfig.update_layout(title='Sales per family', xaxis_title='Day of Week', yaxis_title='Sales(M)')
dayOfWeekfig.show()

In [7]:
average_sales = (sales.groupby('date').mean().squeeze())
# average_sales

In [8]:
x = average_sales.reset_index()['date'].astype('str').values.tolist()
y = average_sales['sales'].values.tolist()
trendfig = go.Figure()
trendfig.add_trace(go.Scatter(x=x, y=y, name='Average Sales'))
trendfig.update_layout(title='Sales', xaxis_title='Date', yaxis_title='Sales(M)')
trendfig.show()

# Trend

In [9]:
moving_avg_365 = average_sales['sales'].rolling(window=365, center=False).mean().reset_index()
# moving_avg_365

In [10]:
x = moving_avg_365['date'].astype('str').values.tolist()
y = moving_avg_365['sales'].values.tolist()
trendfig.add_trace(go.Scatter(x=x, y=y, name='Moving Average 365'))
trendfig.show()

In [11]:
dp = DeterministicProcess(
    index=average_sales.index,  # dates from the training data
    constant=True,       # dummy feature for the bias (y_intercept)
    order=1,             # the time dummy (trend)
    drop=True,           # drop terms if necessary to avoid collinearity
)
# `in_sample` creates features for the dates given in the `index` argument
X = dp.in_sample()
y = average_sales['sales']
# X.head()

In [12]:
lr = LinearRegression(fit_intercept=False)
lr.fit(X,y)

y_pred = pd.Series(lr.predict(X), index=X.index)
# y_pred.head()

In [13]:
X = dp.out_of_sample(steps=180)
y_fore = pd.Series(lr.predict(X), index=X.index).reset_index().set_axis(['date', 'sales'], axis=1)
# y_fore

In [14]:
x = y_fore['date'].astype('str').values.tolist()
y = y_fore['sales'].values.tolist()
trendfig.add_trace(go.Scatter(x=x, y=y, name='Prediction'))
trendfig.show()

In [15]:
detrend = (average_sales.reset_index()['sales']-moving_avg_365['sales'])
detrend = detrend.set_axis(average_sales.index).reset_index()

x = detrend['date'].astype('str').values.tolist()
y = detrend['sales'].values.tolist()
detrendfig = go.Figure()
detrendfig.add_trace(go.Scatter(x=x,y=y))
detrendfig.update_layout(title='Detrend')
detrendfig.show()



# Seasonality

In [16]:
fourier = CalendarFourier(freq="A", order=10)  # 10 sin/cos pairs for "A"nnual seasonality

dp = DeterministicProcess(
    index=average_sales.index,
    constant=True,   # dummy feature for bias (y-intercept)
    order=1,         # trend ( order 1 means linear)
    seasonal=True,   # weekly seasonality (indicators)
    additional_terms=[fourier], # annual seasonality
    drop=True,       # drop terms to avoid collinearity
)

X = dp.in_sample() # create features for dates in tunnel.index
y = average_sales["sales"]
#X.head()

In [17]:
lr = LinearRegression().fit(X,y)
y_pred = pd.Series(lr.predict(X), index=X.index)
y_deseason = y - y_pred

X_fore = dp.out_of_sample(steps=180)
y_fore = pd.Series(lr.predict(X_fore), index=X_fore.index).reset_index().set_axis(['date', 'sales'], axis=1)

seasonalfig = go.Figure()
seasonalfig.add_trace(go.Scatter(x=y.reset_index()['date'].astype('str').values.tolist(), 
                                 y=y.values.tolist(),
                                 mode='lines+markers',
                                 name='Average Sales'))
seasonalfig.add_trace(go.Scatter(x=y_pred.reset_index()['date'].astype('str').values.tolist(),
                                 y=y_pred.values.tolist(),
                                 name='seasonal'))
seasonalfig.add_trace(go.Scatter(x=y_deseason.reset_index()['date'].astype('str').values.tolist(),
                                 y=y_deseason.values.tolist(),
                                 name='deseasonal'))
seasonalfig.add_trace(go.Scatter(x=y_fore['date'].astype('str').values.tolist(),
                                 y=y_fore['sales'].values.tolist(),
                                 name='predict'))
seasonalfig.update_layout(title='Seasonal')
seasonalfig.show()


In [18]:
family_sales = (
    store_sales
    .groupby(['store_nbr','family', 'date'])
    .mean() 
    .unstack(['family', 'store_nbr'])
    .loc['2017', ['sales','onpromotion']]
)
# family_sales = (
#     store_sales
#     .groupby(['family', 'date'])
#     .mean() 
#     .unstack('family')
#     .loc['2017', ['sales','onpromotion']]
# )

# display(family_sales)
# display(store_sales)
df_sales = family_sales.loc(axis=1)[:, 'MAGAZINES']
display(df_sales)

Unnamed: 0_level_0,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,...,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion
family,MAGAZINES,MAGAZINES,MAGAZINES,MAGAZINES,MAGAZINES,MAGAZINES,MAGAZINES,MAGAZINES,MAGAZINES,MAGAZINES,...,MAGAZINES,MAGAZINES,MAGAZINES,MAGAZINES,MAGAZINES,MAGAZINES,MAGAZINES,MAGAZINES,MAGAZINES,MAGAZINES
store_nbr,1,10,11,12,13,14,15,16,17,18,...,5,50,51,52,53,54,6,7,8,9
date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2017-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-01-02,1.0,1.0,5.0,3.0,1.0,4.0,1.0,1.0,7.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-01-03,1.0,0.0,3.0,1.0,0.0,1.0,1.0,0.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-01-04,6.0,0.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-01-05,6.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,4.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-11,4.0,1.0,1.0,2.0,0.0,2.0,2.0,4.0,8.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-08-12,6.0,0.0,2.0,0.0,0.0,4.0,2.0,2.0,8.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-08-13,4.0,1.0,1.0,1.0,1.0,3.0,2.0,0.0,4.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-08-14,8.0,0.0,3.0,3.0,1.0,4.0,3.0,2.0,6.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# y = df_sales.loc[:, 'sales'].squeeze()
# display(y)
# fourier = CalendarFourier(freq='M', order=4)
# dp = DeterministicProcess(
#     constant=True,
#     index=y.index,
#     order=1,
#     seasonal=True,
#     drop=True,
#     additional_terms=[fourier],
# )

# X_time = dp.in_sample()
# X_time['NewYearsDay'] = (X_time.index.dayofyear == 1)

# lr = LinearRegression(fit_intercept=False)
# lr.fit(X_time, y)
# y_deseason = y - lr.predict(X_time)

In [20]:
# pacf_plot = tsaplots.plot_pacf(y_deseason, lags=8)

# Hybrid Models

In [21]:
# Target series
y = family_sales.loc[:, 'sales']
# display(y)

# X_1: Features for Linear Regression
fourier = CalendarFourier(freq='D', order=4)
dp = DeterministicProcess(
    index=y.index,
    constant=True,
    order=5,
    seasonal=True,
    additional_terms=[fourier],
    drop=True,
)
X_1 = dp.in_sample()


# X_2: Features for XGBoost
X_2 = family_sales.drop('sales', axis=1).stack()  # onpromotion feature
# display(X_2)
# Label encoding for 'family'
le = LabelEncoder()  # from sklearn.preprocessing
X_2 = X_2.stack()
X_2 = X_2.reset_index(['family', 'store_nbr'])

X_2['family'] = le.fit_transform(X_2['family'])

# Label encoding for seasonality
X_2["day"] = X_2.index.day  # values are day of the month

# display(X_1)
# display(X_2)

In [22]:
class BoostedHybrid:
    def __init__(self, model_1, model_2):
        self.model_1 = model_1
        self.model_2 = model_2
        self.y_columns = None  # store column names from fit method
    
    def fit(self, X_1, X_2, y):
        # Train model_1
        self.model_1.fit(X_1, y)

        # Make predictions
        y_fit = pd.DataFrame(
            self.model_1.predict(X_1), 
            index=X_1.index, 
            columns=y.columns,
        )

        # Compute residuals
        y_resid = y - y_fit
        y_resid = y_resid.stack().squeeze() # wide to long
#         display(y_resid.stack())
        # Train model_2 on residuals
        self.model_2.fit(X_2, y_resid.stack())

        # Save column names for predict method
        self.y_columns = y.columns
        # Save data for question checking
        self.y_fit = y_fit
        self.y_resid = y_resid
        
    def predict(self, X_1, X_2):
        # Predict with model_1
        y_pred = pd.DataFrame(
            self.model_1.predict(X_1), 
            index=X_1.index, columns=self.y_columns,
        )
#         display('model 1 predict', self.model_1.predict(X_1), self.model_1.predict(X_1).shape)
#         display('y_pred before stack squeeze', y_pred)
        y_pred = y_pred.stack().squeeze()  # wide to long

        # Add model_2 predictions to model_1 predictions
#         display('y_pred after stack squeeze', y_pred)
#         display('model 2 predict', np.reshape(self.model_2.predict(X_2), (-1,33)))
#         y_pred += np.reshape(self.model_2.predict(X_2), (-1, 33))
#         display(y_pred.unstack())
        return y_pred.unstack()

In [23]:
model = BoostedHybrid(
    model_1=Ridge(),
    model_2=KNeighborsRegressor(),
)

model.fit(X_1, X_2, y)
y_pred = model.predict(X_1, X_2)

y_pred = y_pred.clip(0.0)


Ill-conditioned matrix (rcond=1.86813e-25): result may not be accurate.


Arrays of bytes/strings is being converted to decimal numbers if dtype='numeric'. This behavior is deprecated in 0.24 and will be removed in 1.1 (renaming of 0.26). Please convert your data to numeric values explicitly instead.



In [24]:
y_train, y_valid = y[:"2017-07-01"], y["2017-07-02":]
X1_train, X1_valid = X_1[: "2017-07-01"], X_1["2017-07-02" :]
X2_train, X2_valid = X_2.loc[:"2017-07-01"], X_2.loc["2017-07-02":]

# model.fit(X1_train, X2_train, y_train)
y_fit = model.predict(X1_train, X2_train).clip(0.0)
y_pred = model.predict(X1_valid, X2_valid).clip(0.0)

families = y.columns[0:6]

# display(y.loc(axis=1)[families])
# display(y_fit.loc(axis=1)[families])
# display(y_pred.loc(axis=1)[families])

In [25]:
total_training = 0
total_validation = 0
for fam in families:
    fam_string = " ".join(fam)
    famplot = go.Figure()
    x_train_val = y.loc(axis=1)[families].index.astype('str')
    y_train_val = y.loc(axis=1)[families][fam].values
#     print(f'x_train_val : {x_train_val} \n y_train_val : {y_train_val} \n fam : {fam_string}')
    famplot.add_trace(go.Scatter(x=x_train_val,y=y_train_val, name=fam_string+"-sales", mode='lines+markers'))
    
    x_fit_val = y_fit.loc(axis=1)[families].index.astype('str')
    y_fit_val = y_fit.loc(axis=1)[families][fam].values
    famplot.add_trace(go.Scatter(x=x_fit_val,y=y_fit_val, name=fam_string+"-fit"))
    
    x_predict = y_pred.loc(axis=1)[families].index.astype('str')
    y_predict = y_pred.loc(axis=1)[families][fam].values
    famplot.add_trace(go.Scatter(x=x_predict,y=y_predict, name=fam_string+"-predict"))
    famplot.update_layout(title=fam_string)
    famplot.show()

    rmsle_train = mean_squared_log_error(y_train_val[:len(y_fit_val)], y_fit_val) ** 0.5
    rmsle_valid = mean_squared_log_error(y_train_val[:len(y_predict)], y_predict) ** 0.5
    print(f'Training RMSLE for {fam} : {rmsle_train:.5f}')
    print(f'Validation RMSLE for {fam} : {rmsle_valid:.5f}')
    total_training += rmsle_train
    total_validation += rmsle_valid
    
print(f'Total _training : {total_training}')
print(f'Total _validation : {total_validation}')

Training RMSLE for ('AUTOMOTIVE', '1') : 0.53189
Validation RMSLE for ('AUTOMOTIVE', '1') : 0.62859


Training RMSLE for ('BABY CARE', '1') : 0.00000
Validation RMSLE for ('BABY CARE', '1') : 0.00000


Training RMSLE for ('BEAUTY', '1') : 0.57940
Validation RMSLE for ('BEAUTY', '1') : 0.66973


Training RMSLE for ('BEVERAGES', '1') : 0.58261
Validation RMSLE for ('BEVERAGES', '1') : 1.08590


Training RMSLE for ('BOOKS', '1') : 0.41631
Validation RMSLE for ('BOOKS', '1') : 0.58575


Training RMSLE for ('BREAD/BAKERY', '1') : 0.45650
Validation RMSLE for ('BREAD/BAKERY', '1') : 0.82219
Total _training : 2.566712631403541
Total _validation : 3.792157621024776


# Forecasting

In [26]:
df_test = pd.read_csv(
    '/kaggle/input/store-sales-time-series-forecasting/test.csv',
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'onpromotion': 'uint32',
    },
    parse_dates=['date'],
    infer_datetime_format=True,
)
df_test['date'] = df_test.date.dt.to_period('D')
df_test = df_test.set_index(['store_nbr', 'family', 'date']).sort_index()
# df_test

In [27]:
y = df_test.unstack(['store_nbr', 'family']).loc["2017"]
# print(y)
# Create training data
# fourier = CalendarFourier(freq='M', order=4)
# dp = DeterministicProcess(
#     index=y.index,
#     constant=True,
#     order=5,
#     seasonal=True,
#     additional_terms=[fourier],
#     drop=True,
# )
# X = dp.in_sample()

fourier = CalendarFourier(freq='D', order=4)
dp = DeterministicProcess(
    index=y.index,
    constant=True,
    order=5,
    seasonal=True,
    additional_terms=[fourier],
    drop=True,
)

X_1 = dp.in_sample()
# display(X_1)
# X['NewYear'] = (X.index.dayofyear == 1)


In [28]:
# X_1: Features for Linear Regression
X_1 = dp.in_sample()

# X_2: Features for XGBoost
X_2 = df_test  # onpromotion feature

# Label encoding for 'family'
le = LabelEncoder()  # from sklearn.preprocessing
X_2 = X_2.reset_index('family')

X_2['family'] = le.fit_transform(X_2['family'])
X_2.reset_index(inplace=True)
# display(X_2['date'].dt.day)
# Label encoding for seasonality
X_2["day"] = X_2['date'].dt.day  # values are day of the month
X_2 = X_2.groupby(['date', 'family', 'store_nbr']).sum()
X_2 = X_2.reset_index().set_index('date')[['family', 'onpromotion', 'day', 'store_nbr']]
# display(X_1)
# display(X_2)

model.predict(X_1, X_2)

family,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,...,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD
store_nbr,1,10,11,12,13,14,15,16,17,18,...,5,50,51,52,53,54,6,7,8,9
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2017-08-16,1.535418,2.602253,10.434554,5.743122,8.620365,7.216562,1.792163,4.904678,12.720517,5.409045,...,12.630771,30.829883,54.016178,-0.886168,8.04147,1.194861,64.890784,42.073772,46.886162,14.157973
2017-08-17,3.09933,1.757104,7.475052,4.460642,8.264202,6.499988,1.720009,3.474977,8.890422,2.654078,...,11.212231,25.024981,48.514877,-1.902612,6.719588,1.15954,47.490021,51.129697,31.694721,9.396126
2017-08-18,2.930088,2.092022,7.029349,3.188239,7.008364,5.73074,1.288599,3.492825,7.694098,2.699218,...,10.853266,22.308175,38.517302,-0.477998,5.796196,1.432594,42.351573,46.845659,31.446087,9.810805
2017-08-19,3.184904,1.96783,5.606031,4.040648,6.996726,5.703669,0.94405,2.886415,7.507629,2.997691,...,10.485752,22.185406,47.43684,-1.919537,6.464994,2.019717,43.971583,55.607196,40.897146,8.556791
2017-08-20,3.287869,2.062635,5.493126,3.973816,7.163802,5.504482,0.697975,2.722593,6.753247,2.697783,...,14.208123,20.123512,33.752911,-1.040412,6.21801,0.813173,34.965509,41.722161,31.5678,7.892806
2017-08-21,4.545229,2.032548,6.945561,3.574426,7.211442,5.465915,1.446135,3.650779,6.374494,2.800761,...,12.650489,26.247207,73.552353,-0.085424,9.564994,-0.180472,63.131903,68.309062,67.106837,10.805498
2017-08-22,3.775074,3.392823,8.811183,7.842392,10.139596,8.375576,3.61291,5.458774,14.855823,5.306357,...,10.488935,25.768183,49.141494,1.699571,8.998618,1.220572,58.249804,39.787087,54.904812,19.530164
2017-08-23,1.44472,3.042403,8.976419,6.188671,8.212646,6.343531,2.719912,5.275664,12.056386,4.773449,...,10.945104,25.304348,51.750149,0.762229,8.579347,1.478545,66.171664,42.789961,50.748063,17.005756
2017-08-24,3.027636,2.171005,6.15907,4.911554,7.871668,5.690047,2.605262,3.851022,8.309609,2.08031,...,9.660572,19.824908,46.433362,-0.419279,7.210631,1.444217,48.673503,51.806825,35.360974,12.07884
2017-08-25,2.876501,2.480682,5.849862,3.643891,6.630639,4.981303,2.132655,3.87323,7.193047,2.184802,...,9.430668,17.42313,36.613747,0.848137,6.2423,1.717878,43.441658,47.485156,34.923168,12.33487


In [29]:
# model = MultiOutputRegressor(XGBRegressor())
# model.fit(X, y)
# y_pred = pd.DataFrame(model.predict(X), index=X.index, columns=y.columns)

# model = LinearRegression(fit_intercept=False)
# model.fit(X, y)
# y_pred = pd.DataFrame(model.predict(X), index=X.index, columns=y.columns)

In [30]:
# temp = df_test.reset_index()
# prediction = model.predict(X_1, X_2).stack().reset_index().set_axis(['date', 'family', 'sales'], axis=1)

# # display(prediction)
# # display(temp)

# results = temp.merge(prediction, on=['date', 'family'])
# results = results.reindex(columns=['id', 'sales'])
# results.to_csv('submission.csv', index=False)

In [31]:
# df_test = pd.read_csv(
#     '/kaggle/input/store-sales-time-series-forecasting/test.csv',
#     dtype={
#         'store_nbr': 'category',
#         'family': 'category',
#         'onpromotion': 'uint32',
#     },
#     parse_dates=['date'],
#     infer_datetime_format=True,
# )
# df_test['date'] = df_test.date.dt.to_period('D')
# df_test = df_test.set_index(['store_nbr', 'family', 'date']).sort_index()

# # Create features for test set
# X_test = dp.out_of_sample(steps=16)
# X_test.index.name = 'date'
# X_test['NewYear'] = (X_test.index.dayofyear == 1)

y_submit = pd.DataFrame(model.predict(X_1, X_2), index=X_1.index)
y_submit = y_submit.stack(['store_nbr', 'family'])

In [32]:
y_submit.reset_index()
df_test.reset_index()
results = pd.merge(y_submit.reset_index(), df_test.reset_index(), on=['store_nbr', 'family', 'date'])[['id', 0]]
results = results.set_axis(['id', 'sales'], axis=1)
results.to_csv('submission.csv', index=False)