In [22]:
#! pip install prophet
#! pip install pystan==2.19.1.1
#! pip install prophet --no-cache-dir --force-reinstall
import pandas as pd
import numpy as np
import warnings
from prophet import Prophet
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.2f}'.format)
warnings.filterwarnings("ignore")

In [23]:
df = pd.read_csv('../data/cleaned_data.csv', parse_dates=['date'])
df.head()

Unnamed: 0,date,store_nbr,family,sales,onpromotion,holiday_type,locale,transferred,dcoilwtico,city,state,store_type,cluster,transactions,year,month,week,quarter,day_of_week,is_crisis,sales_lag_7,rolling_mean_7,is_weekend,is_holiday,promo_last_7_days,days_to_holiday,promotion_status
0,2013-01-01,1,AUTOMOTIVE,0.0,0,Holiday,National,False,93.14,Quito,Pichincha,D,13,0.0,2013,1,1,1,Tuesday,0,0.0,0.0,0,1,0.0,0,Not On Promotion
1,2013-01-01,1,BABY CARE,0.0,0,Holiday,National,False,93.14,Quito,Pichincha,D,13,0.0,2013,1,1,1,Tuesday,0,0.0,0.0,0,1,0.0,0,Not On Promotion
2,2013-01-01,1,BEAUTY,0.0,0,Holiday,National,False,93.14,Quito,Pichincha,D,13,0.0,2013,1,1,1,Tuesday,0,0.0,0.0,0,1,0.0,0,Not On Promotion
3,2013-01-01,1,BEVERAGES,0.0,0,Holiday,National,False,93.14,Quito,Pichincha,D,13,0.0,2013,1,1,1,Tuesday,0,0.0,0.0,0,1,0.0,0,Not On Promotion
4,2013-01-01,1,BOOKS,0.0,0,Holiday,National,False,93.14,Quito,Pichincha,D,13,0.0,2013,1,1,1,Tuesday,0,0.0,0.0,0,1,0.0,0,Not On Promotion


In [24]:
feature_cols = [
    'onpromotion', 'transactions', 'dcoilwtico',
    'sales_lag_7', 'rolling_mean_7',
    'is_holiday', 'is_weekend', 'promo_last_7_days',
    'days_to_holiday', 'year', 'month', 'week', 'day_of_week'
]

target_col = 'sales'

In [25]:
categorical_cols = [
    'holiday_type', 'locale', 'transferred', 'city', 'state',
    'store_type', 'promotion_status', 'day_of_week'
]

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))

In [26]:
scaler = MinMaxScaler()
df = scaler.fit_transform(df[feature_cols + [target_col]])

In [27]:
df = df.sort_values('date')
train = df[df['date'] < '2017-01-01']
test = df[df['date'] >= '2017-01-01']
train.head()

AttributeError: 'numpy.ndarray' object has no attribute 'sort_values'

In [5]:
df_prophet = df.rename(columns={'date': 'ds', 'sales': 'y'})
df_prophet['ds'] = pd.to_datetime(df_prophet['ds'])
df_prophet['onpromotion'] = df_prophet['onpromotion'].astype(float)
df_prophet['is_holiday'] = df_prophet['is_holiday'].astype(float)
df_prophet['sales_lag_7'] = df_prophet['sales_lag_7'].astype(float)
df_prophet['rolling_mean_7'] = df_prophet['rolling_mean_7'].astype(float)

In [6]:
train.head()

Unnamed: 0,date,store_nbr,family,sales,onpromotion,holiday_type,locale,transferred,dcoilwtico,city,state,store_type,cluster,transactions,year,month,week,quarter,day_of_week,is_crisis,sales_lag_7,rolling_mean_7,is_weekend,is_holiday,promo_last_7_days,days_to_holiday,promotion_status
0,2013-01-01,1,AUTOMOTIVE,0.0,0,Holiday,National,False,93.14,Quito,Pichincha,D,13,0.0,2013,1,1,1,Tuesday,0,0.0,0.0,0,1,0.0,0,Not On Promotion
1194,2013-01-01,42,CELEBRATION,0.0,0,Holiday,National,False,93.14,Cuenca,Azuay,D,2,0.0,2013,1,1,1,Tuesday,0,0.0,0.0,0,1,0.0,0,Not On Promotion
1193,2013-01-01,42,BREAD/BAKERY,0.0,0,Holiday,National,False,93.14,Cuenca,Azuay,D,2,0.0,2013,1,1,1,Tuesday,0,0.0,0.0,0,1,0.0,0,Not On Promotion
1192,2013-01-01,42,BOOKS,0.0,0,Holiday,National,False,93.14,Cuenca,Azuay,D,2,0.0,2013,1,1,1,Tuesday,0,0.0,0.0,0,1,0.0,0,Not On Promotion
1191,2013-01-01,42,BEVERAGES,0.0,0,Holiday,National,False,93.14,Cuenca,Azuay,D,2,0.0,2013,1,1,1,Tuesday,0,0.0,0.0,0,1,0.0,0,Not On Promotion


In [7]:
train = df_prophet[df_prophet['ds'] < '2017-01-01']
test = df_prophet[df_prophet['ds'] >= '2017-01-01']
test.head()

Unnamed: 0,ds,store_nbr,family,y,onpromotion,holiday_type,locale,transferred,dcoilwtico,city,state,store_type,cluster,transactions,year,month,week,quarter,day_of_week,is_crisis,sales_lag_7,rolling_mean_7,is_weekend,is_holiday,promo_last_7_days,days_to_holiday,promotion_status
2643889,2017-01-01,41,POULTRY,0.0,0.0,Holiday,National,True,52.01,Machala,El Oro,D,4,0.0,2017,1,52,1,Sunday,0,498.33,191.32,1,1.0,16.0,1461,Not On Promotion
2643897,2017-01-01,42,BEVERAGES,0.0,0.0,Holiday,National,True,52.01,Cuenca,Azuay,D,2,0.0,2017,1,52,1,Sunday,0,3359.0,1015.18,1,1.0,76.0,1461,Not On Promotion
2643896,2017-01-01,42,BEAUTY,0.0,0.0,Holiday,National,True,52.01,Cuenca,Azuay,D,2,0.0,2017,1,52,1,Sunday,0,7.0,466.22,1,1.0,10.0,1461,Not On Promotion
2643895,2017-01-01,42,BABY CARE,0.0,0.0,Holiday,National,True,52.01,Cuenca,Azuay,D,2,0.0,2017,1,52,1,Sunday,0,0.0,491.57,1,1.0,9.0,1461,Not On Promotion
2643894,2017-01-01,42,AUTOMOTIVE,0.0,0.0,Holiday,National,True,52.01,Cuenca,Azuay,D,2,0.0,2017,1,52,1,Sunday,0,7.0,494.43,1,1.0,9.0,1461,Not On Promotion


In [8]:
m = Prophet()
m.add_regressor('sales_lag_7')
m.add_regressor('rolling_mean_7')
m.add_regressor('onpromotion')
m.add_regressor('is_holiday')


<prophet.forecaster.Prophet at 0x251a6dbe900>

In [9]:
m.fit(train)

14:42:48 - cmdstanpy - INFO - Chain [1] start processing
15:05:20 - cmdstanpy - INFO - Chain [1] done processing


<prophet.forecaster.Prophet at 0x251a6dbe900>

In [16]:
future = test[['ds', 'sales_lag_7', 'rolling_mean_7', 'onpromotion', 'is_holiday']].sample(n=20000, random_state=42)
forecast = m.predict(future)

In [17]:
forecast.head()

Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,additive_terms,additive_terms_lower,additive_terms_upper,extra_regressors_additive,extra_regressors_additive_lower,extra_regressors_additive_upper,is_holiday,is_holiday_lower,is_holiday_upper,onpromotion,onpromotion_lower,onpromotion_upper,rolling_mean_7,rolling_mean_7_lower,rolling_mean_7_upper,sales_lag_7,sales_lag_7_lower,sales_lag_7_upper,weekly,weekly_lower,weekly_upper,yearly,yearly_lower,yearly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
0,2017-01-01,277.42,5385.84,6444.68,277.42,277.42,5620.18,5620.18,5620.18,5621.81,5621.81,5621.81,4.56,4.56,4.56,-8.91,-8.91,-8.91,109.93,109.93,109.93,5516.23,5516.23,5516.23,12.03,12.03,12.03,-13.65,-13.65,-13.65,0.0,0.0,0.0,5897.6
1,2017-01-01,277.42,-479.36,496.91,277.42,277.42,-273.09,-273.09,-273.09,-271.46,-271.46,-271.46,4.56,4.56,4.56,-8.91,-8.91,-8.91,31.78,31.78,31.78,-298.88,-298.88,-298.88,12.03,12.03,12.03,-13.65,-13.65,-13.65,0.0,0.0,0.0,4.33
2,2017-01-01,277.42,-539.36,455.04,277.42,277.42,-315.84,-315.84,-315.84,-314.21,-314.21,-314.21,4.56,4.56,4.56,-8.91,-8.91,-8.91,-9.19,-9.19,-9.19,-300.67,-300.67,-300.67,12.03,12.03,12.03,-13.65,-13.65,-13.65,0.0,0.0,0.0,-38.42
3,2017-01-01,277.42,14347.73,15319.03,277.42,277.42,14535.65,14535.65,14535.65,14537.28,14537.28,14537.28,4.56,4.56,4.56,-8.91,-8.91,-8.91,297.27,297.27,297.27,14244.36,14244.36,14244.36,12.03,12.03,12.03,-13.65,-13.65,-13.65,0.0,0.0,0.0,14813.06
4,2017-01-01,277.42,-409.25,611.12,277.42,277.42,-164.46,-164.46,-164.46,-162.83,-162.83,-162.83,4.56,4.56,4.56,-8.91,-8.91,-8.91,70.88,70.88,70.88,-229.35,-229.35,-229.35,12.03,12.03,12.03,-13.65,-13.65,-13.65,0.0,0.0,0.0,112.96


In [18]:
results = forecast[['ds', 'yhat']].merge(test[['ds', 'y']], on='ds', how='inner')
results.rename(columns={'y': 'actual', 'yhat': 'predicted'}, inplace=True)

In [20]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mae = mean_absolute_error(results['actual'], results['predicted'])
mse = mean_squared_error(results['actual'], results['predicted'])
rmse = mse ** 0.5
r2 = r2_score(results['actual'], results['predicted'])

print(f"MAE: {mae:.2f}")
print(f"MSE: {mse:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")


MAE: 835.36
MSE: 3554898.81
RMSE: 1885.44
R² Score: -0.90
