In [124]:
# Data manipulation
# ==============================================================================
import numpy as np
import pandas as pd

# Plots
# ==============================================================================
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
plt.rcParams['lines.linewidth'] = 1.5
%matplotlib inline

# Modeling and Forecasting
# ==============================================================================
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.ForecasterAutoregCustom import ForecasterAutoregCustom
from skforecast.ForecasterAutoregDirect import ForecasterAutoregDirect
from skforecast.model_selection import grid_search_forecaster
from skforecast.model_selection import backtesting_forecaster
from skforecast.utils import save_forecaster
from skforecast.utils import load_forecaster

# Warnings configuration
# ==============================================================================
import warnings
# warnings.filterwarnings('ignore')

In [125]:
sales_data = pd.read_csv('../Data/SalesData.csv')
sales_data.head()

Unnamed: 0,Date,Month,Quarter,Year,Qty,Product,Category,Pharmacy,Location
0,1-11-2015,11,4,2015,2,Injection 14,Skinbooster,Pharmacy 1,London
1,1-11-2015,11,4,2015,1,Injection 14,Skinbooster,Pharmacy 1,London
2,1-11-2015,11,4,2015,1,Injection 1,Dermafiller,Pharmacy 2,Outside London M25
3,1-11-2015,11,4,2015,1,Injection 2,Dermafiller,Pharmacy 2,Outside London M25
4,1-11-2015,11,4,2015,4,Injection 14,Skinbooster,Pharmacy 2,"NorthEast, Midlands"


In [126]:
columns_to_drop= ['Pharmacy', 'Product', 'Location']
sales_data.drop(columns=columns_to_drop, inplace=True)

In [127]:
sales_data.head()

Unnamed: 0,Date,Month,Quarter,Year,Qty,Category
0,1-11-2015,11,4,2015,2,Skinbooster
1,1-11-2015,11,4,2015,1,Skinbooster
2,1-11-2015,11,4,2015,1,Dermafiller
3,1-11-2015,11,4,2015,1,Dermafiller
4,1-11-2015,11,4,2015,4,Skinbooster


In [128]:
encoder = OneHotEncoder()
encoder.fit(sales_data[['Category']])

In [129]:
encoded_columns = encoder.transform(sales_data[['Category']]).toarray()

In [130]:
encoded_df = pd.DataFrame(encoded_columns, columns=encoder.get_feature_names_out(['Category']))
df = pd.concat([sales_data, encoded_df], axis=1)
df.drop('Category', axis=1, inplace=True)

In [131]:
df

Unnamed: 0,Date,Month,Quarter,Year,Qty,Category_Dermafiller,Category_Mesotherapy,Category_Needles,Category_Profilho,Category_Skinbooster,Category_Skincare
0,1-11-2015,11,4,2015,2,0.0,0.0,0.0,0.0,1.0,0.0
1,1-11-2015,11,4,2015,1,0.0,0.0,0.0,0.0,1.0,0.0
2,1-11-2015,11,4,2015,1,1.0,0.0,0.0,0.0,0.0,0.0
3,1-11-2015,11,4,2015,1,1.0,0.0,0.0,0.0,0.0,0.0
4,1-11-2015,11,4,2015,4,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
68631,1-3-2022,3,1,2022,2,0.0,0.0,0.0,0.0,0.0,1.0
68632,1-3-2022,3,1,2022,1,0.0,0.0,0.0,0.0,0.0,1.0
68633,1-3-2022,3,1,2022,1,0.0,0.0,0.0,0.0,0.0,1.0
68634,1-3-2022,3,1,2022,1,0.0,0.0,0.0,0.0,0.0,1.0


In [132]:
#df = df.rename(columns={'fecha': 'Date'})
df['Date'] = pd.to_datetime(df.Date)
df = df.set_index('Date')
#df = df.rename(columns={'x': 'y'})
#df = df.asfreq('MS')
df = df.sort_index()
df.head()

Unnamed: 0_level_0,Month,Quarter,Year,Qty,Category_Dermafiller,Category_Mesotherapy,Category_Needles,Category_Profilho,Category_Skinbooster,Category_Skincare
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2015-01-11,11,4,2015,2,0.0,0.0,0.0,0.0,1.0,0.0
2015-01-11,11,4,2015,1,0.0,0.0,0.0,0.0,1.0,0.0
2015-01-11,11,4,2015,1,1.0,0.0,0.0,0.0,0.0,0.0
2015-01-11,11,4,2015,1,1.0,0.0,0.0,0.0,0.0,0.0
2015-01-11,11,4,2015,4,0.0,0.0,0.0,0.0,1.0,0.0


In [133]:
print(f'Number of rows with missing values: {df.isnull().any(axis=1).mean()}')

Number of rows with missing values: 0.0


In [134]:
# # Verify that a temporary index is complete
# # ==============================================================================
# (df.index == pd.date_range(start=df.index.min(),
#                              end=df.index.max(),
#                              freq=df.index.freq)).all()