In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import math

In [2]:
data=pd.read_csv('final_data.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27390 entries, 0 to 27389
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         27390 non-null  object 
 1   Week         27390 non-null  int64  
 2   ProdName     27390 non-null  object 
 3   Price        27390 non-null  float64
 4   Seasonality  27390 non-null  int64  
 5   Quantity     27390 non-null  int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 1.3+ MB


In [4]:
data=data[data['Quantity']>0]

In [5]:
one_hot_encoded = pd.get_dummies(data['Seasonality'], prefix='Season').iloc[:, :-1]

data = pd.concat([data, one_hot_encoded], axis=1)

In [6]:
data.head()

Unnamed: 0,Date,Week,ProdName,Price,Seasonality,Quantity,Season_1,Season_2,Season_3,Season_4,Season_5,Season_6
0,1/2/2018,1,Basil Extra Virgin Olive Oil,25.0,2,2,False,True,False,False,False,False
1,1/3/2018,1,Basil Extra Virgin Olive Oil,24.0,3,4,False,False,True,False,False,False
2,1/4/2018,1,Basil Extra Virgin Olive Oil,26.0,4,3,False,False,False,True,False,False
3,1/5/2018,1,Basil Extra Virgin Olive Oil,23.0,5,3,False,False,False,False,True,False
4,1/6/2018,1,Basil Extra Virgin Olive Oil,28.0,6,5,False,False,False,False,False,True


In [7]:
results_df=pd.DataFrame({'ProdName': data['ProdName'].unique()}) 
for product in list(data['ProdName'].unique()):
    ts_data=data[data['ProdName']==product].reset_index(drop=True)
    ts_data.sort_values(by='Date',inplace=True)
    X=ts_data.drop(['Quantity','ProdName','Date','Seasonality'],axis=1)
    y=ts_data['Quantity']
    tscv = TimeSeriesSplit(n_splits=5)
    train_scores=np.zeros((5,4))
    val_scores=np.zeros((5,4))
    index=0
    for train_index, val_index in tscv.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        
        model1 = LinearRegression()
        model1.fit(X_train, y_train)
        train_scores[index,0] = mean_absolute_error(y_train, model1.predict(X_train))
        val_scores[index,0] = mean_absolute_error(y_val, model1.predict(X_val))
        model5 = LinearRegression()
        model5.fit(X,y)
        std_dev5=np.std(np.array(y) - model5.predict(X))
        
        model2 = LinearRegression()
        model2.fit(X_train.applymap(lambda x: math.log1p(x)), y_train)
        train_scores[index,1] = mean_absolute_error(y_train, model2.predict(X_train.applymap(lambda x: math.log1p(x))))
        val_scores[index,1] = mean_absolute_error(y_val, model2.predict(X_val.applymap(lambda x: math.log1p(x))))
        model6 = LinearRegression()
        model6.fit(X.applymap(lambda x: math.log1p(x)),y)
        std_dev6=np.std(np.array(y) - model6.predict(X.applymap(lambda x: math.log1p(x))))
        
        model3 = LinearRegression()
        model3.fit(X_train, np.log1p(y_train))
        train_scores[index,2] = mean_absolute_error(y_train, np.expm1(model3.predict(X_train)))
        val_scores[index,2] = mean_absolute_error(y_val, np.expm1(model3.predict(X_val)))
        model7 = LinearRegression()
        model7.fit(X, np.log1p(y))
        std_dev7=np.std(np.array(y) - np.expm1(model7.predict(X)))
        
        model4 = LinearRegression()
        model4.fit(X_train.applymap(lambda x: math.log1p(x)), np.log1p(y_train))
        train_scores[index,3] = mean_absolute_error(y_train, np.expm1(model4.predict(X_train.applymap(lambda x: math.log1p(x)))))
        val_scores[index,3] = mean_absolute_error(y_val, np.expm1(model4.predict(X_val.applymap(lambda x: math.log1p(x)))))
        model8 = LinearRegression()
        model8.fit(X.applymap(lambda x: math.log1p(x)),np.log1p(y))
        std_dev8=np.std(np.array(y) - np.expm1(model8.predict(X.applymap(lambda x: math.log1p(x)))))
        
        index=index+1
    train_scores=np.mean(train_scores, axis=0)
    val_scores=np.mean(val_scores, axis=0)
    
    results_df.loc[results_df['ProdName'] == product, 'Val_Score'] = np.min(val_scores)
    results_df.loc[results_df['ProdName'] == product, 'Train_Score'] = np.min(train_scores)
    results_df.loc[results_df['ProdName'] == product, 'Model'] = [model5,model6,model7,model8][np.argmin(val_scores)]
    results_df.loc[results_df['ProdName'] == product, 'ModelType'] = np.argmin(val_scores)
    results_df.loc[results_df['ProdName'] == product, 'Std_Dev'] = [std_dev5,std_dev6,std_dev7,std_dev8][np.argmin(val_scores)]

In [8]:
results_df['ModelType'].value_counts()

ModelType
2.0    7
0.0    5
3.0    3
Name: count, dtype: int64

In [9]:
def sample_prices(group):
    return group.sample(n=30, replace=True)

# Group by 'ProdName' and apply the sampling function
sampled_data = data[['ProdName','Price']].groupby('ProdName').apply(sample_prices).reset_index(drop=True).sort_values(by='ProdName').drop('ProdName', axis=1)

In [10]:
X_forecast = pd.concat([pd.DataFrame({'ProdName' :data['ProdName'].unique()}), pd.Series(1, index=pd.DataFrame(data['ProdName'].unique()).index, name='Key')], axis=1)\
.merge(pd.DataFrame({'Date': pd.date_range(start='2024-04-01', end='2024-04-30', freq='D'), 'Key': 1}),\
      how='inner' , on='Key')

In [11]:
X_forecast.drop('Key',axis=1, inplace=True)
X_forecast['Seasonality']=X_forecast['Date'].dt.dayofweek+1

In [12]:
X_forecast['Week'] = (X_forecast['Date'].dt.day - 1) // 7 + 327
X_forecast.sort_values(by=['ProdName','Date'])

Unnamed: 0,ProdName,Date,Seasonality,Week
0,Basil Extra Virgin Olive Oil,2024-04-01,1,327
1,Basil Extra Virgin Olive Oil,2024-04-02,2,327
2,Basil Extra Virgin Olive Oil,2024-04-03,3,327
3,Basil Extra Virgin Olive Oil,2024-04-04,4,327
4,Basil Extra Virgin Olive Oil,2024-04-05,5,327
...,...,...,...,...
415,Vanilla and Olive Oil Candle,2024-04-26,5,330
416,Vanilla and Olive Oil Candle,2024-04-27,6,330
417,Vanilla and Olive Oil Candle,2024-04-28,7,330
418,Vanilla and Olive Oil Candle,2024-04-29,1,331


In [13]:
X_forecast = pd.concat([X_forecast, sampled_data], axis=1)

In [14]:
one_hot_encoded = pd.get_dummies(X_forecast['Seasonality'], prefix='Season').iloc[:, :-1]

X_data = pd.concat([X_forecast, one_hot_encoded], axis=1)

In [15]:
for product in list(X_data['ProdName'].unique()):
    ts_data=X_data[X_data['ProdName']==product].reset_index(drop=True)
    ts_data.sort_values(by='Date',inplace=True)
    X=ts_data.drop(['ProdName','Date','Seasonality'],axis=1)
    model = results_df[results_df['ProdName']==product]['Model'].iloc[0]
    
    if results_df[results_df['ProdName']==product]['ModelType'].iloc[0]==0:
        X_forecast.loc[X_forecast['ProdName'] == product, 'Prediction'] = model.predict(X)
    elif results_df[results_df['ProdName']==product]['ModelType'].iloc[0]==1:
        X_forecast.loc[X_forecast['ProdName'] == product, 'Prediction'] = model.predict(X.applymap(lambda x: math.log1p(x)))
    elif results_df[results_df['ProdName']==product]['ModelType'].iloc[0]==2:
        X_forecast.loc[X_forecast['ProdName'] == product, 'Prediction'] = np.expm1(model.predict(X))
    else:        
        X_forecast.loc[X_forecast['ProdName'] == product, 'Prediction'] = np.expm1(model.predict(X.applymap(lambda x: math.log1p(x))))
        
# X_forecast['Prediction'] = X_forecast['Prediction'].round().astype(int)

In [16]:
X_forecast.merge(results_df[['ProdName','Std_Dev']],how='inner',on='ProdName').to_csv('forecast_data.csv')

In [17]:
results_df[results_df['ProdName']=='Basil Extra Virgin Olive Oil']['Model'].iloc[0].coef_

array([ 0.02987332, -0.32766966, -4.82134105, -3.96138499, -3.13249641,
       -2.5771183 , -1.20923494,  1.90462413])

In [18]:
results_df[results_df['ProdName']=='Basil Extra Virgin Olive Oil']['Model'].iloc[0].intercept_

14.28093095975656