# Model Training and Selection

This notebook further processes the data to match the specific requirements of different models. Various models are explored and evaluated, and feature engineering is also carried out on the clean dataset.

In [1]:
# Import the necessary libraries
import warnings
warnings.filterwarnings('ignore')
import random

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gr
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error
from tabulate import tabulate

In [2]:
# Load the cleaned dataset, index it by date at daily frequency, and keep only
# the target column as a one-column DataFrame.
df = (
    pd.read_csv("processed.csv", parse_dates=['date'], index_col='date')
    .asfreq('D')[["quantity"]]
)
df.head()

In [3]:
# Report the last and first timestamps covered by the daily series.
for _label, _value in (("Merged max :", df.index.max()), ("Merged min :", df.index.min())):
    print(_label, _value)

## Model 1 : SARIMAX

In [3]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

## Model 1 : SARIMAX
# Chronological 80/20 split: the first 80% of rows train the model and the
# remaining rows are held out. `train` and `test` are reused by later cells.
train_size = int(len(df) * 0.8)
train, test = df.iloc[:train_size], df.iloc[train_size:]

# Define and fit the SARIMAX model: ARIMA(1, 1, 1) with a (1, 1, 1) seasonal part.
# NOTE(review): the series is at daily frequency, so a seasonal period of 12
# means a 12-day cycle — confirm this is intended (7 would model weekly seasonality).
model = SARIMAX(train['quantity'], order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))
fit_model = model.fit(disp=False)

# Forecast one step for every held-out observation.
n_forecast = len(test)
forecast = fit_model.get_forecast(steps=n_forecast)
forecast_index = test.index
forecast_values = forecast.predicted_mean

# Score the forecast against the held-out values (MAE and MAPE).
mae = mean_absolute_error(test['quantity'], forecast_values)
mape = mean_absolute_percentage_error(test['quantity'], forecast_values)

# Print the results
print(f'MAE: {mae}')
print(f'MAPE: {mape}')

# Plot the training history, the held-out actuals and the forecast on one axis.
plt.figure(figsize=(12, 6))
plt.plot(train.index, train['quantity'], label='Train')
plt.plot(test.index, test['quantity'], label='Test', color='green')
plt.plot(forecast_index, forecast_values, label='Forecast', color='red')
plt.xlabel('Date')
plt.ylabel('Quantity')
plt.title('SARIMAX Forecast')
plt.legend()
plt.show()

## Model 2 and 3 : ARIMA & SES

In [5]:
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.holtwinters import SimpleExpSmoothing


# ARIMA Model
def fit_arima(train_data, order):
    """Fit an ARIMA model of the given order on ``train_data``.

    Args:
        train_data: Training series handed to ``ARIMA``.
        order: The ``(p, d, q)`` order passed through as ``order=``.

    Returns:
        The fitted results object returned by ``ARIMA(...).fit()``.
    """
    return ARIMA(train_data, order=order).fit()

# Exponential Smoothing (Simple Exponential Smoothing)
def fit_exponential_smoothing(train_data):
    """Fit simple exponential smoothing on ``train_data`` with default fit options.

    Args:
        train_data: Training series handed to ``SimpleExpSmoothing``.

    Returns:
        The fitted results object returned by ``SimpleExpSmoothing(...).fit()``.
    """
    return SimpleExpSmoothing(train_data).fit()

# Forecasting and Evaluation Function
def forecast_and_evaluate(model, test_data):
    """Forecast ``len(test_data)`` steps with a fitted model and score the result.

    Args:
        model: Fitted model exposing ``forecast(steps)``.
        test_data: Held-out actual values the forecast is compared with.

    Returns:
        Tuple ``(forecast_values, mae, mape)``.
    """
    horizon = len(test_data)
    predicted = model.forecast(horizon)
    abs_error = mean_absolute_error(test_data, predicted)
    pct_error = mean_absolute_percentage_error(test_data, predicted)
    return predicted, abs_error, pct_error

# Fit both baseline models on the training split.
arima_model = fit_arima(train['quantity'], order=(1, 1, 1))
exp_smoothing_model = fit_exponential_smoothing(train['quantity'])

# Score each fitted model on the held-out split.
arima_forecast, arima_mae, arima_mape = forecast_and_evaluate(arima_model, test['quantity'])
exp_smoothing_forecast, exp_smoothing_mae, exp_smoothing_mape = forecast_and_evaluate(exp_smoothing_model, test['quantity'])

print(f'ARIMA MAE: {arima_mae}, ARIMA MAPE: {arima_mape}')
print(f'Exponential Smoothing MAE: {exp_smoothing_mae}, Exponential Smoothing MAPE: {exp_smoothing_mape}')

# Training history first, then every series that lives on the test dates.
plt.figure(figsize=(12, 6))
plt.plot(train.index, train['quantity'], label='Train')
for _values, _label, _color in (
    (test['quantity'], 'Test', 'green'),
    (arima_forecast, 'ARIMA Forecast', 'blue'),
    (exp_smoothing_forecast, 'Exponential Smoothing Forecast', 'red'),
):
    plt.plot(test.index, _values, label=_label, color=_color)

plt.xlabel('Date')
plt.ylabel('Quantity')
plt.title('ARIMA vs Exponential Smoothing Forecast Comparison')
plt.legend()
plt.show();

In [15]:



# Fine-tuning SARIMAX model
def fine_tune_sarimax(train_data, test_data=None, seasonal_period=12):
    """Grid-search SARIMAX orders and return the combination with the lowest MAE.

    The search covers p, q in {0, 1, 2}, d in {0, 1} and seasonal P, D, Q in
    {0, 1}. Each candidate is fitted on ``train_data`` and scored on
    ``test_data``; candidates whose fit or forecast raises are skipped.

    Note: candidates are ranked on the evaluation data itself, so the returned
    MAE/MAPE are optimistic estimates of out-of-sample error.

    Args:
        train_data: Series used to fit every candidate model.
        test_data: Actual values to score against. Defaults to the
            notebook-level ``test['quantity']`` when omitted.
        seasonal_period: Length of the seasonal cycle (default 12).

    Returns:
        Tuple ``(best_order, best_seasonal_order, best_mae, best_mape)``;
        the orders are ``None`` if no candidate could be evaluated.
    """
    from itertools import product

    if test_data is None:
        # Backward-compatible default: the hold-out split defined by the notebook.
        test_data = test['quantity']

    best_mae = float('inf')
    best_mape = float('inf')
    best_order = None
    best_seasonal_order = None

    # product() varies the last factor fastest, i.e. the same visiting order
    # as the equivalent nested loops p -> d -> q -> P -> D -> Q.
    grid = product(range(3), range(2), range(3), range(2), range(2), range(2))
    for p, d, q, P, D, Q in grid:
        order = (p, d, q)
        seasonal_order = (P, D, Q, seasonal_period)
        try:
            fit_model = SARIMAX(train_data, order=order, seasonal_order=seasonal_order).fit(disp=False)
            forecast_values = fit_model.forecast(len(test_data))
            mae = mean_absolute_error(test_data, forecast_values)
            mape = mean_absolute_percentage_error(test_data, forecast_values)
        except Exception:
            # Skip combinations that fail to fit or score; KeyboardInterrupt and
            # SystemExit still propagate so the search can be interrupted.
            continue

        # Strict '<' keeps the first-visited candidate on ties.
        if mae < best_mae:
            best_mae = mae
            best_mape = mape
            best_order = order
            best_seasonal_order = seasonal_order
    return best_order, best_seasonal_order, best_mae, best_mape

# Fine-tuning ARIMA model
def fine_tune_arima(train_data, test_data=None):
    """Grid-search ARIMA ``(p, d, q)`` orders and return the one with the lowest MAE.

    The search covers p, q in {0, 1, 2} and d in {0, 1}. Each candidate is
    fitted on ``train_data`` and scored on ``test_data``; candidates whose fit
    or forecast raises are skipped.

    Note: candidates are ranked on the evaluation data itself, so the returned
    MAE/MAPE are optimistic estimates of out-of-sample error.

    Args:
        train_data: Series used to fit every candidate model.
        test_data: Actual values to score against. Defaults to the
            notebook-level ``test['quantity']`` when omitted.

    Returns:
        Tuple ``(best_order, best_mae, best_mape)``; ``best_order`` is ``None``
        if no candidate could be evaluated.
    """
    from itertools import product

    if test_data is None:
        # Backward-compatible default: the hold-out split defined by the notebook.
        test_data = test['quantity']

    best_mae = float('inf')
    best_mape = float('inf')
    best_order = None

    # Same visiting order as nested loops p -> d -> q.
    for order in product(range(3), range(2), range(3)):
        try:
            fit_model = ARIMA(train_data, order=order).fit()
            forecast_values = fit_model.forecast(len(test_data))
            mae = mean_absolute_error(test_data, forecast_values)
            mape = mean_absolute_percentage_error(test_data, forecast_values)
        except Exception:
            # Skip orders that fail to fit or score; KeyboardInterrupt and
            # SystemExit still propagate so the search can be interrupted.
            continue

        # Strict '<' keeps the first-visited candidate on ties.
        if mae < best_mae:
            best_mae = mae
            best_mape = mape
            best_order = order
    return best_order, best_mae, best_mape

# Fine-tuning Exponential Smoothing model
def fine_tune_exponential_smoothing(train_data, test_data=None):
    """Search smoothing levels for simple exponential smoothing by lowest MAE.

    Tries the values produced by ``np.arange(0.1, 1.1, 0.1)``. Each candidate
    is fitted on ``train_data`` with that ``smoothing_level`` and scored on
    ``test_data``; candidates whose fit or forecast raises are skipped.

    Note: candidates are ranked on the evaluation data itself, so the returned
    MAE/MAPE are optimistic estimates of out-of-sample error.

    Args:
        train_data: Series used to fit every candidate model.
        test_data: Actual values to score against. Defaults to the
            notebook-level ``test['quantity']`` when omitted.

    Returns:
        Tuple ``(best_alpha, best_mae, best_mape)``; ``best_alpha`` is ``None``
        if no candidate could be evaluated.
    """
    if test_data is None:
        # Backward-compatible default: the hold-out split defined by the notebook.
        test_data = test['quantity']

    best_mae = float('inf')
    best_mape = float('inf')
    best_alpha = None

    for alpha in np.arange(0.1, 1.1, 0.1):
        try:
            fit_model = SimpleExpSmoothing(train_data).fit(smoothing_level=alpha)
            forecast_values = fit_model.forecast(len(test_data))
            mae = mean_absolute_error(test_data, forecast_values)
            mape = mean_absolute_percentage_error(test_data, forecast_values)
        except Exception:
            # Skip levels that fail to fit or score; KeyboardInterrupt and
            # SystemExit still propagate so the search can be interrupted.
            continue

        # Strict '<' keeps the first-visited candidate on ties.
        if mae < best_mae:
            best_mae = mae
            best_mape = mape
            best_alpha = alpha
    return best_alpha, best_mae, best_mape

# Run the three grid searches on the training split. Note that this rebinds
# arima_mae/arima_mape and exp_smoothing_mae/exp_smoothing_mape from the
# baseline cell to the tuned scores.
sarimax_order, sarimax_seasonal_order, sarimax_mae, sarimax_mape = fine_tune_sarimax(train['quantity'])
arima_order, arima_mae, arima_mape = fine_tune_arima(train['quantity'])
exp_smoothing_alpha, exp_smoothing_mae, exp_smoothing_mape = fine_tune_exponential_smoothing(train['quantity'])

# One row per model, with columns in the order given by `headers`.
# Exponential smoothing has no ARIMA order, so its tuned alpha goes in the
# "Seasonal Order/Smoothing Level" column rather than under "Order".
results = [
    ['SARIMAX', sarimax_order, sarimax_seasonal_order, sarimax_mae, sarimax_mape],
    ['ARIMA', arima_order, None, arima_mae, arima_mape],
    ['Exponential Smoothing', None, exp_smoothing_alpha, exp_smoothing_mae, exp_smoothing_mape]
]

headers = ['Model', 'Order', 'Seasonal Order/Smoothing Level', 'MAE', 'MAPE']
print(tabulate(results, headers=headers))

In [6]:
# Build the long-format frame with the unique_id / ds / y columns used by the
# forecasting libraries below: the target becomes `y`, every row gets the same
# series id, and `ds` mirrors the date index.
data = (
    df.rename(columns={"quantity": "y"})
    .assign(unique_id=1.0, ds=lambda frame: frame.index)
)
data.head()

In [7]:
# Using Nixtla libraries
# Chronological 80/20 split of the long-format frame: the first 80% of rows
# train the models and the remaining rows form the hold-out set.
split_index = int(0.8 * len(data))
Y_train_df, Y_test_df = data.iloc[:split_index], data.iloc[split_index:]

# Forecast horizon = number of held-out rows.
horizon = len(Y_test_df)

## Model 4 : Prophet


In [None]:
import itertools
from sklearn.metrics import mean_absolute_percentage_error
import pandas as pd
from prophet import Prophet
from prophet.diagnostics import cross_validation, performance_metrics
from prophet.plot import plot_cross_validation_metric
import holidays

# Add is_public_holiday column
# Flag UK public holidays; the 0/1 column is supplied to Prophet as an extra regressor.
holiday = holidays.CountryHoliday('UK')
data['is_public_holiday'] = data['ds'].apply(
    lambda date: 1 if date in holiday else 0
)

# Hyperparameter search space (3 x 3 x 3 x 2 = 54 candidates).
param_grid = {
    'changepoint_prior_scale': [0.01, 0.1, 0.5],
    'seasonality_prior_scale': [0.1, 1.0, 10.0],
    'holidays_prior_scale': [0.1, 1.0, 10.0],
    'seasonality_mode': ['additive', 'multiplicative']
}

# Cartesian product of the grid: one dict of Prophet keyword arguments per candidate.
all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]

best_params = None
best_mape = float('inf')

# Grid search: keep the candidate with the lowest cross-validated MAPE.
for params in all_params:
    model = Prophet(**params)
    model.add_regressor('is_public_holiday')
    model.fit(data)

    # Cross-validate with a 547-day initial window, a 180-day period and a 30-day horizon.
    df_cv = cross_validation(model, initial='547 days', period='180 days', horizon='30 days')

    # Only MAPE drives the selection, so the full performance_metrics table is
    # not computed per candidate (it was previously computed and discarded).
    mape = mean_absolute_percentage_error(df_cv['y'], df_cv['yhat'])

    if mape < best_mape:
        best_mape = mape
        best_params = params

# Output the best parameters and MAPE
print(f"Best Parameters: {best_params}")
print(f"Best MAPE: {round(best_mape, 2)}")

# Refit on the full dataset with the winning parameters.
model = Prophet(**best_params)
model.add_regressor('is_public_holiday')
model.fit(data)

# Forecast 365 days past the end of the data; the regressor column must also
# be filled in for the future dates.
future = model.make_future_dataframe(periods=365)
future['is_public_holiday'] = future['ds'].apply(
    lambda date: 1 if date in holiday else 0
)

forecast = model.predict(future)

# Visualize the forecast and its components.
model.plot(forecast)
model.plot_components(forecast)

# Evaluate the final model with the same cross-validation settings.
df_cv = cross_validation(model, initial='547 days', period='180 days', horizon='30 days')
df_p = performance_metrics(df_cv)
print(df_p.head().round(2))

fig = plot_cross_validation_metric(df_cv, metric='rmse')

# Calculate MAPE using yhat and y
mape = mean_absolute_percentage_error(df_cv['y'], df_cv['yhat'])
print(f"MAPE: {round(mape, 2)}")


Increase the forecast horizon to 60 days.

In [8]:
import itertools
from sklearn.metrics import mean_absolute_percentage_error
import pandas as pd
from prophet import Prophet
from prophet.diagnostics import cross_validation, performance_metrics
from prophet.plot import plot_cross_validation_metric
import holidays

# Add is_public_holiday column
# Flag UK public holidays; the 0/1 column is supplied to Prophet as an extra regressor.
holiday = holidays.CountryHoliday('UK')
data['is_public_holiday'] = data['ds'].apply(
    lambda date: 1 if date in holiday else 0
)

# Hyperparameter search space (3 x 3 x 3 x 2 = 54 candidates).
param_grid = {
    'changepoint_prior_scale': [0.01, 0.1, 0.5],
    'seasonality_prior_scale': [0.1, 1.0, 10.0],
    'holidays_prior_scale': [0.1, 1.0, 10.0],
    'seasonality_mode': ['additive', 'multiplicative']
}

# Cartesian product of the grid: one dict of Prophet keyword arguments per candidate.
all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]

best_params = None
best_mape = float('inf')

# Grid search: keep the candidate with the lowest cross-validated MAPE.
# NOTE(review): candidates are still selected on a 30-day horizon while the
# final evaluation below uses 60 days — confirm this mismatch is intended.
for params in all_params:
    model = Prophet(**params)
    model.add_regressor('is_public_holiday')
    model.fit(data)

    df_cv = cross_validation(model, initial='547 days', period='180 days', horizon='30 days')

    # Only MAPE drives the selection, so the full performance_metrics table is
    # not computed per candidate (it was previously computed and discarded).
    mape = mean_absolute_percentage_error(df_cv['y'], df_cv['yhat'])

    if mape < best_mape:
        best_mape = mape
        best_params = params

# Output the best parameters and MAPE
print(f"Best Parameters: {best_params}")
print(f"Best MAPE: {round(best_mape, 2)}")

# Refit on the full dataset with the winning parameters.
model = Prophet(**best_params)
model.add_regressor('is_public_holiday')
model.fit(data)

# Forecast 365 days past the end of the data; the regressor column must also
# be filled in for the future dates.
future = model.make_future_dataframe(periods=365)
future['is_public_holiday'] = future['ds'].apply(
    lambda date: 1 if date in holiday else 0
)

forecast = model.predict(future)

# Visualize the forecast and its components.
model.plot(forecast)
model.plot_components(forecast)

# Evaluate the final model on the longer 60-day horizon.
df_cv = cross_validation(model, initial='547 days', period='180 days', horizon='60 days')
df_p = performance_metrics(df_cv)
print(df_p.head().round(2))

fig = plot_cross_validation_metric(df_cv, metric='rmse')

# Calculate MAPE using yhat and y
mape = mean_absolute_percentage_error(df_cv['y'], df_cv['yhat'])
print(f"MAPE: {round(mape, 2)}")


In [None]:
# Refit Prophet with a fixed set of hyperparameters and evaluate on a 90-day horizon.
# Relies on `holidays`, `Prophet`, `cross_validation`, `performance_metrics`,
# `plot_cross_validation_metric` and `mean_absolute_percentage_error` being
# imported by an earlier cell.

# 1. Flag UK public holidays as a 0/1 column used as an extra regressor.
holiday = holidays.CountryHoliday('UK')
data['is_public_holiday'] = data['ds'].apply(
    lambda date: 1 if date in holiday else 0
)

# 2. Create and fit the Prophet model with hard-coded parameters.
# NOTE(review): presumably copied from a grid-search run above — confirm they
# still match the latest search output.
best_params = {'changepoint_prior_scale': 0.01, 'seasonality_prior_scale': 10.0, 'holidays_prior_scale': 0.1, 'seasonality_mode': 'additive'}

model = Prophet(**best_params)
model.add_regressor('is_public_holiday')
model.fit(data)

# 3. Forecast 365 days past the end of the data; the regressor column must
#    also be filled in for the future dates.
future = model.make_future_dataframe(periods=365)
future['is_public_holiday'] = future['ds'].apply(
    lambda date: 1 if date in holiday else 0
)

forecast = model.predict(future)

# 4. Visualize the forecast and its components.
model.plot(forecast)
model.plot_components(forecast)

# 5. Cross-validate with a 547-day initial window, a 180-day period and a 90-day horizon.
df_cv = cross_validation(model, initial='547 days', period='180 days', horizon='90 days')
df_p = performance_metrics(df_cv)
print(df_p.head().round(2))

fig = plot_cross_validation_metric(df_cv, metric='rmse')

# MAPE over all cross-validation predictions (yhat) versus actuals (y).
mape = mean_absolute_percentage_error(df_cv['y'], df_cv['yhat'])
print(f"MAPE: {round(mape, 2)}")

# 6. Plot actuals against the model output; `forecast` covers both the
#    historical dates and the 365 future dates.
plt.figure(figsize=(10, 6))
plt.plot(data['ds'], data['y'], label='Actual')
plt.plot(forecast['ds'], forecast['yhat'], label='Predicted')
plt.xlabel('Date')
plt.ylabel('Quantity Winsorized')
plt.legend()
plt.title('Actual vs Predicted Values')
plt.show()


In [22]:
from ray import tune
from neuralforecast import NeuralForecast
from neuralforecast.auto import AutoNHITS, AutoLSTM
from neuralforecast.losses.pytorch import MQLoss
from datasetsforecast.losses import mape
from datasetsforecast.evaluation import accuracy
from statsforecast import StatsForecast


# Search space for AutoNHITS. Entries wrapped in tune.* are sampled per trial;
# plain values are fixed for every trial.
config_nhits = {
    "input_size": tune.choice([48, 48*2, 48*3]),
    "start_padding_enabled": True,
    "n_blocks": 5 * [1],
    "mlp_units": 5 * [[64, 64]],
    "n_pool_kernel_size": tune.choice([5 * [1], 5 * [2], 5 * [4], [8, 4, 2, 1, 1]]),
    "n_freq_downsample": tune.choice([[8, 4, 2, 1, 1], [1, 1, 1, 1, 1]]),
    "learning_rate": tune.loguniform(1e-4, 1e-2),
    "scaler_type": tune.choice([None]),
    "max_steps": tune.choice([1000]),
    "batch_size": tune.choice([1, 4, 10]),
    "windows_batch_size": tune.choice([128, 256, 512]),
    "random_seed": tune.randint(1, 20),
}

# Search space for AutoLSTM.
config_lstm = {
    "input_size": tune.choice([48, 48 * 2, 48 * 3]),
    "encoder_hidden_size": tune.choice([64, 128]),
    "encoder_n_layers": tune.choice([2, 4]),
    "learning_rate": tune.loguniform(1e-4, 1e-2),
    "scaler_type": tune.choice(['robust']),
    "max_steps": tune.choice([500, 1000]),
    "batch_size": tune.choice([1, 4]),
    "random_seed": tune.randint(1, 20),
}

# Both auto models forecast 30 steps at daily frequency and train with MQLoss.
nf = NeuralForecast(
    models=[
        AutoNHITS(h=30, config=config_nhits, loss=MQLoss(), num_samples=5),
        AutoLSTM(h=30, config=config_lstm, loss=MQLoss(), num_samples=2),
    ],
    freq='D'
)

# Fit on the full long-format frame (no hold-out at this step).
nf.fit(df=data)

# Predict the next h steps. Stripping the '-median' suffix leaves the median
# column named after the model itself.
fcst_df = nf.predict()
fcst_df.columns = fcst_df.columns.str.replace('-median', '')
print(fcst_df.head())

# Cross-validate over 2 windows and apply the same column renaming.
cv_df = nf.cross_validation(data, n_windows=2)
cv_df.columns = cv_df.columns.str.replace('-median', '')

# Score with MAPE per series. `mape` here is the datasetsforecast loss
# function; later cells rebind the name `mape` to a float score.
evaluation_df = accuracy(cv_df, [mape], agg_by=['unique_id'])
# For each row, the best model is the column with the smallest value.
evaluation_df['best_model'] = evaluation_df.drop(columns=['metric', 'unique_id']).idxmin(axis=1)
print(evaluation_df)

# Keep the MAPE rows and take the first best-model entry per series.
best_model_df = evaluation_df.query('metric == "mape"')
best_model = best_model_df.groupby('unique_id')['best_model'].first().reset_index()
print(best_model)

# Function to get best model forecast
def get_best_model_forecast(forecasts_df, evaluation_df, metric):
    """Keep only the best model's forecast columns, relabelled as 'best_model'.

    Args:
        forecasts_df: Wide forecast frame with a 'ds' column and one column per
            model output. Assumes it is indexed by unique_id — TODO confirm
            against the caller.
        evaluation_df: Frame with 'metric', 'unique_id' and 'best_model' columns.
        metric: Value of the 'metric' column whose 'best_model' entries are used.

    Returns:
        Wide frame with 'ds' as a column, where the best model's name in each
        column label has been replaced by 'best_model'.
    """
    # Wide to long: one row per (unique_id, ds, column name).
    df = forecasts_df.set_index('ds', append=True).stack().to_frame().reset_index(level=2) # Wide to long 
    df.columns = ['model', 'best_model_forecast'] 
    # Attach each series' best model for the requested metric.
    df = df.join(evaluation_df.query('metric == @metric').set_index('unique_id')[['best_model']])
    # Keep rows whose model name, minus any '-lo-90' / '-hi-90' suffix, equals the best model.
    df = df.query('model.str.replace("-lo-90|-hi-90", "", regex=True) == best_model').copy()
    # Swap the model's own name for the generic 'best_model' label (suffixes are kept).
    df.loc[:, 'model'] = [model.replace(bm, 'best_model') for model, bm in zip(df['model'], df['best_model'])]
    # Long back to wide: one column per relabelled model output.
    df = df.drop(columns='best_model').set_index('model', append=True).unstack()
    df.columns = df.columns.droplevel()
    # Move 'ds' out of the index and back into a column.
    df = df.reset_index(level=1)
    return df

# Reduce the forecast frame to the model that scored best on MAPE.
prod_forecasts_df = get_best_model_forecast(fcst_df, evaluation_df, metric='mape')
print(prod_forecasts_df)

# Plot the history together with the selected forecast using the matplotlib engine.
StatsForecast.plot(data, prod_forecasts_df, engine='matplotlib')

## Models 5, 6 & 7: MLForecast

In [9]:
import pandas as pd
import numpy as np
from mlforecast import MLForecast
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.metrics import mean_absolute_percentage_error
import matplotlib.pyplot as plt

# Function to create date features
def create_date_features(data):
    """Add calendar feature columns derived from the datetime index.

    The columns are written onto ``data`` itself (the frame is modified in
    place) and the same object is returned.

    Args:
        data: DataFrame whose index exposes datetime attributes such as
            ``month``, ``day`` and ``dayofweek``.

    Returns:
        ``data`` with month/day/week/year fields, month-start/end and weekend
        flags, four season indicators, and a sine/cosine encoding of the
        day of the week.
    """
    stamps = data.index
    weekday = stamps.dayofweek  # Monday = 0 ... Sunday = 6

    data["month"] = stamps.month
    data["day_of_month"] = stamps.day
    data["is_month_start"] = stamps.is_month_start.astype(int)
    data["is_month_end"] = stamps.is_month_end.astype(int)
    data["day_of_year"] = stamps.dayofyear
    data["week_of_year"] = stamps.isocalendar().week
    data["day_of_week"] = weekday + 1  # shifted to 1..7
    data["year"] = stamps.year
    data["is_weekend"] = (weekday >= 5).astype(int)

    # Season indicators keyed by the months they cover.
    season_months = {
        'is_spring': [3, 4, 5],
        'is_summer': [6, 7, 8],
        'is_fall': [9, 10, 11],
        'is_winter': [12, 1, 2],
    }
    for column, months in season_months.items():
        data[column] = data['month'].isin(months).astype(int)

    # Cyclical encoding of the weekday on a 7-day circle.
    data['sin_day'] = np.sin(2 * np.pi * weekday / 7)
    data['cos_day'] = np.cos(2 * np.pi * weekday / 7)
    return data

# Work on a copy so the calendar columns are not added to `data` itself.
ml_data = create_date_features(data.copy())

# Candidate regressors, keyed by the column name each gets in the predictions.
models = {
    'RandomForest': RandomForestRegressor(n_estimators=100),
    'XGBoost': XGBRegressor(objective='reg:squarederror', n_estimators=100),
    'LightGBM': lgb.LGBMRegressor(n_estimators=100)
}

# Names of the calendar columns added by create_date_features.
# NOTE(review): this list is not passed to MLForecast. The columns reach the
# models only because they are present in ml_data, so their treatment depends
# on how MLForecast.fit handles extra columns (see its `static_features`
# argument) — confirm they are used as time-varying features.
date_features = ['month', 'day_of_month', 'is_month_start', 'is_month_end',
                 'day_of_year', 'week_of_year', 'day_of_week', 'year',
                 'is_weekend', 'is_spring', 'is_summer', 'is_fall', 'is_winter',
                 'sin_day', 'cos_day']

# Forecasting pipeline at daily frequency.
forecast = MLForecast(
    models=models,
    freq='D',
    lags=[7, 14, 30],  # target values from 7, 14 and 30 days earlier
)

# Hold out the last 30 days for evaluation and fit on everything before them.
# (The earlier fit on the full dataset was dropped: it was immediately
# overwritten by this fit and its result was never used.)
train_df = ml_data[:-30]
test_df = ml_data[-30:]
forecast.fit(train_df, id_col='unique_id', time_col='ds', target_col='y')

# Generate predictions for the 30 held-out days.
predictions = forecast.predict(30)

# Mean Absolute Percentage Error (MAPE) for each model on the hold-out.
mape_scores = {
    model_name: mean_absolute_percentage_error(test_df['y'], predictions[model_name])
    for model_name in models.keys()
}

for model_name, mape_score in mape_scores.items():
    print(f"{model_name} MAPE: {mape_score}")

# Select the model with the lowest hold-out MAPE.
best_model_name = min(mape_scores, key=mape_scores.get)
print(f"The best model is {best_model_name} with a MAPE of {mape_scores[best_model_name]}")

# Retrain only the best model on the entire dataset.
forecast.models = {best_model_name: models[best_model_name]}
forecast.fit(ml_data , id_col='unique_id', time_col='ds', target_col='y')

# Predict the 30 days following the end of the data.
future_predictions = forecast.predict(30)

# Plot hold-out actuals against every model's hold-out predictions.
plt.figure(figsize=(14, 7))
plt.plot(test_df['ds'], test_df['y'], label='Actual', marker='o')
for model_name in models.keys():
    plt.plot(test_df['ds'], predictions[model_name], label=f'Predicted - {model_name}', marker='x')
plt.xlabel('Date')
plt.ylabel('Value')
plt.title('Actual vs Predicted Values')
plt.legend()
plt.grid(True)
plt.show()


## NeuralForecast

In [10]:
import pandas as pd
import numpy as np
from neuralforecast import NeuralForecast
from neuralforecast.models import NHITS, NBEATS, LSTM
from sklearn.metrics import mean_absolute_percentage_error
import matplotlib.pyplot as plt

# Assuming you have defined train_df, test_df, and ml_data somewhere in your code

# Define the models and their respective parameters
# Constructor arguments for each model: a 30-step horizon (h), a 30-step
# input window and 50 training steps. The three models share the same settings.
nhits_params = {
    'h': 30,
    'input_size': 30,
    'max_steps': 50
}

nbeats_params = {
    'h': 30,
    'input_size': 30,
    'max_steps': 50
}

lstm_params = {
    'h': 30,
    'input_size': 30,
    'max_steps': 50
}

# Initialize the models
nhits_model = NHITS(**nhits_params)
nbeats_model = NBEATS(**nbeats_params)
lstm_model = LSTM(**lstm_params)

# Train each model on the training data and evaluate on test data.
# Note: `models` and `mape_scores` rebind the names used by the MLForecast cell.
models = [nhits_model, nbeats_model, lstm_model]
model_names = ['NHITS', 'NBEATS', 'LSTM']
mape_scores = []

for model, name in zip(models, model_names):
    nf = NeuralForecast(models=[model], freq='D')
    nf.fit(train_df)
    # NOTE(review): no exogenous features are declared on the models, so
    # futr_df presumably has no effect here — confirm.
    forecasts = nf.predict(futr_df=test_df)
    print(forecasts.head())
    # The forecast column is looked up by the model's name.
    mape = mean_absolute_percentage_error(test_df['y'], forecasts[name])
    mape_scores.append((name, mape))

# Select the best model based on MAPE
best_model_name, best_mape = min(mape_scores, key=lambda x: x[1])
best_model = None

# Build a fresh instance of the winning model so it can be refit.
if best_model_name == 'NHITS':
    best_model = NHITS(**nhits_params)
elif best_model_name == 'NBEATS':
    best_model = NBEATS(**nbeats_params)
elif best_model_name == 'LSTM':
    best_model = LSTM(**lstm_params)

# Train the best model on the entire dataset
nf_best_model = NeuralForecast(models=[best_model], freq='D')
nf_best_model.fit(ml_data)

# Predict the next 30 days with the best model
final_forecasts = nf_best_model.predict()

# Plot the hold-out actuals followed by the out-of-sample forecast. The two
# series cover consecutive, non-overlapping date ranges.
plt.figure(figsize=(14, 7))

# Plot the test data
plt.plot(test_df['ds'], test_df['y'], label='Actual', marker='o')

# Forecast dates start the day after the last test date.
forecast_dates = pd.date_range(test_df['ds'].max() + pd.to_timedelta('1 days'), periods=30, freq='D')
plt.plot(
    forecast_dates,
    final_forecasts[best_model_name][:30],  # Ensure we take only 30 predictions
    label=f'Predicted - {best_model_name}',
    marker='x',
)

plt.xlabel('Date')
plt.ylabel('Value')
plt.title(f'Actual vs Predicted Values using {best_model_name}')
plt.legend()
plt.grid(True)
plt.show()

print(f"The best model is {best_model_name} with a MAPE of {best_mape}")


In [11]:
import pandas as pd
import numpy as np
from neuralforecast import NeuralForecast
from neuralforecast.models import NHITS, NBEATS, LSTM
from sklearn.metrics import mean_absolute_percentage_error
import matplotlib.pyplot as plt

# Assuming you have defined train_df, test_df, and ml_data somewhere in your code

# Define the models and their respective parameters
# Constructor arguments for each model: a 30-step horizon (h), a 30-step
# input window and 50 training steps. The three models share the same settings.
nhits_params = {
    'h': 30,
    'input_size': 30,
    'max_steps': 50
}

nbeats_params = {
    'h': 30,
    'input_size': 30,
    'max_steps': 50
}

lstm_params = {
    'h': 30,
    'input_size': 30,
    'max_steps': 50
}

# Initialize the models
nhits_model = NHITS(**nhits_params)
nbeats_model = NBEATS(**nbeats_params)
lstm_model = LSTM(**lstm_params)

# Train each model on the training data and evaluate on test data
models = [nhits_model, nbeats_model, lstm_model]
model_names = ['NHITS', 'NBEATS', 'LSTM']
mape_scores = []

for model, name in zip(models, model_names):
    nf = NeuralForecast(models=[model], freq='D')
    nf.fit(train_df)
    forecasts = nf.predict(futr_df=test_df)
    print(forecasts.head())
    # The forecast column is looked up by the model's name.
    mape = mean_absolute_percentage_error(test_df['y'], forecasts[name])
    mape_scores.append((name, mape))

# Select the best model based on MAPE
best_model_name, best_mape = min(mape_scores, key=lambda x: x[1])
best_model = None

# Build a fresh instance of the winning model so it can be refit.
if best_model_name == 'NHITS':
    best_model = NHITS(**nhits_params)
elif best_model_name == 'NBEATS':
    best_model = NBEATS(**nbeats_params)
elif best_model_name == 'LSTM':
    best_model = LSTM(**lstm_params)

# Refit the best model on the training split only (not the entire dataset),
# so its predictions can be compared with the held-out actuals.
nf_best_model = NeuralForecast(models=[best_model], freq='D')
nf_best_model.fit(train_df)

# Predict the hold-out period; only these dates get predictions, not the
# whole history.
historical_forecasts = nf_best_model.predict(futr_df=test_df)
historical_forecasts.index = test_df['ds']  # Set the index to the dates for plotting

print(f"The best model is {best_model_name} with a MAPE of {best_mape}")

# Plot the full actual series with the hold-out predictions overlaid.
plt.figure(figsize=(14, 7))

# Plot the entire dataset
plt.plot(ml_data['ds'], ml_data['y'], label='Actual', marker='o')

# Plot the hold-out forecasts
plt.plot(
    historical_forecasts.index,
    historical_forecasts[best_model_name],  # Hold-out forecasts
    label=f'Predicted - {best_model_name}',
    marker='x',
)



plt.xlabel('Date')
plt.ylabel('Value')
plt.title(f'Actual vs Predicted Values using {best_model_name} for Historical Period')
plt.legend()
plt.grid(True)
plt.show()

# Same message as printed before the plot.
print(f"The best model is {best_model_name} with a MAPE of {best_mape}")


### Increase forecast horizon to 60

In [12]:
import pandas as pd
import numpy as np
from neuralforecast import NeuralForecast
from neuralforecast.models import NHITS, NBEATS, LSTM
from sklearn.metrics import mean_absolute_percentage_error
import matplotlib.pyplot as plt

# Assuming you have defined train_df, test_df, and ml_data somewhere in your code

# Define the models and their respective parameters
# Constructor arguments for each model: a 60-step horizon (h), a 30-step
# input window and 50 training steps. The three models share the same settings.
nhits_params = {
    'h': 60,
    'input_size': 30,
    'max_steps': 50
}

nbeats_params = {
    'h': 60,
    'input_size': 30,
    'max_steps': 50
}

lstm_params = {
    'h': 60,
    'input_size': 30,
    'max_steps': 50
}

# Initialize the models
nhits_model = NHITS(**nhits_params)
nbeats_model = NBEATS(**nbeats_params)
lstm_model = LSTM(**lstm_params)

# Train each model on the training data and evaluate on test data
models = [nhits_model, nbeats_model, lstm_model]
model_names = ['NHITS', 'NBEATS', 'LSTM']
mape_scores = []

for model, name in zip(models, model_names):
    nf = NeuralForecast(models=[model], freq='D')
    nf.fit(train_df)
    
    # NOTE(review): this frame is built but never used afterwards.
    future_test_df = nf.make_future_dataframe()
    
    # Make predictions for the h steps following the training data
    forecasts = nf.predict()
    print(forecasts.head())
    
    # Only the first len(test_df) forecast steps are scored; if test_df is
    # shorter than h=60, the remaining steps do not contribute to the MAPE.
    aligned_forecasts = forecasts[name].iloc[:len(test_df)]
    
    mape = mean_absolute_percentage_error(test_df['y'], aligned_forecasts)
    mape_scores.append((name, mape))

# Select the best model based on MAPE
best_model_name, best_mape = min(mape_scores, key=lambda x: x[1])
best_model = None

# Build a fresh instance of the winning model so it can be refit.
if best_model_name == 'NHITS':
    best_model = NHITS(**nhits_params)
elif best_model_name == 'NBEATS':
    best_model = NBEATS(**nbeats_params)
elif best_model_name == 'LSTM':
    best_model = LSTM(**lstm_params)
  
    
print(f"The best model is {best_model_name} with a MAPE of {best_mape}")    

# Train the best model on the entire dataset
nf_best_model = NeuralForecast(models=[best_model], freq='D')
nf_best_model.fit(ml_data)

# Create a future dataframe for the next 60 days
future_df = nf_best_model.make_future_dataframe()

# Predict the next 60 days
future_forecasts = nf_best_model.predict()

# Append the future rows to the history. After ignore_index=True, the rows
# from position ml_data.shape[0] onward are the future dates, and only those
# rows receive a forecast value.
combined_df = pd.concat([ml_data, future_df], ignore_index=True)
combined_df['forecast'] = np.nan
combined_df.loc[ml_data.shape[0]:, 'forecast'] = future_forecasts[best_model_name].values

# Plot actual values for the historical period and the future forecast
plt.figure(figsize=(14, 7))

# Plot the entire dataset
plt.plot(combined_df['ds'], combined_df['y'], label='Actual', marker='o')

# Plot the future forecasts
plt.plot(
    combined_df['ds'],
    combined_df['forecast'],
    label=f'Predicted - {best_model_name} (Next 60 Days)',
    marker='x',
)

plt.xlabel('Date')
plt.ylabel('Value')
plt.title(f'Actual vs Predicted Values using {best_model_name}')
plt.legend()
plt.grid(True)
plt.show()

# Same message as printed before the refit.
print(f"The best model is {best_model_name} with a MAPE of {best_mape}")


In [13]:
import pandas as pd
import numpy as np
from neuralforecast import NeuralForecast
from neuralforecast.models import NHITS, NBEATS, LSTM
from sklearn.metrics import mean_absolute_percentage_error
import matplotlib.pyplot as plt

# Manual hyper-parameter search over three NeuralForecast architectures.
# Relies on `train_df`, `test_df` and `ml_data` (frames with `ds` / `y` columns)
# defined in earlier cells.

# Candidate settings for each architecture (one dict per trial)
nhits_param_grid = [
    {'h': 60, 'input_size': 30, 'max_steps': 50, 'learning_rate': 0.001},
    {'h': 60, 'input_size': 60, 'max_steps': 100, 'learning_rate': 0.01},
    {'h': 60, 'input_size': 90, 'max_steps': 150, 'learning_rate': 0.1},
]

nbeats_param_grid = [
    {'h': 60, 'input_size': 30, 'max_steps': 50, 'learning_rate': 0.001},
    {'h': 60, 'input_size': 60, 'max_steps': 100, 'learning_rate': 0.01},
    {'h': 60, 'input_size': 90, 'max_steps': 150, 'learning_rate': 0.1},
]

lstm_param_grid = [
    {'h': 60, 'input_size': 30, 'max_steps': 50, 'learning_rate': 0.001, 'encoder_n_layers': 2, 'encoder_hidden_size': 200},
    {'h': 60, 'input_size': 60, 'max_steps': 100, 'learning_rate': 0.01, 'encoder_n_layers': 2, 'encoder_hidden_size': 200},
    {'h': 60, 'input_size': 90, 'max_steps': 150, 'learning_rate': 0.1, 'encoder_n_layers': 3, 'encoder_hidden_size': 300},
]

# Single place that maps a model label to its class, so the search loop and
# the final refit build models the same way. Models are only instantiated
# inside the loop; no throw-away default instances are created up front.
model_classes = {'NHITS': NHITS, 'NBEATS': NBEATS, 'LSTM': LSTM}
param_grids = [nhits_param_grid, nbeats_param_grid, lstm_param_grid]
model_names = ['NHITS', 'NBEATS', 'LSTM']
mape_scores = []

# Train and evaluate every (architecture, parameter set) combination
for name, param_grid in zip(model_names, param_grids):
    for params in param_grid:
        nf = NeuralForecast(models=[model_classes[name](**params)], freq='D')
        nf.fit(train_df)

        # Forecast the steps that follow the training window
        forecasts = nf.predict()
        print(forecasts.head())

        # Score on the overlapping prefix only: the forecast length is driven
        # by `h`, while test_df may be longer or shorter. Truncating both
        # sides keeps the metric inputs the same length in either case.
        n_eval = min(len(test_df), len(forecasts))
        actual_values = test_df['y'].iloc[:n_eval]
        aligned_forecasts = forecasts[name].iloc[:n_eval]

        mape = mean_absolute_percentage_error(actual_values, aligned_forecasts)
        mape_scores.append((name, mape, params))

# Select the best trial based on MAPE (lower is better)
best_model_name, best_mape, best_params = min(mape_scores, key=lambda x: x[1])
best_model = model_classes[best_model_name](**best_params)

print(f"The best model is {best_model_name} with a MAPE of {best_mape} and best parameters: {best_params}")

# Train the best model on the entire dataset
nf_best_model = NeuralForecast(models=[best_model], freq='D')
nf_best_model.fit(ml_data)

# Create a future dataframe for the next 60 days
future_df = nf_best_model.make_future_dataframe()

# Predict the next 60 days
future_forecasts = nf_best_model.predict()

# Combine the historical data with the forecast data for plotting.
# After ignore_index=True the future rows start at label ml_data.shape[0].
combined_df = pd.concat([ml_data, future_df], ignore_index=True)
combined_df['forecast'] = np.nan
combined_df.loc[ml_data.shape[0]:, 'forecast'] = future_forecasts[best_model_name].values

# Plot the observed history together with the future forecast
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(combined_df['ds'], combined_df['y'], label='Actual', marker='o')
ax.plot(
    combined_df['ds'],
    combined_df['forecast'],
    label=f'Predicted - {best_model_name} (Next 60 Days)',
    marker='x',
)
ax.set_xlabel('Date')
ax.set_ylabel('Value')
ax.set_title(f'Actual vs Predicted Values using {best_model_name}')
ax.legend()
ax.grid(True)
plt.show()

print(f"The best model is {best_model_name} with a MAPE of {best_mape} and best parameters: {best_params}")


## StatsForecast

In [14]:
import pandas as pd
import numpy as np
from statsforecast.models import AutoARIMA, AutoETS, AutoCES, AutoTheta, SimpleExponentialSmoothingOptimized
from sklearn.metrics import mean_absolute_percentage_error
import matplotlib.pyplot as plt

# Compare several StatsForecast models on the hold-out window, then refit the
# winner on all data. Uses `train_df`, `test_df` and `ml_data` (with `ds` / `y`
# columns) defined in earlier cells.

# Plain target arrays for training and scoring
train_series, test_series = train_df['y'].values, test_df['y'].values

# Candidate models keyed by display name.
# NOTE(review): season_length=12 is used while the series is treated as daily
# (freq='D') elsewhere in this notebook — confirm the intended seasonal period.
models = dict(
    AutoARIMA=AutoARIMA(season_length=12),
    AutoETS=AutoETS(model='ZZZ', season_length=12),
    AutoCES=AutoCES(model='Z', season_length=12),
    AutoTheta=AutoTheta(season_length=12),
    SESOpt=SimpleExponentialSmoothingOptimized(),
)

# Per-model hold-out results
mape_scores, forecasts_dict = [], {}

# Fit each candidate on the training window and score its point forecast
for label, candidate in models.items():
    point_forecast = candidate.fit(y=train_series).predict(h=len(test_series))['mean']
    forecasts_dict[label] = point_forecast
    mape_scores.append(
        (label, mean_absolute_percentage_error(test_series, point_forecast))
    )

# Lowest MAPE wins
best_model_name, best_mape = min(mape_scores, key=lambda entry: entry[1])

print(f"The best model is {best_model_name} with a MAPE of {best_mape}")

# Refit the winner on the full history and forecast the next 60 days
best_model = models[best_model_name].fit(y=ml_data['y'].values)
future_forecasts = best_model.predict(h=60)

# Append 60 empty future days to the history so both series share one frame
horizon_start = ml_data['ds'].max() + pd.Timedelta(days=1)
future_df = pd.DataFrame({
    'ds': pd.date_range(start=horizon_start, periods=60, freq='D'),
    'y': np.nan
})
combined_df = pd.concat([ml_data.copy(), future_df], ignore_index=True)
combined_df['forecast'] = np.nan
combined_df.loc[ml_data.shape[0]:, 'forecast'] = future_forecasts['mean']

# Observed history and future forecast on one set of axes
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(combined_df['ds'], combined_df['y'], label='Actual', marker='o')
ax.plot(
    combined_df['ds'],
    combined_df['forecast'],
    label=f'Predicted - {best_model_name} (Next 60 Days)',
    marker='x',
)
ax.set_xlabel('Date')
ax.set_ylabel('Value')
ax.set_title(f'Actual vs Predicted Values using {best_model_name}')
ax.legend()
ax.grid(True)
plt.show()

print(f"The best model is {best_model_name} with a MAPE of {best_mape}")


In [15]:
import pandas as pd
import numpy as np
from statsforecast.models import AutoARIMA, AutoETS, AutoCES, AutoTheta, SimpleExponentialSmoothingOptimized
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error
import matplotlib.pyplot as plt

# Same StatsForecast comparison as the previous cell, additionally reporting
# MAE per model. Uses `train_df`, `test_df` and `ml_data` (with `ds` / `y`
# columns) defined in earlier cells.

# Plain target arrays for training and scoring
train_series, test_series = train_df['y'].values, test_df['y'].values

# Candidate models keyed by display name.
# NOTE(review): season_length=12 is used while the series is treated as daily
# (freq='D') elsewhere in this notebook — confirm the intended seasonal period.
models = dict(
    AutoARIMA=AutoARIMA(season_length=12),
    AutoETS=AutoETS(model='ZZZ', season_length=12),
    AutoCES=AutoCES(model='Z', season_length=12),
    AutoTheta=AutoTheta(season_length=12),
    SESOpt=SimpleExponentialSmoothingOptimized(),
)

# Per-model hold-out results
mape_scores, mae_scores, forecasts_dict = [], [], {}

# Fit each candidate on the training window and score its point forecast
for label, candidate in models.items():
    point_forecast = candidate.fit(y=train_series).predict(h=len(test_series))['mean']
    forecasts_dict[label] = point_forecast

    mape = mean_absolute_percentage_error(test_series, point_forecast)
    mae = mean_absolute_error(test_series, point_forecast)
    mape_scores.append((label, mape))
    mae_scores.append((label, mae))

    print(f"{label} - MAE: {mae:.4f}, MAPE: {mape:.4%}")

# Lowest MAPE wins
best_model_name, best_mape = min(mape_scores, key=lambda entry: entry[1])

print(f"\nThe best model is {best_model_name} with a MAPE of {best_mape:.4%}")

# Refit the winner on the full history and forecast the next 60 days
best_model = models[best_model_name].fit(y=ml_data['y'].values)
future_forecasts = best_model.predict(h=60)

# Append 60 empty future days to the history so both series share one frame
horizon_start = ml_data['ds'].max() + pd.Timedelta(days=1)
future_df = pd.DataFrame({
    'ds': pd.date_range(start=horizon_start, periods=60, freq='D'),
    'y': np.nan
})
combined_df = pd.concat([ml_data.copy(), future_df], ignore_index=True)
combined_df['forecast'] = np.nan
combined_df.loc[ml_data.shape[0]:, 'forecast'] = future_forecasts['mean']

# Observed history and future forecast on one set of axes
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(combined_df['ds'], combined_df['y'], label='Actual', marker='o')
ax.plot(
    combined_df['ds'],
    combined_df['forecast'],
    label=f'Predicted - {best_model_name} (Next 60 Days)',
    marker='x',
)
ax.set_xlabel('Date')
ax.set_ylabel('Value')
ax.set_title(f'Actual vs Predicted Values using {best_model_name}')
ax.legend()
ax.grid(True)
plt.show()

print(f"\nThe best model is {best_model_name} with a MAPE of {best_mape:.4%}")


In [16]:
import pandas as pd
import numpy as np
from mlforecast import MLForecast
from mlforecast.lag_transforms import ExpandingMean, RollingMean
from mlforecast.target_transforms import Differences
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
import matplotlib.pyplot as plt

# Compare tabular regressors through MLForecast on the hold-out window, then
# refit the winner on all data. Uses `train_df`, `test_df` and `ml_data`
# (with `ds` / `y` columns) defined in earlier cells.

RANDOM_STATE = 42  # fixed seed so the model ranking is reproducible across runs

# Convert 'ds' column to datetime if necessary
train_df['ds'] = pd.to_datetime(train_df['ds'])
test_df['ds'] = pd.to_datetime(test_df['ds'])
ml_data['ds'] = pd.to_datetime(ml_data['ds'])

# Candidate regressors keyed by display name
models = {
    'LightGBM': lgb.LGBMRegressor(verbosity=-1, random_state=RANDOM_STATE),
    'XGBoost': xgb.XGBRegressor(verbosity=0, random_state=RANDOM_STATE),
    'RandomForest': RandomForestRegressor(random_state=RANDOM_STATE),
    'LinearRegression': LinearRegression()
}

# Per-model hold-out results
mape_scores = []
mae_scores = []
forecasts_dict = {}


def create_forecast_object(model):
    """Wrap a single regressor in an MLForecast pipeline with shared settings.

    Parameters
    ----------
    model : estimator
        Regressor handed to MLForecast as its only model.

    Returns
    -------
    MLForecast
        Unfitted pipeline configured with daily frequency, lags 7 and 14,
        an expanding mean on lag 1, a 28-step rolling mean on lag 7, a
        day-of-week feature and first-order differencing of the target.
    """
    return MLForecast(
        models=[model],
        freq='D',
        lags=[7, 14],
        lag_transforms={
            1: [ExpandingMean()],
            7: [RollingMean(window_size=28)]
        },
        date_features=['dayofweek'],
        target_transforms=[Differences([1])]
    )


def label_forecast_column(frame, label):
    """Rename the single model-output column of a prediction frame to `label`.

    The output column is not guaranteed to carry the dictionary key used in
    `models`, so look it up as "the column that is not an id/time column"
    instead of indexing by that key directly.
    """
    model_columns = [col for col in frame.columns if col not in ('unique_id', 'ds')]
    return frame.rename(columns={model_columns[0]: label})


# Train and evaluate each model
for name, model in models.items():
    fcst = create_forecast_object(model)
    fcst.fit(train_df)

    # Forecast as many steps as the hold-out window holds
    predictions = label_forecast_column(fcst.predict(len(test_df)), name)
    forecasts_dict[name] = predictions

    # Align forecast with test_df to calculate MAE and MAPE
    aligned_forecasts = predictions[name].values[:len(test_df)]

    mae = mean_absolute_error(test_df['y'], aligned_forecasts)
    mape = mean_absolute_percentage_error(test_df['y'], aligned_forecasts)

    mae_scores.append((name, mae))
    mape_scores.append((name, mape))

    print(f"{name} - MAE: {mae:.4f}, MAPE: {mape:.4%}")

# Select the best model based on MAPE (lower is better)
best_model_name, best_mape = min(mape_scores, key=lambda x: x[1])
best_model = models[best_model_name]

print(f"\nThe best model is {best_model_name} with a MAPE of {best_mape:.4%}")

# Train the best model on the entire dataset
best_fcst = create_forecast_object(best_model)
best_fcst.fit(ml_data)

# Predict the next 60 days
future_forecasts = label_forecast_column(best_fcst.predict(60), best_model_name)

# Append 60 empty future days to the history so both series share one frame
combined_df = ml_data.copy()
future_df = pd.DataFrame({
    'ds': pd.date_range(start=ml_data['ds'].max() + pd.Timedelta(days=1), periods=60, freq='D'),
    'y': np.nan
})
combined_df = pd.concat([combined_df, future_df], ignore_index=True)
combined_df['forecast'] = np.nan
combined_df.loc[ml_data.shape[0]:, 'forecast'] = future_forecasts[best_model_name].values

# Observed history and future forecast on one set of axes
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(combined_df['ds'], combined_df['y'], label='Actual', marker='o')
ax.plot(
    combined_df['ds'],
    combined_df['forecast'],
    label=f'Predicted - {best_model_name} (Next 60 Days)',
    marker='x',
)
ax.set_xlabel('Date')
ax.set_ylabel('Value')
ax.set_title(f'Actual vs Predicted Values using {best_model_name}')
ax.legend()
ax.grid(True)
plt.show()

print(f"\nThe best model is {best_model_name} with a MAPE of {best_mape:.4%}")


In [37]:
!pip install catboost

In [17]:
import pandas as pd
import numpy as np
from mlforecast import MLForecast
from mlforecast.lag_transforms import ExpandingMean, RollingMean
from mlforecast.target_transforms import Differences
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
import matplotlib.pyplot as plt

# Compare tabular regressors through MLForecast on the hold-out window, then
# refit the winner on all data. Uses `train_df`, `test_df` and `ml_data`
# (with `ds` / `y` columns) defined in earlier cells.

RANDOM_STATE = 42  # fixed seed so the model ranking is reproducible across runs

# Convert 'ds' column to datetime if necessary
train_df['ds'] = pd.to_datetime(train_df['ds'])
test_df['ds'] = pd.to_datetime(test_df['ds'])
ml_data['ds'] = pd.to_datetime(ml_data['ds'])

# Candidate regressors keyed by display name
models = {
    'LightGBM': lgb.LGBMRegressor(verbosity=-1, random_state=RANDOM_STATE),
    'XGBoost': xgb.XGBRegressor(verbosity=0, random_state=RANDOM_STATE),
    'RandomForest': RandomForestRegressor(random_state=RANDOM_STATE),
    'LinearRegression': LinearRegression()
}

# Per-model hold-out results
mape_scores = []
mae_scores = []
forecasts_dict = {}


def create_forecast_object(model):
    """Wrap a single regressor in an MLForecast pipeline with shared settings.

    Parameters
    ----------
    model : estimator
        Regressor handed to MLForecast as its only model.

    Returns
    -------
    MLForecast
        Unfitted pipeline configured with daily frequency, lags 7 and 14,
        an expanding mean on lag 1, a 28-step rolling mean on lag 7, a
        day-of-week feature and first-order differencing of the target.
    """
    return MLForecast(
        models=[model],
        freq='D',
        lags=[7, 14],
        lag_transforms={
            1: [ExpandingMean()],
            7: [RollingMean(window_size=28)]
        },
        date_features=['dayofweek'],
        target_transforms=[Differences([1])]
    )


def label_forecast_column(frame, label):
    """Rename the single model-output column of a prediction frame to `label`.

    Locates the output column by exclusion (anything that is not
    'unique_id' or 'ds') rather than by position, so the relabelling does not
    depend on the number or order of columns in the prediction frame.
    """
    model_columns = [col for col in frame.columns if col not in ('unique_id', 'ds')]
    return frame.rename(columns={model_columns[0]: label})


# Train and evaluate each model
for name, model in models.items():
    fcst = create_forecast_object(model)
    fcst.fit(train_df)

    # Forecast as many steps as the hold-out window holds
    predictions = label_forecast_column(fcst.predict(len(test_df)), name)
    forecasts_dict[name] = predictions

    # Align forecast with test_df to calculate MAE and MAPE
    aligned_forecasts = predictions[name].values[:len(test_df)]

    mae = mean_absolute_error(test_df['y'], aligned_forecasts)
    mape = mean_absolute_percentage_error(test_df['y'], aligned_forecasts)

    mae_scores.append((name, mae))
    mape_scores.append((name, mape))

    print(f"{name} - MAE: {mae:.4f}, MAPE: {mape:.4%}")

# Select the best model based on MAPE (lower is better)
best_model_name, best_mape = min(mape_scores, key=lambda x: x[1])
best_model = models[best_model_name]

print(f"\nThe best model is {best_model_name} with a MAPE of {best_mape:.4%}")

# Train the best model on the entire dataset
best_fcst = create_forecast_object(best_model)
best_fcst.fit(ml_data)

# Predict the next 60 days
future_forecasts = label_forecast_column(best_fcst.predict(60), best_model_name)

# Append 60 empty future days to the history so both series share one frame
combined_df = ml_data.copy()
future_df = pd.DataFrame({
    'ds': pd.date_range(start=ml_data['ds'].max() + pd.Timedelta(days=1), periods=60, freq='D'),
    'y': np.nan
})
combined_df = pd.concat([combined_df, future_df], ignore_index=True)
combined_df['forecast'] = np.nan
combined_df.loc[ml_data.shape[0]:, 'forecast'] = future_forecasts[best_model_name].values

# Observed history and future forecast on one set of axes
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(combined_df['ds'], combined_df['y'], label='Actual', marker='o')
ax.plot(
    combined_df['ds'],
    combined_df['forecast'],
    label=f'Predicted - {best_model_name} (Next 60 Days)',
    marker='x',
)
ax.set_xlabel('Date')
ax.set_ylabel('Value')
ax.set_title(f'Actual vs Predicted Values using {best_model_name}')
ax.legend()
ax.grid(True)
plt.show()

print(f"\nThe best model is {best_model_name} with a MAPE of {best_mape:.4%}")


## Machine Learning Methods

In [41]:
# Keep independent copies of the shared frames under short aliases
# (a later cell reassigns `train_df` / `test_df`).
ml, t1, t2 = (frame.copy() for frame in (ml_data, train_df, test_df))

In [18]:
import pandas as pd
import numpy as np
from mlforecast import MLForecast
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.metrics import mean_absolute_percentage_error
import matplotlib.pyplot as plt

# Function to create date features
def create_date_features(data):
    """Return a copy of ``data`` extended with calendar features from its index.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose index provides the datetime accessors used below
        (``month``, ``day``, ``isocalendar()``, ...), i.e. a DatetimeIndex.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns followed by: month,
        day_of_month, is_month_start, is_month_end, day_of_year, week_of_year,
        day_of_week (1 = Monday ... 7 = Sunday), year, is_weekend, the four
        season flags, and sin_day / cos_day (day of week on a 7-step circle).
        The input frame itself is not modified.
    """
    # Work on a copy so the caller's frame is left untouched
    data = data.copy()
    stamps = data.index

    data["month"] = stamps.month
    data["day_of_month"] = stamps.day
    data["is_month_start"] = stamps.is_month_start.astype(int)
    data["is_month_end"] = stamps.is_month_end.astype(int)
    data["day_of_year"] = stamps.dayofyear
    data["week_of_year"] = stamps.isocalendar().week
    # pandas counts Monday as 0; shift to 1..7
    data["day_of_week"] = stamps.dayofweek + 1
    data["year"] = stamps.year
    data["is_weekend"] = (stamps.weekday >= 5).astype(int)

    # Season flags by calendar month
    data['is_spring'] = data['month'].isin([3, 4, 5]).astype(int)
    data['is_summer'] = data['month'].isin([6, 7, 8]).astype(int)
    data['is_fall'] = data['month'].isin([9, 10, 11]).astype(int)
    data['is_winter'] = data['month'].isin([12, 1, 2]).astype(int)

    # Cyclical encoding of the day of week
    weekday_angle = 2 * np.pi * stamps.dayofweek / 7
    data['sin_day'] = np.sin(weekday_angle)
    data['cos_day'] = np.cos(weekday_angle)
    return data

# MLForecast comparison on a frame enriched with calendar columns.
# `data` comes from an earlier cell; it needs a datetime index (used by
# create_date_features) plus the 'unique_id', 'ds' and 'y' columns named in
# the fit() calls below.

RANDOM_STATE = 42  # fixed seed so the model ranking is reproducible across runs
TEST_DAYS = 30     # size of the hold-out window and of the future forecast

rml_data = data.copy()
rml_data = create_date_features(rml_data)

# Candidate regressors keyed by display name
models = {
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE),
    'XGBoost': XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=RANDOM_STATE),
    'LightGBM': lgb.LGBMRegressor(n_estimators=100, random_state=RANDOM_STATE)
}

# Names of the calendar columns added above.
# NOTE(review): this list is never passed to the pipeline; the columns reach
# fit() only as extra columns of the frame — confirm how MLForecast treats
# them (static vs. dynamic features).
date_features = ['month', 'day_of_month', 'is_month_start', 'is_month_end',
                 'day_of_year', 'week_of_year', 'day_of_week', 'year',
                 'is_weekend', 'is_spring', 'is_summer', 'is_fall', 'is_winter',
                 'sin_day', 'cos_day']

# Define the Forecasting Pipeline
forecast = MLForecast(
    models=models,
    freq='D',
    lags=[7, 14, 30]  # Using 7, 14, 30 days lagged features
)

# Hold out the last TEST_DAYS rows for evaluation. The pipeline is fitted
# directly on the training window: a preliminary fit on the full data would
# be overwritten by this one and only cost compute.
train_df = rml_data[:-TEST_DAYS]
test_df = rml_data[-TEST_DAYS:]
forecast.fit(train_df, id_col='unique_id', time_col='ds', target_col='y')

# Generate predictions for the test set
predictions = forecast.predict(len(test_df))

# Calculate Mean Absolute Percentage Error (MAPE) for each model
mape_scores = {
    model_name: mean_absolute_percentage_error(
        test_df['y'].values[:TEST_DAYS], predictions[model_name].values[:TEST_DAYS]
    )
    for model_name in models.keys()
}

# Print MAPE scores
for model_name, mape_score in mape_scores.items():
    print(f"{model_name} MAPE: {mape_score}")

# Select the Best Model (lowest MAPE)
best_model_name = min(mape_scores, key=mape_scores.get)
print(f"The best model is {best_model_name} with a MAPE of {mape_scores[best_model_name]}")

# Retrain only the best model on the entire dataset
forecast.models = {best_model_name: models[best_model_name]}
forecast.fit(rml_data, id_col='unique_id', time_col='ds', target_col='y')

# Predict the next 30 days
future_predictions = forecast.predict(TEST_DAYS)

# Plot actual vs predicted values on the hold-out window
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(test_df['ds'], test_df['y'], label='Actual', marker='o')
ax.plot(test_df['ds'], predictions[best_model_name], label=f'Predicted - {best_model_name}', marker='x')
ax.set_xlabel('Date')
ax.set_ylabel('Value')
ax.set_title('Actual vs Predicted Values')
ax.legend()
ax.grid(True)
plt.show()


In [19]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.metrics import mean_absolute_percentage_error
import matplotlib.pyplot as plt

# Function to create date features
def create_date_features(data):
    """Return a copy of ``data`` extended with calendar features from its index.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose index provides the datetime accessors used below
        (``month``, ``day``, ``isocalendar()``, ...), i.e. a DatetimeIndex.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns followed by: month,
        day_of_month, is_month_start, is_month_end, day_of_year, week_of_year,
        day_of_week (1 = Monday ... 7 = Sunday), year, is_weekend, the four
        season flags, and sin_day / cos_day (day of week on a 7-step circle).
        The input frame itself is not modified.
    """
    # Work on a copy so the caller's frame is left untouched
    data = data.copy()
    stamps = data.index

    data["month"] = stamps.month
    data["day_of_month"] = stamps.day
    data["is_month_start"] = stamps.is_month_start.astype(int)
    data["is_month_end"] = stamps.is_month_end.astype(int)
    data["day_of_year"] = stamps.dayofyear
    data["week_of_year"] = stamps.isocalendar().week
    # pandas counts Monday as 0; shift to 1..7
    data["day_of_week"] = stamps.dayofweek + 1
    data["year"] = stamps.year
    data["is_weekend"] = (stamps.weekday >= 5).astype(int)

    # Season flags by calendar month
    data['is_spring'] = data['month'].isin([3, 4, 5]).astype(int)
    data['is_summer'] = data['month'].isin([6, 7, 8]).astype(int)
    data['is_fall'] = data['month'].isin([9, 10, 11]).astype(int)
    data['is_winter'] = data['month'].isin([12, 1, 2]).astype(int)

    # Cyclical encoding of the day of week
    weekday_angle = 2 * np.pi * stamps.dayofweek / 7
    data['sin_day'] = np.sin(weekday_angle)
    data['cos_day'] = np.cos(weekday_angle)
    return data


RANDOM_STATE = 42  # fixed seed so the model ranking is reproducible across runs

# Calendar-enriched copy of `data` (defined in an earlier cell)
rml_data = create_date_features(data.copy())

# Candidate regressors keyed by display name
models = {
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE),
    'XGBoost': XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=RANDOM_STATE),
    'LightGBM': lgb.LGBMRegressor(n_estimators=100, random_state=RANDOM_STATE)
}

# Calendar columns used as model inputs
date_features = ['month', 'day_of_month', 'is_month_start', 'is_month_end',
                 'day_of_year', 'week_of_year', 'day_of_week', 'year',
                 'is_weekend', 'is_spring', 'is_summer', 'is_fall', 'is_winter',
                 'sin_day', 'cos_day']

# Prepare training data
def create_lagged_features(data, lags):
    """Return a copy of ``data`` with one ``lag_<k>`` column per entry of ``lags``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``y`` column.
    lags : iterable of int
        Row offsets; ``lag_<k>`` holds ``y`` shifted down by ``k`` rows, so
        the first ``k`` rows of that column are NaN.

    Returns
    -------
    pandas.DataFrame
        New frame with the lag columns appended; the input is not modified.
    """
    # Work on a copy so the caller's frame is left untouched
    lagged = data.copy()
    for lag in lags:
        lagged[f'lag_{lag}'] = lagged['y'].shift(lag)
    return lagged

TEST_DAYS = 30      # rows held out for evaluation
FORECAST_DAYS = 30  # days forecast beyond the last observation

lags = [7, 14, 30]
# Rows without a complete set of lag values are dropped
rml_data = create_lagged_features(rml_data, lags).dropna()

X = rml_data[date_features + [f'lag_{lag}' for lag in lags]]
y = rml_data['y']

# Chronological split: the last TEST_DAYS rows form the test set
train_X, test_X = X[:-TEST_DAYS], X[-TEST_DAYS:]
train_y, test_y = y[:-TEST_DAYS], y[-TEST_DAYS:]

# Train and predict using each model
predictions = {}
for model_name, model in models.items():
    model.fit(train_X, train_y)
    predictions[model_name] = model.predict(test_X)

# Calculate Mean Absolute Percentage Error (MAPE) for each model
mape_scores = {
    model_name: mean_absolute_percentage_error(test_y, pred)
    for model_name, pred in predictions.items()
}

# Print MAPE scores
for model_name, mape_score in mape_scores.items():
    print(f"{model_name} MAPE: {mape_score}")

# Select the Best Model (lowest MAPE)
best_model_name = min(mape_scores, key=mape_scores.get)
print(f"The best model is {best_model_name} with a MAPE of {mape_scores[best_model_name]}")

# Retrain the best model on the entire dataset
best_model = models[best_model_name]
best_model.fit(X, y)

# Predict the next FORECAST_DAYS days.
# Scoring X[-30:] would only re-predict the last observed rows, so the future
# feature rows are built explicitly: calendar features come from the future
# dates, and lag features are read from the history, which is extended with
# each new prediction (recursive forecasting).
# Assumes consecutive rows of rml_data are one day apart, matching the
# row-based shift used to build the lag columns.
future_index = pd.date_range(
    start=rml_data.index.max() + pd.Timedelta(days=1), periods=FORECAST_DAYS, freq='D'
)
future_calendar = create_date_features(pd.DataFrame(index=future_index))
history = [float(value) for value in y]
future_rows = []
for step in range(FORECAST_DAYS):
    lag_values = {f'lag_{lag}': history[-lag] for lag in lags}
    # Reorder to the training column order before predicting
    row = future_calendar.iloc[[step]].assign(**lag_values)[X.columns]
    history.append(float(best_model.predict(row)[0]))
    future_rows.append(row)
future_X = pd.concat(future_rows)
future_predictions = np.array(history[-FORECAST_DAYS:])

# Plot actual vs predicted values on the hold-out window
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(test_y.index, test_y, label='Actual', marker='o')
ax.plot(test_y.index, predictions[best_model_name], label=f'Predicted - {best_model_name}', marker='x')
ax.set_xlabel('Date')
ax.set_ylabel('Value')
ax.set_title('Actual vs Predicted Values')
ax.legend()
ax.grid(True)
plt.show()


In [20]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.metrics import mean_absolute_percentage_error
import matplotlib.pyplot as plt
import holidays

# Function to create date features
def create_date_features(data):
    """Return a copy of ``data`` extended with calendar features from its index.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose index provides the datetime accessors used below
        (``month``, ``day``, ``isocalendar()``, ...), i.e. a DatetimeIndex.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns followed by: month,
        day_of_month, is_month_start, is_month_end, day_of_year, week_of_year,
        day_of_week (1 = Monday ... 7 = Sunday), year, is_weekend, the four
        season flags, and sin_day / cos_day (day of week on a 7-step circle).
        The input frame itself is not modified.
    """
    # Work on a copy so the caller's frame is left untouched
    data = data.copy()
    stamps = data.index

    data["month"] = stamps.month
    data["day_of_month"] = stamps.day
    data["is_month_start"] = stamps.is_month_start.astype(int)
    data["is_month_end"] = stamps.is_month_end.astype(int)
    data["day_of_year"] = stamps.dayofyear
    data["week_of_year"] = stamps.isocalendar().week
    # pandas counts Monday as 0; shift to 1..7
    data["day_of_week"] = stamps.dayofweek + 1
    data["year"] = stamps.year
    data["is_weekend"] = (stamps.weekday >= 5).astype(int)

    # Season flags by calendar month
    data['is_spring'] = data['month'].isin([3, 4, 5]).astype(int)
    data['is_summer'] = data['month'].isin([6, 7, 8]).astype(int)
    data['is_fall'] = data['month'].isin([9, 10, 11]).astype(int)
    data['is_winter'] = data['month'].isin([12, 1, 2]).astype(int)

    # Cyclical encoding of the day of week
    weekday_angle = 2 * np.pi * stamps.dayofweek / 7
    data['sin_day'] = np.sin(weekday_angle)
    data['cos_day'] = np.cos(weekday_angle)
    return data


RANDOM_STATE = 42  # fixed seed so the model ranking is reproducible across runs

# Holiday calendar used for the public-holiday flag
uk_holidays = holidays.CountryHoliday('UK')

# Calendar-enriched copy of `data` (defined in an earlier cell), plus a
# 0/1 flag marking index dates found in the holiday calendar
rml_data = create_date_features(data.copy())
rml_data['is_public_holiday'] = rml_data.index.to_series().apply(lambda date: 1 if date in uk_holidays else 0)

# Candidate regressors keyed by display name
models = {
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE),
    'XGBoost': XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=RANDOM_STATE),
    'LightGBM': lgb.LGBMRegressor(n_estimators=100, random_state=RANDOM_STATE)
}

# Calendar and holiday columns used as model inputs
date_features = ['month', 'day_of_month', 'is_month_start', 'is_month_end',
                 'day_of_year', 'week_of_year', 'day_of_week', 'year',
                 'is_weekend', 'is_spring', 'is_summer', 'is_fall', 'is_winter',
                 'sin_day', 'cos_day', 'is_public_holiday']

# Prepare training data
def create_lagged_features(data, lags):
    """Return a copy of ``data`` with one ``lag_<k>`` column per entry of ``lags``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``y`` column.
    lags : iterable of int
        Row offsets; ``lag_<k>`` holds ``y`` shifted down by ``k`` rows, so
        the first ``k`` rows of that column are NaN.

    Returns
    -------
    pandas.DataFrame
        New frame with the lag columns appended; the input is not modified.
    """
    # Work on a copy so the caller's frame is left untouched
    lagged = data.copy()
    for lag in lags:
        lagged[f'lag_{lag}'] = lagged['y'].shift(lag)
    return lagged

TEST_DAYS = 30      # rows held out for evaluation
FORECAST_DAYS = 30  # days forecast beyond the last observation

lags = [7, 14, 30]
# Rows without a complete set of lag values are dropped
rml_data = create_lagged_features(rml_data, lags).dropna()

X = rml_data[date_features + [f'lag_{lag}' for lag in lags]]
y = rml_data['y']

# Chronological split: the last TEST_DAYS rows form the test set
train_X, test_X = X[:-TEST_DAYS], X[-TEST_DAYS:]
train_y, test_y = y[:-TEST_DAYS], y[-TEST_DAYS:]

# Train and predict using each model
predictions = {}
for model_name, model in models.items():
    model.fit(train_X, train_y)
    predictions[model_name] = model.predict(test_X)

# Calculate Mean Absolute Percentage Error (MAPE) for each model
mape_scores = {
    model_name: mean_absolute_percentage_error(test_y, pred)
    for model_name, pred in predictions.items()
}

# Print MAPE scores
for model_name, mape_score in mape_scores.items():
    print(f"{model_name} MAPE: {mape_score}")

# Select the Best Model (lowest MAPE)
best_model_name = min(mape_scores, key=mape_scores.get)
print(f"The best model is {best_model_name} with a MAPE of {mape_scores[best_model_name]}")

# Retrain the best model on the entire dataset
best_model = models[best_model_name]
best_model.fit(X, y)

# Predict the next FORECAST_DAYS days.
# Scoring X[-30:] would only re-predict the last observed rows, so the future
# feature rows are built explicitly: calendar and holiday features come from
# the future dates, and lag features are read from the history, which is
# extended with each new prediction (recursive forecasting).
# Assumes consecutive rows of rml_data are one day apart, matching the
# row-based shift used to build the lag columns.
future_index = pd.date_range(
    start=rml_data.index.max() + pd.Timedelta(days=1), periods=FORECAST_DAYS, freq='D'
)
future_calendar = create_date_features(pd.DataFrame(index=future_index))
future_calendar['is_public_holiday'] = [1 if day in uk_holidays else 0 for day in future_index]
history = [float(value) for value in y]
future_rows = []
for step in range(FORECAST_DAYS):
    lag_values = {f'lag_{lag}': history[-lag] for lag in lags}
    # Reorder to the training column order before predicting
    row = future_calendar.iloc[[step]].assign(**lag_values)[X.columns]
    history.append(float(best_model.predict(row)[0]))
    future_rows.append(row)
future_X = pd.concat(future_rows)
future_predictions = np.array(history[-FORECAST_DAYS:])

# Plot actual vs predicted values on the hold-out window
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(test_y.index, test_y, label='Actual', marker='o')
ax.plot(test_y.index, predictions[best_model_name], label=f'Predicted - {best_model_name}', marker='x')
ax.set_xlabel('Date')
ax.set_ylabel('Value')
ax.set_title('Actual vs Predicted Values')
ax.legend()
ax.grid(True)
plt.show()


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_percentage_error
import matplotlib.pyplot as plt
import holidays

# Function to create date features
def create_date_features(data):
    """Return a copy of ``data`` extended with calendar features from its index.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose index provides the datetime accessors used below
        (``month``, ``day``, ``isocalendar()``, ...), i.e. a DatetimeIndex.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns followed by: month,
        day_of_month, is_month_start, is_month_end, day_of_year, week_of_year,
        day_of_week (1 = Monday ... 7 = Sunday), year, is_weekend, the four
        season flags, and sin_day / cos_day (day of week on a 7-step circle).
        The input frame itself is not modified.
    """
    # Work on a copy so the caller's frame is left untouched
    data = data.copy()
    stamps = data.index

    data["month"] = stamps.month
    data["day_of_month"] = stamps.day
    data["is_month_start"] = stamps.is_month_start.astype(int)
    data["is_month_end"] = stamps.is_month_end.astype(int)
    data["day_of_year"] = stamps.dayofyear
    data["week_of_year"] = stamps.isocalendar().week
    # pandas counts Monday as 0; shift to 1..7
    data["day_of_week"] = stamps.dayofweek + 1
    data["year"] = stamps.year
    data["is_weekend"] = (stamps.weekday >= 5).astype(int)

    # Season flags by calendar month
    data['is_spring'] = data['month'].isin([3, 4, 5]).astype(int)
    data['is_summer'] = data['month'].isin([6, 7, 8]).astype(int)
    data['is_fall'] = data['month'].isin([9, 10, 11]).astype(int)
    data['is_winter'] = data['month'].isin([12, 1, 2]).astype(int)

    # Cyclical encoding of the day of week
    weekday_angle = 2 * np.pi * stamps.dayofweek / 7
    data['sin_day'] = np.sin(weekday_angle)
    data['cos_day'] = np.cos(weekday_angle)
    return data



# Calendar and holiday columns used as model inputs
date_features = [
    'month', 'day_of_month', 'is_month_start', 'is_month_end',
    'day_of_year', 'week_of_year', 'day_of_week', 'year',
    'is_weekend', 'is_spring', 'is_summer', 'is_fall', 'is_winter',
    'sin_day', 'cos_day', 'is_public_holiday',
]

# Holiday calendar used for the public-holiday flag
uk_holidays = holidays.CountryHoliday('UK')

# Calendar-enriched copy of `data` (defined in an earlier cell), plus a
# 0/1 flag marking index dates found in the holiday calendar
rml_data = create_date_features(data.copy())
rml_data['is_public_holiday'] = rml_data.index.to_series().apply(
    lambda day: int(day in uk_holidays)
)

# Prepare training data
def create_lagged_features(data, lags):
    """Return a copy of ``data`` with one ``lag_<k>`` column per entry of ``lags``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``y`` column.
    lags : iterable of int
        Row offsets; ``lag_<k>`` holds ``y`` shifted down by ``k`` rows, so
        the first ``k`` rows of that column are NaN.

    Returns
    -------
    pandas.DataFrame
        New frame with the lag columns appended; the input is not modified.
    """
    # Work on a copy so the caller's frame is left untouched
    lagged = data.copy()
    for lag in lags:
        lagged[f'lag_{lag}'] = lagged['y'].shift(lag)
    return lagged

# Chronological splitter for the grid search (imported here so this cell
# stays self-contained)
from sklearn.model_selection import TimeSeriesSplit

RANDOM_STATE = 42   # fixed seed so the search and ranking are reproducible
TEST_DAYS = 30      # rows held out for evaluation
FORECAST_DAYS = 30  # days forecast beyond the last observation

lags = [7, 14, 30]
# Rows without a complete set of lag values are dropped
rml_data = create_lagged_features(rml_data, lags).dropna()

X = rml_data[date_features + [f'lag_{lag}' for lag in lags]]
y = rml_data['y']

# Chronological split: the last TEST_DAYS rows form the test set
train_X, test_X = X[:-TEST_DAYS], X[-TEST_DAYS:]
train_y, test_y = y[:-TEST_DAYS], y[-TEST_DAYS:]

# Define parameter grids for each model
param_grids = {
    'RandomForest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    },
    'XGBoost': {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 6, 9],
        'learning_rate': [0.01, 0.1, 0.2]
    },
    'LightGBM': {
        'n_estimators': [100, 200, 300],
        'num_leaves': [31, 40, 50],
        'learning_rate': [0.01, 0.1, 0.2]
    }
}

# One factory per model so the search and the final refit construct
# estimators identically (fixed objective / seed plus tuned parameters)
estimator_factories = {
    'RandomForest': lambda **params: RandomForestRegressor(random_state=RANDOM_STATE, **params),
    'XGBoost': lambda **params: XGBRegressor(objective='reg:squarederror', random_state=RANDOM_STATE, **params),
    'LightGBM': lambda **params: lgb.LGBMRegressor(random_state=RANDOM_STATE, **params),
}

# The rows are time-ordered, so validate on folds that always lie after
# their training data; plain K-fold (cv=3) would train on future rows.
time_series_cv = TimeSeriesSplit(n_splits=3)

# Train and predict using each model with grid search
predictions = {}
best_params = {}

for model_name, make_estimator in estimator_factories.items():
    grid_search = GridSearchCV(
        make_estimator(),
        param_grids[model_name],
        cv=time_series_cv,
        scoring='neg_mean_absolute_percentage_error',
    )
    grid_search.fit(train_X, train_y)
    best_params[model_name] = grid_search.best_params_
    predictions[model_name] = grid_search.predict(test_X)
    print(f"Best parameters for {model_name}: {best_params[model_name]}")

# Calculate Mean Absolute Percentage Error (MAPE) for each model
mape_scores = {
    model_name: mean_absolute_percentage_error(test_y, pred)
    for model_name, pred in predictions.items()
}

# Print MAPE scores
for model_name, mape_score in mape_scores.items():
    print(f"{model_name} MAPE: {mape_score}")

# Select the Best Model (lowest MAPE)
best_model_name = min(mape_scores, key=mape_scores.get)
print(f"The best model is {best_model_name} with a MAPE of {mape_scores[best_model_name]}")

# Retrain the best model on the entire dataset with best parameters
best_model = estimator_factories[best_model_name](**best_params[best_model_name])
best_model.fit(X, y)

# Predict the next FORECAST_DAYS days.
# Scoring X[-30:] would only re-predict the last observed rows, so the future
# feature rows are built explicitly: calendar and holiday features come from
# the future dates, and lag features are read from the history, which is
# extended with each new prediction (recursive forecasting).
# Assumes consecutive rows of rml_data are one day apart, matching the
# row-based shift used to build the lag columns.
future_index = pd.date_range(
    start=rml_data.index.max() + pd.Timedelta(days=1), periods=FORECAST_DAYS, freq='D'
)
future_calendar = create_date_features(pd.DataFrame(index=future_index))
future_calendar['is_public_holiday'] = [1 if day in uk_holidays else 0 for day in future_index]
history = [float(value) for value in y]
future_rows = []
for step in range(FORECAST_DAYS):
    lag_values = {f'lag_{lag}': history[-lag] for lag in lags}
    # Reorder to the training column order before predicting
    row = future_calendar.iloc[[step]].assign(**lag_values)[X.columns]
    history.append(float(best_model.predict(row)[0]))
    future_rows.append(row)
future_X = pd.concat(future_rows)
future_predictions = np.array(history[-FORECAST_DAYS:])

# Plot actual vs predicted values on the hold-out window
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(test_y.index, test_y, label='Actual', marker='o')
ax.plot(test_y.index, predictions[best_model_name], label=f'Predicted - {best_model_name}', marker='x')
ax.set_xlabel('Date')
ax.set_ylabel('Value')
ax.set_title('Actual vs Predicted Values')
ax.legend()
ax.grid(True)
plt.show()


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
import matplotlib.pyplot as plt
import holidays

# Function to create date features
def create_date_features(data):
    """Add calendar feature columns derived from the datetime index.

    The columns are written onto ``data`` itself (the caller's frame is
    modified) and the same object is returned.

    Parameters:
    data (pd.DataFrame): Frame whose index is a ``DatetimeIndex``.

    Returns:
    pd.DataFrame: ``data`` with month/day/week/year fields, month-boundary,
    weekend and season flags, and a sine/cosine encoding of the weekday.
    """
    idx = data.index
    weekday = idx.dayofweek  # Monday = 0 ... Sunday = 6

    data["month"] = idx.month
    data["day_of_month"] = idx.day
    data["is_month_start"] = idx.is_month_start.astype(int)
    data["is_month_end"] = idx.is_month_end.astype(int)
    data["day_of_year"] = idx.dayofyear
    data["week_of_year"] = idx.isocalendar().week
    data["day_of_week"] = weekday + 1  # shifted to 1..7
    data["year"] = idx.year
    data["is_weekend"] = (weekday >= 5).astype(int)

    # One 0/1 flag per meteorological season, keyed by the months it covers.
    season_months = {
        'is_spring': [3, 4, 5],
        'is_summer': [6, 7, 8],
        'is_fall': [9, 10, 11],
        'is_winter': [12, 1, 2],
    }
    for flag, months in season_months.items():
        data[flag] = data['month'].isin(months).astype(int)

    # Cyclical encoding of the weekday with a period of 7 days.
    angle = 2 * np.pi * weekday / 7
    data['sin_day'] = np.sin(angle)
    data['cos_day'] = np.cos(angle)
    return data

  
# Add holiday feature
# holidays.UK() is used instead of the deprecated holidays.CountryHoliday('UK') factory,
# matching the holiday calendar construction used elsewhere in this notebook.
uk_holidays = holidays.UK()

# Create date features
# NOTE(review): `data` is not created in this cell; it must already exist in the
# kernel (with a DatetimeIndex and a 'y' column) before this cell is run.
# A copy is passed because create_date_features writes columns onto its argument.
rml_data = create_date_features(data.copy())
rml_data['is_public_holiday'] = rml_data.index.to_series().apply(lambda date: 1 if date in uk_holidays else 0)

# Define Features and Models
# Calendar-only predictors; lag columns are appended to this list when X is built.
date_features = ['month', 'day_of_month', 'is_month_start', 'is_month_end',
                 'day_of_year', 'week_of_year', 'day_of_week', 'year',
                 'is_weekend', 'is_spring', 'is_summer', 'is_fall', 'is_winter',
                 'sin_day', 'cos_day', 'is_public_holiday']

# Prepare training data
def create_lagged_features(data, lags, target_col='y'):
    """Add one ``lag_<n>`` column per requested lag of the target column.

    Parameters:
    data (pd.DataFrame): Frame containing ``target_col``; it is modified in place.
    lags (iterable of int): Row offsets to shift the target by.
    target_col (str): Column to lag. Defaults to ``'y'``, the column the
        original implementation was hard-wired to.

    Returns:
    pd.DataFrame: ``data`` with the added lag columns. The first ``n`` rows of
    ``lag_<n>`` are NaN because no earlier value exists.
    """
    for lag in lags:
        data[f'lag_{lag}'] = data[target_col].shift(lag)
    return data

# Row offsets used for the lag features.
lags = [7, 14, 30]
# dropna() removes every row that has a NaN in any column, which includes the
# leading rows whose lag values are undefined.
rml_data = create_lagged_features(rml_data, lags).dropna()

# Feature matrix (calendar columns followed by lag columns) and target vector.
X = rml_data[date_features + [f'lag_{lag}' for lag in lags]]
y = rml_data['y']

# Split data into train and test sets
# The last 30 rows are held out for evaluation; all earlier rows are used for training.
train_X = X[:-30]
train_y = y[:-30]
test_X = X[-30:]
test_y = y[-30:]

# Define parameter grids for each model
# Keys match the model names used by the grid-search loop; each grid has 27-36 combinations.
param_grids = {
    'RandomForest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    },
    'XGBoost': {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 6, 9],
        'learning_rate': [0.01, 0.1, 0.2]
    },
    'LightGBM': {
        'n_estimators': [100, 200, 300],
        'num_leaves': [31, 40, 50],
        'learning_rate': [0.01, 0.1, 0.2]
    }
}

# Train and predict using each model with grid search
predictions = {}      # model name -> hold-out predictions of the tuned model
best_params = {}      # model name -> best hyperparameters found by the grid search
mape_scores = {}      # model name -> hold-out MAPE, used to pick the winner below
model_mae_mape = []   # one record per model for the summary table

candidate_models = {
    'RandomForest': RandomForestRegressor(),
    'XGBoost': XGBRegressor(objective='reg:squarederror'),
    'LightGBM': lgb.LGBMRegressor(),
}

for model_name, model in candidate_models.items():
    # NOTE(review): an integer cv uses scikit-learn's default fold splitter, which is
    # not time-ordered; consider a time-series split for this data.
    grid_search = GridSearchCV(model, param_grids[model_name], cv=3, scoring='neg_mean_absolute_percentage_error')
    grid_search.fit(train_X, train_y)
    best_params[model_name] = grid_search.best_params_
    predictions[model_name] = grid_search.predict(test_X)
    print(f"Best parameters for {model_name}: {best_params[model_name]}")

    # Calculate MAE and MAPE
    mae = mean_absolute_error(test_y, predictions[model_name])
    mape = mean_absolute_percentage_error(test_y, predictions[model_name])
    mape_scores[model_name] = mape
    model_mae_mape.append({
        'Model': model_name,
        'MAE': mae,
        'MAPE': mape
    })

# Convert the results to a DataFrame
evaluation_df = pd.DataFrame(model_mae_mape)

# Print the MAE and MAPE scores
print(evaluation_df)

# Select the Best Model
# mape_scores is filled by the loop above, so the choice always reflects this run
# rather than whatever a previously executed cell left in the kernel.
best_model_name = min(mape_scores, key=mape_scores.get)
print(f"The best model is {best_model_name} with a MAPE of {mape_scores[best_model_name]}")

# Retrain the best model on the entire dataset with best parameters
best_model = {'RandomForest': RandomForestRegressor(**best_params['RandomForest']),
              'XGBoost': XGBRegressor(objective='reg:squarederror', **best_params['XGBoost']),
              'LightGBM': lgb.LGBMRegressor(**best_params['LightGBM'])}[best_model_name]
best_model.fit(X, y)

# Predict the next 60 days
future_dates = pd.date_range(start=data.index[-1] + pd.Timedelta(days=1), periods=60, freq='D')
future_data = pd.DataFrame(index=future_dates)
future_data = create_date_features(future_data)
future_data['is_public_holiday'] = future_data.index.to_series().apply(lambda date: 1 if date in uk_holidays else 0)

# Same column order as the training matrix X.
feature_cols = date_features + [f'lag_{lag}' for lag in lags]

# Pre-create the columns filled in by the loop so every row has a numeric dtype.
future_data['Predicted'] = np.nan
for lag in lags:
    future_data[f'lag_{lag}'] = np.nan

# Iterative prediction for the next 60 days
for date in future_dates:
    for lag in lags:
        lagged_date = date - pd.Timedelta(days=lag)
        if lagged_date in future_data.index:
            # The lag falls inside the forecast horizon: reuse the earlier prediction.
            lag_value = future_data.at[lagged_date, 'Predicted']
        elif lagged_date in data.index:
            # The lag falls inside the observed history.
            lag_value = data.at[lagged_date, 'y']
        else:
            lag_value = np.nan
        # Write the lag onto this row only (not broadcast over the whole column).
        future_data.at[date, f'lag_{lag}'] = lag_value

    # Selecting with a list keeps a one-row DataFrame with per-column dtypes; a row
    # Series transposed back into a frame would be object-dtype, which XGBoost and
    # LightGBM refuse. Cast to the training dtypes so the model sees matching input.
    current_features = future_data.loc[[date], feature_cols].astype(X.dtypes.to_dict())
    future_data.at[date, 'Predicted'] = best_model.predict(current_features)[0]

# Combine actual and predicted values
# Only the forecast column is stored here, indexed by the future dates;
# combined_df is not used by the plot below.
combined_df = pd.DataFrame({
    'Date': future_dates,
    'Predicted': future_data['Predicted']
}).set_index('Date')

# Plot actual vs predicted values
# The chart covers the held-out test rows only: test_y against the hold-out
# predictions of the selected model (not the future forecast built above).
plt.figure(figsize=(14, 7))
plt.plot(test_y.index, test_y, label='Actual', marker='o')
plt.plot(test_y.index, predictions[best_model_name], label=f'Predicted - {best_model_name}', marker='x')
plt.xlabel('Date')
plt.ylabel('Value')
plt.title('Actual vs Predicted Values')
plt.legend()
plt.grid(True)
plt.show()

# Per-model MAE / MAPE summary table.
print(evaluation_df)


In [3]:
# Preview the first rows of df.
df.head()

In [4]:
# Keep an independent snapshot of df in its current state.
x = df.copy()

In [14]:
# Reset df to a fresh copy of the snapshot held in x, dropping any columns or
# edits applied to df since that snapshot was taken.
df = x.copy()

In [5]:
import holidays
# Function to create date features
def create_date_features(data):
    """Add calendar feature columns derived from the datetime index.

    The columns are written onto ``data`` itself and the same object is returned.

    Parameters:
    data (pd.DataFrame): Frame whose index is a ``DatetimeIndex``.

    Returns:
    pd.DataFrame: ``data`` with month/day/week/year fields, month-boundary,
    weekend and season flags, and a sine/cosine encoding of the weekday.
    """
    data["month"] = data.index.month
    data["day_of_month"] = data.index.day
    data["is_month_start"] = data.index.is_month_start.astype(int)
    data["is_month_end"] = data.index.is_month_end.astype(int)
    data["day_of_year"] = data.index.dayofyear
    data["week_of_year"] = data.index.isocalendar().week
    data["day_of_week"] = data.index.dayofweek + 1  # 1 = Monday ... 7 = Sunday
    data["year"] = data.index.year
    # 0/1 flag for Saturday/Sunday. Previously the raw weekday number (0-6) was
    # stored here, which made the column a duplicate of day_of_week rather than a flag.
    data["is_weekend"] = (data.index.weekday >= 5).astype(int)
    data['is_spring'] = data['month'].isin([3, 4, 5]).astype(int)
    data['is_summer'] = data['month'].isin([6, 7, 8]).astype(int)
    data['is_fall'] = data['month'].isin([9, 10, 11]).astype(int)
    data['is_winter'] = data['month'].isin([12, 1, 2]).astype(int)
    # Cyclical encoding of the weekday with a period of 7 days.
    data['sin_day'] = np.sin(2 * np.pi * data.index.dayofweek / 7)
    data['cos_day'] = np.cos(2 * np.pi * data.index.dayofweek / 7)
    return data

In [6]:
# Flag each date in the index: 1 if it is a UK public holiday, otherwise 0.
holiday = holidays.UK()
df['is_public_holiday'] = df.index.map(lambda stamp: int(stamp in holiday))
def create_lag_and_window_features(data, target_col):
    """
    Create lag and window features for a given DataFrame and target column.

    Every feature for a row is built only from rows before it. The rolling and
    expanding statistics are computed on the target shifted by one row; if the
    window ended on the current row, the target could be recovered exactly as
    ``rolling_sum_7 - (lag_1 + ... + lag_6)``, and the feature could not be
    computed for a date whose target is still unknown.

    Parameters:
    data (pd.DataFrame): The input DataFrame; columns are added to it in place.
    target_col (str): The name of the target column to create features for.

    Returns:
    pd.DataFrame: ``data`` with ``lag_1``..``lag_7``, 7- and 30-row rolling
    mean/sum/std, and ``expanding_sum``. Leading rows are NaN until enough
    history is available.
    """
    target = data[target_col]

    # Create lag features for the past week
    for i in range(1, 8):
        data[f'lag_{i}'] = target.shift(i)

    # Values strictly before the current row.
    history = target.shift(1)

    # Rolling window statistics for 7 and 30 days
    for window in (7, 30):
        rolling = history.rolling(window=window)
        data[f'rolling_mean_{window}'] = rolling.mean()
        data[f'rolling_sum_{window}'] = rolling.sum()
        data[f'rolling_std_{window}'] = rolling.std()

    # Expanding window statistics
    data['expanding_sum'] = history.expanding().sum()

    return data

# The helper writes its columns onto df itself and returns the same object.
df = create_lag_and_window_features(df, 'quantity')
df.head(5)

In [7]:
# Fill any remaining NaN values
# Each gap takes the next valid value below it. DataFrame.bfill() replaces
# fillna(method='bfill'), whose `method` argument is deprecated in recent pandas.
df = df.bfill()
# Set winsorization threshold
threshold = 60000

# Winsorize the 'quantity' column
# Values above the threshold are replaced by the threshold itself.
df['quantity'] = df['quantity'].where(df['quantity'] <= threshold, threshold)
df.head(5)

In [12]:
import matplotlib.pyplot as plt
from prophet import Prophet
from prophet.diagnostics import cross_validation, performance_metrics
from prophet.plot import plot_cross_validation_metric
import pandas as pd
import holidays
from sklearn.metrics import mean_absolute_percentage_error

# 1. Prepare Data
# Prophet expects the timestamp in a 'ds' column and the target in a 'y' column.
data = df.copy()
data["ds"] = df.index
data["y"] = df["quantity"]
data['ds'] = pd.to_datetime(data['ds'])

# Add is_public_holiday column
# holidays.UK() replaces the deprecated holidays.CountryHoliday('UK') factory and
# matches how the holiday calendar is built in the other cells of this notebook.
holiday = holidays.UK()
data['is_public_holiday'] = data['ds'].apply(
    lambda date: 1 if date in holiday else 0
)

# 2. Create and Fit Prophet Model with Best Parameters
# Hyperparameters are hard-coded here (presumably the outcome of an earlier tuning
# run -- TODO confirm where they came from).
best_params = {'changepoint_prior_scale': 0.005, 'seasonality_mode': 'additive', 'seasonality_prior_scale': 0.1, 'holidays_prior_scale': 1.0, 'n_changepoints': 25}

model = Prophet(**best_params)
# The holiday flag is registered as an extra regressor before fitting, so the same
# column also has to be supplied on the frame passed to predict().
model.add_regressor('is_public_holiday')
model.fit(data)

# 3. Forecast Future Values
# Extend the timeline by 365 periods and compute the holiday flag for every row of it.
future = model.make_future_dataframe(periods=365)
future['is_public_holiday'] = future['ds'].apply(
    lambda date: 1 if date in holiday else 0
)

forecast = model.predict(future)

# 4. Visualize Results
model.plot(forecast)
model.plot_components(forecast)

# 5. Evaluate Accuracy
# Cross-validation settings: 547 days of initial training data, a new cutoff every
# 180 days, and a 30-day forecast horizon per cutoff.
df_cv = cross_validation(model, initial='547 days', period='180 days', horizon='30 days')
df_p = performance_metrics(df_cv)
print(df_p.head().round(2))

fig = plot_cross_validation_metric(df_cv, metric='mape')

# Calculate MAPE using yhat and y
# One overall figure pooled across all cross-validation rows.
mape = mean_absolute_percentage_error(df_cv['y'], df_cv['yhat'])
print(f"MAPE: {round(mape, 4)}")

# 6. Plot Actual vs Predicted Values
# 'Predicted' is drawn over every date in `forecast`, i.e. the whole `future` frame.
plt.figure(figsize=(10, 6))
plt.plot(data['ds'], data['y'], label='Actual')
plt.plot(forecast['ds'], forecast['yhat'], label='Predicted')
plt.xlabel('Date')
plt.ylabel('Quantity Winsorized')
plt.legend()
plt.title('Actual vs Predicted Values')
plt.show()


In [16]:
import holidays
# Function to create date features
def create_date_features(data):
    """Add calendar feature columns derived from the datetime index.

    The columns are written onto ``data`` itself and the same object is returned.

    Parameters:
    data (pd.DataFrame): Frame whose index is a ``DatetimeIndex``.

    Returns:
    pd.DataFrame: ``data`` with month/day/week/year fields, month-boundary,
    weekend and season flags, and a sine/cosine encoding of the weekday.
    """
    data["month"] = data.index.month
    data["day_of_month"] = data.index.day
    data["is_month_start"] = data.index.is_month_start.astype(int)
    data["is_month_end"] = data.index.is_month_end.astype(int)
    data["day_of_year"] = data.index.dayofyear
    data["week_of_year"] = data.index.isocalendar().week
    data["day_of_week"] = data.index.dayofweek + 1  # 1 = Monday ... 7 = Sunday
    data["year"] = data.index.year
    # 0/1 flag for Saturday/Sunday. Previously the raw weekday number (0-6) was
    # stored here, which made the column a duplicate of day_of_week rather than a flag.
    data["is_weekend"] = (data.index.weekday >= 5).astype(int)
    data['is_spring'] = data['month'].isin([3, 4, 5]).astype(int)
    data['is_summer'] = data['month'].isin([6, 7, 8]).astype(int)
    data['is_fall'] = data['month'].isin([9, 10, 11]).astype(int)
    data['is_winter'] = data['month'].isin([12, 1, 2]).astype(int)
    # Cyclical encoding of the weekday with a period of 7 days.
    data['sin_day'] = np.sin(2 * np.pi * data.index.dayofweek / 7)
    data['cos_day'] = np.cos(2 * np.pi * data.index.dayofweek / 7)
    return data

# Calendar columns first, then a 0/1 flag for UK public holidays.
df = create_date_features(df)
holiday = holidays.UK()
df['is_public_holiday'] = df.index.map(lambda stamp: int(stamp in holiday))
def create_lag_and_window_features(data, target_col):
    """
    Create lag and window features for a given DataFrame and target column.

    Every feature for a row is built only from rows before it. The rolling and
    expanding statistics are computed on the target shifted by one row; if the
    window ended on the current row, the target could be recovered exactly as
    ``rolling_sum_7 - (lag_1 + ... + lag_6)``, and the feature could not be
    computed for a date whose target is still unknown.

    Parameters:
    data (pd.DataFrame): The input DataFrame; columns are added to it in place.
    target_col (str): The name of the target column to create features for.

    Returns:
    pd.DataFrame: ``data`` with ``lag_1``..``lag_7``, 7- and 30-row rolling
    mean/sum/std, and ``expanding_sum``. Leading rows are NaN until enough
    history is available.
    """
    target = data[target_col]

    # Create lag features for the past week
    for i in range(1, 8):
        data[f'lag_{i}'] = target.shift(i)

    # Values strictly before the current row.
    history = target.shift(1)

    # Rolling window statistics for 7 and 30 days
    for window in (7, 30):
        rolling = history.rolling(window=window)
        data[f'rolling_mean_{window}'] = rolling.mean()
        data[f'rolling_sum_{window}'] = rolling.sum()
        data[f'rolling_std_{window}'] = rolling.std()

    # Expanding window statistics
    data['expanding_sum'] = history.expanding().sum()

    return data

# The helper writes its columns onto df itself and returns the same object.
df = create_lag_and_window_features(df, 'quantity')


# Fill any remaining NaN values
# Each gap takes the next valid value below it. DataFrame.bfill() replaces
# fillna(method='bfill'), whose `method` argument is deprecated in recent pandas.
df = df.bfill()

# Set winsorization threshold
threshold = 60000

# Winsorize the 'quantity' column
# Values above the threshold are replaced by the threshold itself.
df['quantity'] = df['quantity'].where(df['quantity'] <= threshold, threshold)
df.head(5)



In [17]:
import pandas as pd
import numpy as np
import holidays

def create_date_features(data):
    """Add calendar feature columns derived from the ``ds`` datetime column.

    The columns are written onto ``data`` itself and the same object is returned.

    Parameters:
    data (pd.DataFrame): Frame with a datetime-typed ``ds`` column.

    Returns:
    pd.DataFrame: ``data`` with month/day/week/year fields, month-boundary,
    weekend and season flags, and a sine/cosine encoding of the weekday.
    """
    stamps = data['ds'].dt
    weekday = stamps.dayofweek  # Monday = 0 ... Sunday = 6

    data["month"] = stamps.month
    data["day_of_month"] = stamps.day
    data["is_month_start"] = stamps.is_month_start.astype(int)
    data["is_month_end"] = stamps.is_month_end.astype(int)
    data["day_of_year"] = stamps.dayofyear
    data["week_of_year"] = stamps.isocalendar().week
    data["day_of_week"] = weekday + 1  # shifted to 1..7
    data["year"] = stamps.year
    data["is_weekend"] = (weekday >= 5).astype(int)

    # One 0/1 flag per meteorological season, keyed by the months it covers.
    season_months = {
        'is_spring': [3, 4, 5],
        'is_summer': [6, 7, 8],
        'is_fall': [9, 10, 11],
        'is_winter': [12, 1, 2],
    }
    for flag, months in season_months.items():
        data[flag] = data['month'].isin(months).astype(int)

    # Cyclical encoding of the weekday with a period of 7 days.
    angle = 2 * np.pi * weekday / 7
    data['sin_day'] = np.sin(angle)
    data['cos_day'] = np.cos(angle)
    return data

def create_lag_and_window_features(data, target_col):
    """
    Create lag and window features for a given DataFrame and target column.

    Every feature for a row is built only from rows before it. The rolling and
    expanding statistics are computed on the target shifted by one row; if the
    window ended on the current row, the target could be recovered exactly as
    ``rolling_sum_7 - (lag_1 + ... + lag_6)``, and the feature could not be
    computed for a date whose target is still unknown.

    Parameters:
    data (pd.DataFrame): The input DataFrame; columns are added to it in place.
    target_col (str): The name of the target column to create features for.

    Returns:
    pd.DataFrame: ``data`` with ``lag_1``..``lag_7``, 7- and 30-row rolling
    mean/sum/std, and ``expanding_sum``. Leading rows are NaN until enough
    history is available.
    """
    target = data[target_col]

    # Lags of one to seven rows.
    for i in range(1, 8):
        data[f'lag_{i}'] = target.shift(i)

    # Values strictly before the current row.
    history = target.shift(1)

    for window in (7, 30):
        rolling = history.rolling(window=window)
        data[f'rolling_mean_{window}'] = rolling.mean()
        data[f'rolling_sum_{window}'] = rolling.sum()
        data[f'rolling_std_{window}'] = rolling.std()

    data['expanding_sum'] = history.expanding().sum()

    return data

def preprocess_data(df, threshold=60000):
    """Build the feature frame used for model training from a date-indexed frame.

    Works on a copy, so the caller's frame is left untouched. Steps, in order:
    copy the index into a datetime ``ds`` column, add calendar features, add a
    UK public-holiday flag, add lag/rolling features of ``quantity``, back-fill
    remaining NaNs, and cap ``quantity`` at ``threshold``.

    Parameters:
    df (pd.DataFrame): Frame with a datetime-like index and a ``quantity`` column.
    threshold (float): Upper cap applied to ``quantity``. Defaults to 60000,
        the value that was previously hard-coded.

    Returns:
    pd.DataFrame: New frame with the feature columns added.
    """
    df = df.copy()

    # Ensure 'ds' column is datetime
    df['ds'] = df.index
    df['ds'] = pd.to_datetime(df['ds'])

    # Create date features
    df = create_date_features(df)

    # Determine public holidays
    holiday = holidays.UK()
    df['is_public_holiday'] = df['ds'].apply(lambda x: 1 if x in holiday else 0)

    # Create lag and window features
    # NOTE: these are computed from `quantity` before the cap below is applied.
    df = create_lag_and_window_features(df, 'quantity')

    # Fill any remaining NaN values
    # Each gap takes the next valid value below it. DataFrame.bfill() replaces
    # fillna(method='bfill'), whose `method` argument is deprecated in recent pandas.
    df = df.bfill()

    # Winsorize the 'quantity' column: values above the threshold become the threshold.
    df['quantity'] = df['quantity'].where(df['quantity'] <= threshold, threshold)

    return df

In [9]:
# Preview the first rows of the snapshot frame x.
x.head()

In [18]:
# Build the feature frame from x; preprocess_data operates on its own copy,
# so x itself is not modified.
b = preprocess_data(x)
b.head()

In [13]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import holidays

# Make a copy of the dataframe
# b is the preprocessed feature frame; copying keeps this cell from altering it.
rml_data = b.copy()


# Expose the target under the name 'y'.
rml_data['y'] = rml_data['quantity']

# Define features and target
# Calendar columns, holiday flag, lags 1-7, 7/30-row rolling statistics and the expanding sum.
features = ['month', 'day_of_month', 'is_month_start', 'is_month_end',
            'day_of_year', 'week_of_year', 'day_of_week', 'year',
            'is_weekend', 'is_spring', 'is_summer', 'is_fall', 'is_winter',
            'sin_day', 'cos_day', 'is_public_holiday', 'lag_1', 'lag_2', 
            'lag_3', 'lag_4', 'lag_5', 'lag_6', 'lag_7', 'rolling_mean_7', 
            'rolling_sum_7', 'rolling_std_7', 'rolling_mean_30', 
            'rolling_sum_30', 'rolling_std_30', 'expanding_sum']

target = 'y'

# Split the data into train and test sets
# The last 30 rows are the hold-out set; everything before them is training data.
train_df = rml_data[:-30]
test_df = rml_data[-30:]

X_train, y_train = train_df[features], train_df[target]
X_test, y_test = test_df[features], test_df[target]

# Define the models and hyperparameters for tuning
# Each entry maps a display name to (unfitted estimator, parameter grid).
models = {
    'LightGBM': (lgb.LGBMRegressor(), {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'num_leaves': [31, 127]
    }),
    'XGBoost': (XGBRegressor(objective='reg:squarederror'), {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 6]
    }),
    'RandomForest': (RandomForestRegressor(), {
        'n_estimators': [100, 200],
        # 1.0 (all features) stands in for the former 'auto' option, which current
        # scikit-learn releases no longer accept for RandomForestRegressor.
        'max_features': [1.0, 'sqrt'],
        'max_depth': [10, 20]
    })
}

# Per-model hold-out scores and the tuned estimator found for each model
mae_scores = {}
mape_scores = {}
tuned_estimators = {}

# Train and evaluate each model
for model_name, (model, params) in models.items():
    # NOTE(review): an integer cv uses scikit-learn's default fold splitter, which is
    # not time-ordered; consider a time-series split for this data.
    grid_search = GridSearchCV(model, params, cv=5, scoring='neg_mean_absolute_percentage_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Keep the tuned estimator so the winner can be refit with its best hyperparameters.
    best_model = grid_search.best_estimator_
    tuned_estimators[model_name] = best_model
    y_pred = best_model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)

    mae_scores[model_name] = mae
    mape_scores[model_name] = mape

    print(f"{model_name} - MAE: {mae:.4f}, MAPE: {mape:.4%}")

# Select the best model based on MAPE
# Use the tuned estimator; models[name][0] is the untuned prototype with default
# hyperparameters, so refitting it would discard the grid-search result.
best_model_name = min(mape_scores, key=mape_scores.get)
best_model = tuned_estimators[best_model_name]
print(f"\nThe best model is {best_model_name} with a MAPE of {mape_scores[best_model_name]:.4%}")

# Retrain the best model on the entire dataset
best_model.fit(rml_data[features], rml_data[target])

# Predict the next 30 days
# Calendar features are rebuilt by hand for the 30 days following the last 'ds' value.
future_dates = pd.date_range(start=rml_data['ds'].max() + pd.Timedelta(days=1), periods=30, freq='D')
future_df = pd.DataFrame({
    'ds': future_dates,
    'month': future_dates.month,
    'day_of_month': future_dates.day,
    'is_month_start': future_dates.is_month_start.astype(int),
    'is_month_end': future_dates.is_month_end.astype(int),
    'day_of_year': future_dates.dayofyear,
    'week_of_year': future_dates.isocalendar().week,
    'day_of_week': future_dates.dayofweek + 1,
    'year': future_dates.year,
    'is_weekend': (future_dates.weekday >= 5).astype(int),
    'is_spring': future_dates.month.isin([3, 4, 5]).astype(int),
    'is_summer': future_dates.month.isin([6, 7, 8]).astype(int),
    'is_fall': future_dates.month.isin([9, 10, 11]).astype(int),
    'is_winter': future_dates.month.isin([12, 1, 2]).astype(int),
    'sin_day': np.sin(2 * np.pi * future_dates.dayofweek / 7),
    'cos_day': np.cos(2 * np.pi * future_dates.dayofweek / 7)
})

# Add public holidays for the future dates
uk_holidays = holidays.UK()
future_df['is_public_holiday'] = future_df['ds'].apply(lambda date: 1 if date in uk_holidays else 0)

# Add lag features and rolling statistics for the future_df
# NOTE(review): these values are copied positionally from the last 30 historical rows
# of rml_data; they are not lags or windows relative to the future dates, so the
# "forecast" below pairs future calendar features with past-period lag features.
for lag in range(1, 8):
    future_df[f'lag_{lag}'] = rml_data[target].shift(lag).iloc[-30:].values

for window in [7, 30]:
    future_df[f'rolling_mean_{window}'] = rml_data[target].rolling(window=window).mean().iloc[-30:].values
    future_df[f'rolling_sum_{window}'] = rml_data[target].rolling(window=window).sum().iloc[-30:].values
    future_df[f'rolling_std_{window}'] = rml_data[target].rolling(window=window).std().iloc[-30:].values

future_df['expanding_sum'] = rml_data[target].expanding().sum().iloc[-30:].values

# Predict future values using the best model
future_predictions = best_model.predict(future_df[features])

# Plot actual vs predicted values
# NOTE(review): the two series cover different dates -- 'Actual' is the last 30
# historical rows, 'Predicted' is the 30 days after them -- so they do not overlap.
plt.figure(figsize=(14, 7))
plt.plot(test_df['ds'], test_df['y'], label='Actual', marker='o')
plt.plot(future_df['ds'], future_predictions, label=f'Predicted - {best_model_name}', marker='x')
plt.xlabel('Date')
plt.ylabel('Value')
plt.title('Actual vs Predicted Values')
plt.legend()
plt.grid(True)
plt.show()


In [14]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import holidays

# Make a copy of the dataframe
# b is the preprocessed feature frame; copying keeps this cell from altering it.
rml_data = b.copy()

# Ensure 'ds' column is datetime
rml_data['ds'] = pd.to_datetime(rml_data['ds'])
# Expose the target under the name 'y'.
rml_data['y'] = rml_data['quantity']

# Define features and target
# Calendar columns, holiday flag, lags 1-7, 7/30-row rolling statistics and the expanding sum.
features = ['month', 'day_of_month', 'is_month_start', 'is_month_end',
            'day_of_year', 'week_of_year', 'day_of_week', 'year',
            'is_weekend', 'is_spring', 'is_summer', 'is_fall', 'is_winter',
            'sin_day', 'cos_day', 'is_public_holiday', 'lag_1', 'lag_2', 
            'lag_3', 'lag_4', 'lag_5', 'lag_6', 'lag_7', 'rolling_mean_7', 
            'rolling_sum_7', 'rolling_std_7', 'rolling_mean_30', 
            'rolling_sum_30', 'rolling_std_30', 'expanding_sum']

target = 'y'

# Split the data into train and test sets
# The last 30 rows are the hold-out set; everything before them is training data.
train_df = rml_data[:-30]
test_df = rml_data[-30:]

X_train, y_train = train_df[features], train_df[target]
X_test, y_test = test_df[features], test_df[target]

# Define the models and hyperparameters for tuning
# Each entry maps a display name to (unfitted estimator, parameter grid).
models = {
    'LightGBM': (lgb.LGBMRegressor(), {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'num_leaves': [31, 127]
    }),
    'XGBoost': (XGBRegressor(objective='reg:squarederror'), {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 6]
    }),
    'RandomForest': (RandomForestRegressor(), {
        'n_estimators': [100, 200],
        # 1.0 (all features) stands in for the former 'auto' option, which current
        # scikit-learn releases no longer accept for RandomForestRegressor.
        'max_features': [1.0, 'sqrt'],
        'max_depth': [10, 20]
    })
}

# Initialize dicts to store the scores and best hyperparameters, keyed by model name
mae_scores = {}
mape_scores = {}
best_params = {}

# Train and evaluate each model
for model_name, (model, params) in models.items():
    # NOTE(review): an integer cv uses scikit-learn's default fold splitter, which is
    # not time-ordered; consider a time-series split for this data.
    grid_search = GridSearchCV(model, params, cv=5, scoring='neg_mean_absolute_percentage_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    # Score the tuned estimator on the 30-row hold-out set.
    best_model = grid_search.best_estimator_
    best_params[model_name] = grid_search.best_params_
    y_pred = best_model.predict(X_test)
    
    mae = mean_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    
    mae_scores[model_name] = mae
    mape_scores[model_name] = mape
    
    print(f"{model_name} - Best Params: {grid_search.best_params_}")
    print(f"{model_name} - MAE: {mae:.4f}, MAPE: {mape:.4%}")

# Select the best model based on MAPE
# models[name][0] is the untuned prototype; the tuned hyperparameters are applied
# to it with set_params() just before the refit below.
best_model_name = min(mape_scores, key=mape_scores.get)
best_model = models[best_model_name][0]
print(f"\nThe best model is {best_model_name} with a MAPE of {mape_scores[best_model_name]:.4%}")
print(f"Best Hyperparameters: {best_params[best_model_name]}")

# Retrain the best model on the entire dataset
best_model.set_params(**best_params[best_model_name])
best_model.fit(rml_data[features], rml_data[target])

# Predict the next 30 days
# Calendar features are rebuilt by hand for the 30 days following the last 'ds' value.
future_dates = pd.date_range(start=rml_data['ds'].max() + pd.Timedelta(days=1), periods=30, freq='D')
future_df = pd.DataFrame({
    'ds': future_dates,
    'month': future_dates.month,
    'day_of_month': future_dates.day,
    'is_month_start': future_dates.is_month_start.astype(int),
    'is_month_end': future_dates.is_month_end.astype(int),
    'day_of_year': future_dates.dayofyear,
    'week_of_year': future_dates.isocalendar().week,
    'day_of_week': future_dates.dayofweek + 1,
    'year': future_dates.year,
    'is_weekend': (future_dates.weekday >= 5).astype(int),
    'is_spring': future_dates.month.isin([3, 4, 5]).astype(int),
    'is_summer': future_dates.month.isin([6, 7, 8]).astype(int),
    'is_fall': future_dates.month.isin([9, 10, 11]).astype(int),
    'is_winter': future_dates.month.isin([12, 1, 2]).astype(int),
    'sin_day': np.sin(2 * np.pi * future_dates.dayofweek / 7),
    'cos_day': np.cos(2 * np.pi * future_dates.dayofweek / 7)
})

# Add public holidays for the future dates
uk_holidays = holidays.UK()
future_df['is_public_holiday'] = future_df['ds'].apply(lambda date: 1 if date in uk_holidays else 0)

# Add lag features and rolling statistics for the future_df
# NOTE(review): these values are copied positionally from the last 30 historical rows
# of rml_data; they are not lags or windows relative to the future dates, so the
# "forecast" below pairs future calendar features with past-period lag features.
for lag in range(1, 8):
    future_df[f'lag_{lag}'] = rml_data[target].shift(lag).iloc[-30:].values

for window in [7, 30]:
    future_df[f'rolling_mean_{window}'] = rml_data[target].rolling(window=window).mean().iloc[-30:].values
    future_df[f'rolling_sum_{window}'] = rml_data[target].rolling(window=window).sum().iloc[-30:].values
    future_df[f'rolling_std_{window}'] = rml_data[target].rolling(window=window).std().iloc[-30:].values

future_df['expanding_sum'] = rml_data[target].expanding().sum().iloc[-30:].values

# Predict future values using the best model
future_predictions = best_model.predict(future_df[features])

# Plot actual vs predicted values
# 'Actual' is the full history; 'Predicted' is drawn only for the 30 days after it.
plt.figure(figsize=(14, 7))
plt.plot(rml_data['ds'], rml_data['y'], label='Actual', marker='o')
plt.plot(future_df['ds'], future_predictions, label=f'Predicted - {best_model_name}', marker='x')
plt.xlabel('Date')
plt.ylabel('Value')
plt.title('Actual vs Predicted Values')
plt.legend()
plt.grid(True)
plt.show()


In [15]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
import matplotlib.pyplot as plt
import holidays

# Make a copy of the dataframe
# b is the preprocessed feature frame; copying keeps this cell from altering it.
rml_data = b.copy()

# Ensure 'ds' column is datetime
rml_data['ds'] = pd.to_datetime(rml_data['ds'])
# Expose the target under the name 'y'.
rml_data['y'] = rml_data['quantity']

# Define features and target
# Calendar columns, holiday flag, lags 1-7, 7/30-row rolling statistics and the expanding sum.
features = ['month', 'day_of_month', 'is_month_start', 'is_month_end',
            'day_of_year', 'week_of_year', 'day_of_week', 'year',
            'is_weekend', 'is_spring', 'is_summer', 'is_fall', 'is_winter',
            'sin_day', 'cos_day', 'is_public_holiday', 'lag_1', 'lag_2', 
            'lag_3', 'lag_4', 'lag_5', 'lag_6', 'lag_7', 'rolling_mean_7', 
            'rolling_sum_7', 'rolling_std_7', 'rolling_mean_30', 
            'rolling_sum_30', 'rolling_std_30', 'expanding_sum']

target = 'y'

# Split the data into train and test sets
# The last 30 rows are the hold-out set; everything before them is training data.
train_df = rml_data[:-30]
test_df = rml_data[-30:]

X_train, y_train = train_df[features], train_df[target]
X_test, y_test = test_df[features], test_df[target]

# Define the best parameters for the XGBoost model
# Hard-coded hyperparameters (presumably taken from an earlier grid-search run -- TODO confirm).
best_params = {
    'n_estimators': 200,
    'learning_rate': 0.1,
    'max_depth': 3
}

# Train the XGBoost model with the best parameters
best_model = XGBRegressor(objective='reg:squarederror', **best_params)
best_model.fit(X_train, y_train)

# Predict the entire historical period
predictions = best_model.predict(rml_data[features])

# Calculate MAPE for the predictions
# This figure spans the whole history, and all but the last 30 rows were used to
# fit the model, so it is mostly an in-sample error.
mape = mean_absolute_percentage_error(rml_data[target], predictions)
print(f"MAPE: {mape:.4%}")

# Error on the held-out rows only: the estimate of accuracy on unseen data.
test_mape = mean_absolute_percentage_error(y_test, best_model.predict(X_test))
print(f"Hold-out MAPE (last {len(y_test)} rows): {test_mape:.4%}")

# Plot actual vs predicted values
plt.figure(figsize=(14, 7))

# Plot the actual values
plt.plot(rml_data['ds'], rml_data['y'], label='Actual')

# Plot the predicted values
plt.plot(rml_data['ds'], predictions, label='Predicted')

plt.xlabel('Date')
plt.ylabel('Quantity')
plt.title('Actual vs Predicted Values')
plt.legend()
plt.grid(True)
plt.show()

# Print the best hyperparameters
print(f"Best Hyperparameters: {best_params}")
