# Sales Forecasting

In [5]:
# Import the necessary libraries
import warnings
warnings.filterwarnings('ignore')
import random

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gr
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error
from tabulate import tabulate

In [6]:
# Read the processed sales file with 'date' parsed as the index, put it on a
# daily frequency (dates absent from the file become NaN rows) and keep only
# the 'quantity' column as a one-column DataFrame.
df = (
    pd.read_csv("processed.csv", parse_dates=['date'], index_col='date')
    .asfreq('D')
    .loc[:, ["quantity"]]
)
df.head()

In [7]:
import pandas as pd
import numpy as np
import holidays

def create_date_features(data):
    """Add calendar-based feature columns derived from the 'ds' column.

    The frame is modified in place and also returned.

    Args:
        data: DataFrame with a datetime64 column named 'ds'.

    Returns:
        The same DataFrame with these columns added: month, day_of_month,
        is_month_start, is_month_end, day_of_year, week_of_year,
        day_of_week (1 = Monday ... 7 = Sunday), year, is_weekend,
        is_spring, is_summer, is_fall, is_winter, sin_day, cos_day.
    """
    ds = data['ds'].dt

    data["month"] = ds.month
    data["day_of_month"] = ds.day
    data["is_month_start"] = ds.is_month_start.astype(int)
    data["is_month_end"] = ds.is_month_end.astype(int)
    data["day_of_year"] = ds.dayofyear
    # isocalendar().week is a nullable UInt32 extension column; cast it to a
    # plain NumPy integer so it matches the dtype family of the other features.
    data["week_of_year"] = ds.isocalendar().week.astype(int)
    data["day_of_week"] = ds.dayofweek + 1
    data["year"] = ds.year
    data["is_weekend"] = (ds.weekday >= 5).astype(int)

    # One-hot meteorological seasons keyed on the month number
    data['is_spring'] = data['month'].isin([3, 4, 5]).astype(int)
    data['is_summer'] = data['month'].isin([6, 7, 8]).astype(int)
    data['is_fall'] = data['month'].isin([9, 10, 11]).astype(int)
    data['is_winter'] = data['month'].isin([12, 1, 2]).astype(int)

    # Cyclical encoding of the weekday (0 = Monday) on a 7-day circle
    weekday_angle = 2 * np.pi * ds.dayofweek / 7
    data['sin_day'] = np.sin(weekday_angle)
    data['cos_day'] = np.cos(weekday_angle)
    return data

def create_lag_and_window_features(data, target_col, n_lags=7, windows=(7, 30)):
    """Add lag, rolling-window and expanding-sum features of ``target_col``.

    Every feature on a row is computed only from rows *before* it, so the
    row's own target value never leaks into its features. The frame is
    modified in place and also returned.

    Args:
        data: DataFrame ordered by time, containing ``target_col``.
        target_col: Name of the column to derive features from.
        n_lags: Number of lag columns to create (``lag_1`` .. ``lag_<n_lags>``).
        windows: Window lengths for the rolling mean/sum/std columns.

    Returns:
        The same DataFrame with the columns ``lag_<i>``,
        ``rolling_mean_<w>``, ``rolling_sum_<w>``, ``rolling_std_<w>`` and
        ``expanding_sum`` added. Leading rows without enough history are NaN.
    """
    series = data[target_col]

    for lag in range(1, n_lags + 1):
        data[f'lag_{lag}'] = series.shift(lag)

    # Shift by one step first so each window ends at the previous observation;
    # rolling directly over the target would include the value being predicted.
    past = series.shift(1)

    for window in windows:
        rolling = past.rolling(window=window)
        data[f'rolling_mean_{window}'] = rolling.mean()
        data[f'rolling_sum_{window}'] = rolling.sum()
        data[f'rolling_std_{window}'] = rolling.std()

    data['expanding_sum'] = past.expanding().sum()

    return data

def preprocess_data(df, threshold=60000):
    """Build the modelling frame from a date-indexed 'quantity' DataFrame.

    Adds a 'ds' column taken from the index, calendar features, a UK
    public-holiday flag, lag/rolling features of 'quantity', back-fills
    missing values, caps 'quantity' at ``threshold`` and exposes the capped
    series as the target column 'y'. The input frame is not modified.

    Args:
        df: DataFrame indexed by date with a 'quantity' column.
        threshold: Upper cap applied to 'quantity' (winsorization).

    Returns:
        A new DataFrame with the feature columns and the target 'y'.
    """
    df = df.copy()

    # Ensure 'ds' column is datetime
    df['ds'] = pd.to_datetime(df.index)

    # Create date features
    df = create_date_features(df)

    # Determine public holidays
    holiday = holidays.UK()
    df['is_public_holiday'] = df['ds'].apply(lambda x: 1 if x in holiday else 0)

    # Create lag and window features
    # NOTE(review): these are derived from 'quantity' before it is capped
    # below, while the target 'y' is capped — confirm this is intended.
    df = create_lag_and_window_features(df, 'quantity')

    # Back-fill remaining NaNs (e.g. the leading rows that lags and rolling
    # windows leave empty). DataFrame.bfill() replaces the deprecated
    # fillna(method='bfill') spelling and behaves the same.
    df = df.bfill()

    # Winsorize 'quantity'. clip() leaves NaN untouched, whereas
    # where(q <= threshold, threshold) would turn a NaN into the threshold.
    df['quantity'] = df['quantity'].clip(upper=threshold)
    df['y'] = df['quantity']

    return df

In [8]:
# Build the modelling frame (date, holiday, lag and rolling features plus the
# target 'y') from the daily 'quantity' data, then preview it.
b = preprocess_data(df)
b.head()

## ML

In [9]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
import matplotlib.pyplot as plt
import holidays

# Forecast horizon in days; also the size of the hold-out test window
horizon = 60

# Make a copy of the dataframe so the preprocessed frame `b` stays untouched
rml_data = b.copy()


# Define features and target
features = ['month', 'day_of_month', 'is_month_start', 'is_month_end',
            'day_of_year', 'week_of_year', 'day_of_week', 'year',
            'is_weekend', 'is_spring', 'is_summer', 'is_fall', 'is_winter',
            'sin_day', 'cos_day', 'is_public_holiday', 'lag_1', 'lag_2',
            'lag_3', 'lag_4', 'lag_5', 'lag_6', 'lag_7', 'rolling_mean_7',
            'rolling_sum_7', 'rolling_std_7', 'rolling_mean_30',
            'rolling_sum_30', 'rolling_std_30', 'expanding_sum']

target = 'y'

# Chronological split: the last `horizon` rows form the test set
train_df = rml_data[:-horizon]
test_df = rml_data[-horizon:]

X_train, y_train = train_df[features], train_df[target]
X_test, y_test = test_df[features], test_df[target]

# Define the models
models = {
    'LightGBM': lgb.LGBMRegressor(n_estimators=100, verbosity=-1),
    'XGBoost': XGBRegressor(n_estimators=100, verbosity=0),
    'RandomForest': RandomForestRegressor(n_estimators=100)
}

# Scores per model name
mae_scores = {}
mape_scores = {}

# Train and evaluate each model on the hold-out window
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)

    mae_scores[model_name] = mae
    mape_scores[model_name] = mape

    print(f"{model_name} - MAE: {mae:.4f}, MAPE: {mape:.4%}")

# Select the best model based on MAPE
best_model_name = min(mape_scores, key=mape_scores.get)
best_model = models[best_model_name]
print(f"\nThe best model is {best_model_name} with a MAPE of {mape_scores[best_model_name]:.4%}")

# Retrain the best model on the entire dataset
best_model.fit(rml_data[features], rml_data[target])

# Recursive multi-step forecast for the next `horizon` days.
# Lag and window features of a future day depend on values that are themselves
# unknown, so each day is predicted in turn and its prediction is appended to
# the history used for the following day. (Copying the lag/rolling columns of
# the last historical rows would feed the model stale features that do not
# belong to the forecast dates.)
future_dates = pd.date_range(start=rml_data['ds'].max() + pd.Timedelta(days=1), periods=horizon, freq='D')
uk_holidays = holidays.UK()
history = rml_data[target].tolist()

future_rows = []
future_predictions = []
for date in future_dates:
    row = {
        'ds': date,
        'month': date.month,
        'day_of_month': date.day,
        'is_month_start': int(date.is_month_start),
        'is_month_end': int(date.is_month_end),
        'day_of_year': date.dayofyear,
        'week_of_year': date.isocalendar()[1],
        'day_of_week': date.dayofweek + 1,
        'year': date.year,
        'is_weekend': int(date.dayofweek >= 5),
        'is_spring': int(date.month in (3, 4, 5)),
        'is_summer': int(date.month in (6, 7, 8)),
        'is_fall': int(date.month in (9, 10, 11)),
        'is_winter': int(date.month in (12, 1, 2)),
        'sin_day': np.sin(2 * np.pi * date.dayofweek / 7),
        'cos_day': np.cos(2 * np.pi * date.dayofweek / 7),
        'is_public_holiday': int(date in uk_holidays),
    }

    # history[-1] is the value of the day before `date`, i.e. lag_1
    for lag in range(1, 8):
        row[f'lag_{lag}'] = history[-lag]

    # Window statistics over the most recent known values (up to the previous
    # day); ddof=1 matches the sample std that pandas' rolling().std() uses.
    for window in [7, 30]:
        tail = history[-window:]
        row[f'rolling_mean_{window}'] = np.mean(tail)
        row[f'rolling_sum_{window}'] = np.sum(tail)
        row[f'rolling_std_{window}'] = np.std(tail, ddof=1)

    row['expanding_sum'] = np.sum(history)

    prediction = float(best_model.predict(pd.DataFrame([row])[features])[0])
    history.append(prediction)
    future_rows.append(row)
    future_predictions.append(prediction)

future_df = pd.DataFrame(future_rows)
future_predictions = np.array(future_predictions)

# Plot the held-out actuals followed by the forecast for the future dates
plt.figure(figsize=(14, 7))
plt.plot(test_df['ds'], test_df['y'], label='Actual', marker='o')
plt.plot(future_df['ds'], future_predictions, label=f'Predicted - {best_model_name}', marker='x')
plt.xlabel('Date')
plt.ylabel('Value')
plt.title('Actual vs Predicted Values')
plt.legend()
plt.grid(True)
plt.show()


In [10]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
import matplotlib.pyplot as plt
import holidays

# Forecast horizon in days; also the size of the hold-out test window
horizon = 30

# Make a copy of the dataframe so the preprocessed frame `b` stays untouched
rml_data = b.copy()


# Define features and target
features = ['month', 'day_of_month', 'is_month_start', 'is_month_end',
            'day_of_year', 'week_of_year', 'day_of_week', 'year',
            'is_weekend', 'is_spring', 'is_summer', 'is_fall', 'is_winter',
            'sin_day', 'cos_day', 'is_public_holiday', 'lag_1', 'lag_2',
            'lag_3', 'lag_4', 'lag_5', 'lag_6', 'lag_7', 'rolling_mean_7',
            'rolling_sum_7', 'rolling_std_7', 'rolling_mean_30',
            'rolling_sum_30', 'rolling_std_30', 'expanding_sum']

target = 'y'

# Chronological split: the last `horizon` rows form the test set
train_df = rml_data[:-horizon]
test_df = rml_data[-horizon:]

X_train, y_train = train_df[features], train_df[target]
X_test, y_test = test_df[features], test_df[target]

# Define the models
models = {
    'LightGBM': lgb.LGBMRegressor(n_estimators=100, verbosity=-1),
    'XGBoost': XGBRegressor(n_estimators=100, verbosity=0),
    'RandomForest': RandomForestRegressor(n_estimators=100)
}

# Scores per model name
mae_scores = {}
mape_scores = {}

# Train and evaluate each model on the hold-out window
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)

    mae_scores[model_name] = mae
    mape_scores[model_name] = mape

    print(f"{model_name} - MAE: {mae:.4f}, MAPE: {mape:.4%}")

# Select the best model based on MAPE
best_model_name = min(mape_scores, key=mape_scores.get)
best_model = models[best_model_name]
print(f"\nThe best model is {best_model_name} with a MAPE of {mape_scores[best_model_name]:.4%}")

# Retrain the best model on the entire dataset
best_model.fit(rml_data[features], rml_data[target])

# Recursive multi-step forecast for the next `horizon` days.
# Lag and window features of a future day depend on values that are themselves
# unknown, so each day is predicted in turn and its prediction is appended to
# the history used for the following day. (Copying the lag/rolling columns of
# the last historical rows would feed the model stale features that do not
# belong to the forecast dates.)
future_dates = pd.date_range(start=rml_data['ds'].max() + pd.Timedelta(days=1), periods=horizon, freq='D')
uk_holidays = holidays.UK()
history = rml_data[target].tolist()

future_rows = []
future_predictions = []
for date in future_dates:
    row = {
        'ds': date,
        'month': date.month,
        'day_of_month': date.day,
        'is_month_start': int(date.is_month_start),
        'is_month_end': int(date.is_month_end),
        'day_of_year': date.dayofyear,
        'week_of_year': date.isocalendar()[1],
        'day_of_week': date.dayofweek + 1,
        'year': date.year,
        'is_weekend': int(date.dayofweek >= 5),
        'is_spring': int(date.month in (3, 4, 5)),
        'is_summer': int(date.month in (6, 7, 8)),
        'is_fall': int(date.month in (9, 10, 11)),
        'is_winter': int(date.month in (12, 1, 2)),
        'sin_day': np.sin(2 * np.pi * date.dayofweek / 7),
        'cos_day': np.cos(2 * np.pi * date.dayofweek / 7),
        'is_public_holiday': int(date in uk_holidays),
    }

    # history[-1] is the value of the day before `date`, i.e. lag_1
    for lag in range(1, 8):
        row[f'lag_{lag}'] = history[-lag]

    # Window statistics over the most recent known values (up to the previous
    # day); ddof=1 matches the sample std that pandas' rolling().std() uses.
    for window in [7, 30]:
        tail = history[-window:]
        row[f'rolling_mean_{window}'] = np.mean(tail)
        row[f'rolling_sum_{window}'] = np.sum(tail)
        row[f'rolling_std_{window}'] = np.std(tail, ddof=1)

    row['expanding_sum'] = np.sum(history)

    prediction = float(best_model.predict(pd.DataFrame([row])[features])[0])
    history.append(prediction)
    future_rows.append(row)
    future_predictions.append(prediction)

future_df = pd.DataFrame(future_rows)
future_predictions = np.array(future_predictions)

# Plot the held-out actuals followed by the forecast for the future dates
plt.figure(figsize=(14, 7))
plt.plot(test_df['ds'], test_df['y'], label='Actual', marker='o')
plt.plot(future_df['ds'], future_predictions, label=f'Predicted - {best_model_name}', marker='x')
plt.xlabel('Date')
plt.ylabel('Value')
plt.title('Actual vs Predicted Values')
plt.legend()
plt.grid(True)
plt.show()


In [11]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import holidays

from sklearn.model_selection import TimeSeriesSplit

# Forecast horizon in days; also the size of the hold-out test window
horizon = 30

# Make a copy of the dataframe so the preprocessed frame `b` stays untouched
rml_data = b.copy()

# Ensure 'ds' column is datetime
rml_data['ds'] = pd.to_datetime(rml_data['ds'])

# Define features and target
features = ['month', 'day_of_month', 'is_month_start', 'is_month_end',
            'day_of_year', 'week_of_year', 'day_of_week', 'year',
            'is_weekend', 'is_spring', 'is_summer', 'is_fall', 'is_winter',
            'sin_day', 'cos_day', 'is_public_holiday', 'lag_1', 'lag_2',
            'lag_3', 'lag_4', 'lag_5', 'lag_6', 'lag_7', 'rolling_mean_7',
            'rolling_sum_7', 'rolling_std_7', 'rolling_mean_30',
            'rolling_sum_30', 'rolling_std_30', 'expanding_sum']

target = 'y'

# Chronological split: the last `horizon` rows form the test set
train_df = rml_data[:-horizon]
test_df = rml_data[-horizon:]

X_train, y_train = train_df[features], train_df[target]
X_test, y_test = test_df[features], test_df[target]

# Define the models and hyperparameters for tuning
models = {
    'LightGBM': (lgb.LGBMRegressor(), {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'num_leaves': [31, 127]
    }),
    'XGBoost': (XGBRegressor(objective='reg:squarederror'), {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 6]
    }),
    'RandomForest': (RandomForestRegressor(), {
        'n_estimators': [100, 200],
        # 1.0 (all features) is what 'auto' meant for a forest regressor;
        # the 'auto' string is no longer accepted by current scikit-learn.
        'max_features': [1.0, 'sqrt'],
        'max_depth': [10, 20]
    })
}

# Scores and tuned estimators per model name
mae_scores = {}
mape_scores = {}
tuned_models = {}

# Folds must respect time order: each validation fold comes after its
# training fold, unlike plain k-fold which would validate on the past.
cv_splitter = TimeSeriesSplit(n_splits=5)

# Tune and evaluate each model
for model_name, (model, params) in models.items():
    grid_search = GridSearchCV(model, params, cv=cv_splitter, scoring='neg_mean_absolute_percentage_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    tuned_model = grid_search.best_estimator_
    tuned_models[model_name] = tuned_model
    y_pred = tuned_model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)

    mae_scores[model_name] = mae
    mape_scores[model_name] = mape

    print(f"{model_name} - MAE: {mae:.4f}, MAPE: {mape:.4%}")

# Select the best model based on MAPE. Use the tuned estimator that produced
# the score, not the untuned base estimator stored in `models`.
best_model_name = min(mape_scores, key=mape_scores.get)
best_model = tuned_models[best_model_name]
print(f"\nThe best model is {best_model_name} with a MAPE of {mape_scores[best_model_name]:.4%}")

# Retrain the best model (with its tuned hyperparameters) on the entire dataset
best_model.fit(rml_data[features], rml_data[target])

# Recursive multi-step forecast for the next `horizon` days.
# Lag and window features of a future day depend on values that are themselves
# unknown, so each day is predicted in turn and its prediction is appended to
# the history used for the following day. (Copying the lag/rolling columns of
# the last historical rows would feed the model stale features that do not
# belong to the forecast dates.)
future_dates = pd.date_range(start=rml_data['ds'].max() + pd.Timedelta(days=1), periods=horizon, freq='D')
uk_holidays = holidays.UK()
history = rml_data[target].tolist()

future_rows = []
future_predictions = []
for date in future_dates:
    row = {
        'ds': date,
        'month': date.month,
        'day_of_month': date.day,
        'is_month_start': int(date.is_month_start),
        'is_month_end': int(date.is_month_end),
        'day_of_year': date.dayofyear,
        'week_of_year': date.isocalendar()[1],
        'day_of_week': date.dayofweek + 1,
        'year': date.year,
        'is_weekend': int(date.dayofweek >= 5),
        'is_spring': int(date.month in (3, 4, 5)),
        'is_summer': int(date.month in (6, 7, 8)),
        'is_fall': int(date.month in (9, 10, 11)),
        'is_winter': int(date.month in (12, 1, 2)),
        'sin_day': np.sin(2 * np.pi * date.dayofweek / 7),
        'cos_day': np.cos(2 * np.pi * date.dayofweek / 7),
        'is_public_holiday': int(date in uk_holidays),
    }

    # history[-1] is the value of the day before `date`, i.e. lag_1
    for lag in range(1, 8):
        row[f'lag_{lag}'] = history[-lag]

    # Window statistics over the most recent known values (up to the previous
    # day); ddof=1 matches the sample std that pandas' rolling().std() uses.
    for window in [7, 30]:
        tail = history[-window:]
        row[f'rolling_mean_{window}'] = np.mean(tail)
        row[f'rolling_sum_{window}'] = np.sum(tail)
        row[f'rolling_std_{window}'] = np.std(tail, ddof=1)

    row['expanding_sum'] = np.sum(history)

    prediction = float(best_model.predict(pd.DataFrame([row])[features])[0])
    history.append(prediction)
    future_rows.append(row)
    future_predictions.append(prediction)

future_df = pd.DataFrame(future_rows)
future_predictions = np.array(future_predictions)

# Plot the held-out actuals followed by the forecast for the future dates
plt.figure(figsize=(14, 7))
plt.plot(test_df['ds'], test_df['y'], label='Actual', marker='o')
plt.plot(future_df['ds'], future_predictions, label=f'Predicted - {best_model_name}', marker='x')
plt.xlabel('Date')
plt.ylabel('Value')
plt.title('Actual vs Predicted Values')
plt.legend()
plt.grid(True)
plt.show()


In [12]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import holidays

from sklearn.model_selection import TimeSeriesSplit

# Forecast horizon in days; also the size of the hold-out test window
horizon = 30

# Make a copy of the dataframe so the preprocessed frame `b` stays untouched
rml_data = b.copy()

# Ensure 'ds' column is datetime
rml_data['ds'] = pd.to_datetime(rml_data['ds'])

# Define features and target
features = ['month', 'day_of_month', 'is_month_start', 'is_month_end',
            'day_of_year', 'week_of_year', 'day_of_week', 'year',
            'is_weekend', 'is_spring', 'is_summer', 'is_fall', 'is_winter',
            'sin_day', 'cos_day', 'is_public_holiday', 'lag_1', 'lag_2',
            'lag_3', 'lag_4', 'lag_5', 'lag_6', 'lag_7', 'rolling_mean_7',
            'rolling_sum_7', 'rolling_std_7', 'rolling_mean_30',
            'rolling_sum_30', 'rolling_std_30', 'expanding_sum']

target = 'y'

# Chronological split: the last `horizon` rows form the test set
train_df = rml_data[:-horizon]
test_df = rml_data[-horizon:]

X_train, y_train = train_df[features], train_df[target]
X_test, y_test = test_df[features], test_df[target]

# Define the models and hyperparameters for tuning
models = {
    'LightGBM': (lgb.LGBMRegressor(), {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'num_leaves': [31, 127]
    }),
    'XGBoost': (XGBRegressor(objective='reg:squarederror'), {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 6]
    }),
    'RandomForest': (RandomForestRegressor(), {
        'n_estimators': [100, 200],
        # 1.0 (all features) is what 'auto' meant for a forest regressor;
        # the 'auto' string is no longer accepted by current scikit-learn.
        'max_features': [1.0, 'sqrt'],
        'max_depth': [10, 20]
    })
}

# Scores and tuned estimators per model name
mae_scores = {}
mape_scores = {}
tuned_models = {}

# Folds must respect time order: each validation fold comes after its
# training fold, unlike plain k-fold which would validate on the past.
cv_splitter = TimeSeriesSplit(n_splits=5)

# Tune and evaluate each model
for model_name, (model, params) in models.items():
    grid_search = GridSearchCV(model, params, cv=cv_splitter, scoring='neg_mean_absolute_percentage_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    tuned_model = grid_search.best_estimator_
    tuned_models[model_name] = tuned_model
    y_pred = tuned_model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)

    mae_scores[model_name] = mae
    mape_scores[model_name] = mape

    print(f"{model_name} - MAE: {mae:.4f}, MAPE: {mape:.4%}")

# Select the best model based on MAPE. Use the tuned estimator that produced
# the score, not the untuned base estimator stored in `models`.
best_model_name = min(mape_scores, key=mape_scores.get)
best_model = tuned_models[best_model_name]
print(f"\nThe best model is {best_model_name} with a MAPE of {mape_scores[best_model_name]:.4%}")

# Retrain the best model (with its tuned hyperparameters) on the entire dataset
best_model.fit(rml_data[features], rml_data[target])

# Recursive multi-step forecast for the next `horizon` days.
# Lag and window features of a future day depend on values that are themselves
# unknown, so each day is predicted in turn and its prediction is appended to
# the history used for the following day. (Copying the lag/rolling columns of
# the last historical rows would feed the model stale features that do not
# belong to the forecast dates.)
future_dates = pd.date_range(start=rml_data['ds'].max() + pd.Timedelta(days=1), periods=horizon, freq='D')
uk_holidays = holidays.UK()
history = rml_data[target].tolist()

future_rows = []
future_predictions = []
for date in future_dates:
    row = {
        'ds': date,
        'month': date.month,
        'day_of_month': date.day,
        'is_month_start': int(date.is_month_start),
        'is_month_end': int(date.is_month_end),
        'day_of_year': date.dayofyear,
        'week_of_year': date.isocalendar()[1],
        'day_of_week': date.dayofweek + 1,
        'year': date.year,
        'is_weekend': int(date.dayofweek >= 5),
        'is_spring': int(date.month in (3, 4, 5)),
        'is_summer': int(date.month in (6, 7, 8)),
        'is_fall': int(date.month in (9, 10, 11)),
        'is_winter': int(date.month in (12, 1, 2)),
        'sin_day': np.sin(2 * np.pi * date.dayofweek / 7),
        'cos_day': np.cos(2 * np.pi * date.dayofweek / 7),
        'is_public_holiday': int(date in uk_holidays),
    }

    # history[-1] is the value of the day before `date`, i.e. lag_1
    for lag in range(1, 8):
        row[f'lag_{lag}'] = history[-lag]

    # Window statistics over the most recent known values (up to the previous
    # day); ddof=1 matches the sample std that pandas' rolling().std() uses.
    for window in [7, 30]:
        tail = history[-window:]
        row[f'rolling_mean_{window}'] = np.mean(tail)
        row[f'rolling_sum_{window}'] = np.sum(tail)
        row[f'rolling_std_{window}'] = np.std(tail, ddof=1)

    row['expanding_sum'] = np.sum(history)

    prediction = float(best_model.predict(pd.DataFrame([row])[features])[0])
    history.append(prediction)
    future_rows.append(row)
    future_predictions.append(prediction)

future_df = pd.DataFrame(future_rows)
future_predictions = np.array(future_predictions)

# Plot the held-out actuals followed by the forecast for the future dates
plt.figure(figsize=(14, 7))
plt.plot(test_df['ds'], test_df['y'], label='Actual', marker='o')
plt.plot(future_df['ds'], future_predictions, label=f'Predicted - {best_model_name}', marker='x')
plt.xlabel('Date')
plt.ylabel('Value')
plt.title('Actual vs Predicted Values')
plt.legend()
plt.grid(True)
plt.show()


In [16]:
import pandas as pd
import numpy as np
from neuralforecast import NeuralForecast
from neuralforecast.models import NHITS, NBEATS, LSTM
from sklearn.metrics import mean_absolute_percentage_error
import matplotlib.pyplot as plt

# Fresh working copy of the preprocessed frame with a parsed 'ds' column and
# the constant series identifier passed to NeuralForecast as id_col.
rml_data = b.copy()
rml_data = rml_data.assign(ds=pd.to_datetime(rml_data['ds']), unique_id='series_1')

# Hold out the final 60 rows for evaluation; fit on everything before them
train_df, test_df = rml_data.iloc[:-60], rml_data.iloc[-60:]

# All three architectures share the same settings, kept as separate dicts
nhits_params = dict(h=60, input_size=30, max_steps=50)
nbeats_params = dict(nhits_params)
lstm_params = dict(nhits_params)

# Instantiate one model per architecture
nhits_model = NHITS(**nhits_params)
nbeats_model = NBEATS(**nbeats_params)
lstm_model = LSTM(**lstm_params)

# Fit each candidate on the training window and score it on the hold-out rows
models = [nhits_model, nbeats_model, lstm_model]
model_names = ['NHITS', 'NBEATS', 'LSTM']
mape_scores = []

for position, name in enumerate(model_names):
    nf = NeuralForecast(models=[models[position]], freq='D')
    nf.fit(df=train_df, id_col='unique_id', time_col='ds', target_col='y')
    forecasts = nf.predict(futr_df=test_df)
    print(forecasts.head())
    mape = mean_absolute_percentage_error(test_df['y'], forecasts[name])
    mape_scores.append((name, mape))

# The (name, MAPE) pair with the lowest MAPE wins
best_model_name, best_mape = min(mape_scores, key=lambda pair: pair[1])

# Build a new, unfitted instance of the winning architecture
constructors = {
    'NHITS': (NHITS, nhits_params),
    'NBEATS': (NBEATS, nbeats_params),
    'LSTM': (LSTM, lstm_params),
}
best_class, best_kwargs = constructors[best_model_name]
best_model = best_class(**best_kwargs)

# Fit the winner on the complete series
nf_best_model = NeuralForecast(models=[best_model], freq='D')
nf_best_model.fit(df=rml_data, id_col='unique_id', time_col='ds', target_col='y')

# Frame describing the 60 days that follow the last observed date
future_dates = pd.date_range(rml_data['ds'].max() + pd.Timedelta(days=1), periods=60, freq='D')
future_df = pd.DataFrame({'ds': future_dates})
future_df['unique_id'] = 'series_1'

final_forecasts = nf_best_model.predict(futr_df=future_df)

# Draw the held-out actuals and the forecast for the following 60 days
plt.figure(figsize=(14, 7))
plt.plot(test_df['ds'], test_df['y'], label='Actual', marker='o')
predicted_values = final_forecasts[best_model_name][:60]  # keep at most 60 predictions
plt.plot(future_dates, predicted_values, label=f'Predicted - {best_model_name}', marker='x')

plt.xlabel('Date')
plt.ylabel('Value')
plt.title(f'Actual vs Predicted Values using {best_model_name}')
plt.legend()
plt.grid(True)
plt.show()

print(f"The best model is {best_model_name} with a MAPE of {best_mape}")


In [17]:
import pandas as pd
import numpy as np
from neuralforecast import NeuralForecast
from neuralforecast.models import NHITS, NBEATS, LSTM
from sklearn.metrics import mean_absolute_percentage_error
import matplotlib.pyplot as plt

# Fresh working copy of the preprocessed frame with a parsed 'ds' column and
# the constant series identifier passed to NeuralForecast as id_col.
rml_data = b.copy()
rml_data = rml_data.assign(ds=pd.to_datetime(rml_data['ds']), unique_id='series_1')

# Hold out the final 30 rows for evaluation; fit on everything before them
train_df, test_df = rml_data.iloc[:-30], rml_data.iloc[-30:]

# All three architectures share the same settings, kept as separate dicts
nhits_params = dict(h=30, input_size=30, max_steps=50)
nbeats_params = dict(nhits_params)
lstm_params = dict(nhits_params)

# Instantiate one model per architecture
nhits_model = NHITS(**nhits_params)
nbeats_model = NBEATS(**nbeats_params)
lstm_model = LSTM(**lstm_params)

# Fit each candidate on the training window and score it on the hold-out rows
models = [nhits_model, nbeats_model, lstm_model]
model_names = ['NHITS', 'NBEATS', 'LSTM']
mape_scores = []

for position, name in enumerate(model_names):
    nf = NeuralForecast(models=[models[position]], freq='D')
    nf.fit(df=train_df, id_col='unique_id', time_col='ds', target_col='y')
    forecasts = nf.predict(futr_df=test_df)
    print(forecasts.head())
    mape = mean_absolute_percentage_error(test_df['y'], forecasts[name])
    mape_scores.append((name, mape))

# The (name, MAPE) pair with the lowest MAPE wins
best_model_name, best_mape = min(mape_scores, key=lambda pair: pair[1])

# Build a new, unfitted instance of the winning architecture
constructors = {
    'NHITS': (NHITS, nhits_params),
    'NBEATS': (NBEATS, nbeats_params),
    'LSTM': (LSTM, lstm_params),
}
best_class, best_kwargs = constructors[best_model_name]
best_model = best_class(**best_kwargs)

# Fit the winner on the complete series
nf_best_model = NeuralForecast(models=[best_model], freq='D')
nf_best_model.fit(df=rml_data, id_col='unique_id', time_col='ds', target_col='y')

# Frame describing the 30 days that follow the last observed date
future_dates = pd.date_range(rml_data['ds'].max() + pd.Timedelta(days=1), periods=30, freq='D')
future_df = pd.DataFrame({'ds': future_dates})
future_df['unique_id'] = 'series_1'

final_forecasts = nf_best_model.predict(futr_df=future_df)

# Draw the held-out actuals and the forecast for the following 30 days
plt.figure(figsize=(14, 7))
plt.plot(test_df['ds'], test_df['y'], label='Actual', marker='o')
predicted_values = final_forecasts[best_model_name][:30]  # keep at most 30 predictions
plt.plot(future_dates, predicted_values, label=f'Predicted - {best_model_name}', marker='x')

plt.xlabel('Date')
plt.ylabel('Value')
plt.title(f'Actual vs Predicted Values using {best_model_name}')
plt.legend()
plt.grid(True)
plt.show()

print(f"The best model is {best_model_name} with a MAPE of {best_mape}")


In [18]:
import pandas as pd
import numpy as np
from neuralforecast import NeuralForecast
from neuralforecast.models import NHITS, NBEATS, LSTM
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import ParameterGrid
import matplotlib.pyplot as plt

# Fresh working copy of the preprocessed frame with a parsed 'ds' column and
# the constant series identifier passed to NeuralForecast as id_col.
rml_data = b.copy()
rml_data = rml_data.assign(ds=pd.to_datetime(rml_data['ds']), unique_id='series_1')

# Hold out the final 30 rows for evaluation; fit on everything before them
train_df, test_df = rml_data.iloc[:-30], rml_data.iloc[-30:]

# Search space per model: h fixed at 30, input_size and max_steps varied
nhits_param_grid = dict(h=[30], input_size=[30, 60, 90], max_steps=[50, 100, 150])
nbeats_param_grid = dict(h=[30], input_size=[30, 60, 90], max_steps=[50, 100, 150])
lstm_param_grid = dict(h=[30], input_size=[30, 60, 90], max_steps=[50, 100, 150])

# Expand every grid into an explicit list of parameter combinations
nhits_params_list = [*ParameterGrid(nhits_param_grid)]
nbeats_params_list = [*ParameterGrid(nbeats_param_grid)]
lstm_params_list = [*ParameterGrid(lstm_param_grid)]
# Function to train and evaluate a model with given parameters
def train_evaluate_model(model_class, param_list, train_df, test_df, model_name):
    """Fit ``model_class`` once per parameter set and keep the best by MAPE.

    Each candidate is wrapped in a daily-frequency NeuralForecast, fitted on
    ``train_df`` and used to predict with ``test_df`` as ``futr_df``. The
    forecast column ``model_name`` is scored against ``test_df['y']``.

    Args:
        model_class: Model constructor called as ``model_class(**params)``.
        param_list: Iterable of keyword-argument dicts to try.
        train_df: Training frame with 'unique_id', 'ds' and 'y' columns.
        test_df: Hold-out frame providing 'y' for scoring.
        model_name: Name of the forecast column to score.

    Returns:
        Tuple ``(best_params, best_mape, best_forecasts)``; when no candidate
        scores below infinity it is ``(None, inf, None)``.
    """
    winner = (None, float('inf'), None)

    for candidate in param_list:
        nf = NeuralForecast(models=[model_class(**candidate)], freq='D')
        nf.fit(df=train_df, id_col='unique_id', time_col='ds', target_col='y')
        predicted = nf.predict(futr_df=test_df)

        score = mean_absolute_percentage_error(test_df['y'], predicted[model_name])

        # Only a strictly lower MAPE replaces the current winner
        if not score < winner[1]:
            continue
        winner = (candidate, score, predicted)

    return winner

# Search each architecture's grid; every call yields (params, mape, forecasts)
nhits_result = train_evaluate_model(NHITS, nhits_params_list, train_df, test_df, 'NHITS')
nhits_best_params, nhits_best_mape, nhits_best_forecasts = nhits_result

nbeats_result = train_evaluate_model(NBEATS, nbeats_params_list, train_df, test_df, 'NBEATS')
nbeats_best_params, nbeats_best_mape, nbeats_best_forecasts = nbeats_result

lstm_result = train_evaluate_model(LSTM, lstm_params_list, train_df, test_df, 'LSTM')
lstm_best_params, lstm_best_mape, lstm_best_forecasts = lstm_result

# One entry per architecture: (name, MAPE, class, params, forecasts)
best_models = [
    ('NHITS', nhits_best_mape, NHITS, nhits_best_params, nhits_best_forecasts),
    ('NBEATS', nbeats_best_mape, NBEATS, nbeats_best_params, nbeats_best_forecasts),
    ('LSTM', lstm_best_mape, LSTM, lstm_best_params, lstm_best_forecasts)
]

# The entry with the lowest MAPE wins
overall_winner = min(best_models, key=lambda entry: entry[1])
best_model_name, best_mape, best_model_class, best_model_params, best_forecasts = overall_winner

# Rebuild the winner with its best parameters and fit it on the full series
best_model = best_model_class(**best_model_params)
nf_best_model = NeuralForecast(models=[best_model], freq='D')
nf_best_model.fit(df=rml_data, id_col='unique_id', time_col='ds', target_col='y')

# Frame describing the 30 days that follow the last observed date
future_dates = pd.date_range(rml_data['ds'].max() + pd.Timedelta(days=1), periods=30, freq='D')
future_df = pd.DataFrame({'ds': future_dates})
future_df['unique_id'] = 'series_1'

final_forecasts = nf_best_model.predict(futr_df=future_df)

# Draw the held-out actuals and the forecast for the following 30 days
plt.figure(figsize=(14, 7))
plt.plot(test_df['ds'], test_df['y'], label='Actual', marker='o')
predicted_values = final_forecasts[best_model_name][:30]  # keep at most 30 predictions
plt.plot(future_dates, predicted_values, label=f'Predicted - {best_model_name}', marker='x')

plt.xlabel('Date')
plt.ylabel('Value')
plt.title(f'Actual vs Predicted Values using {best_model_name}')
plt.legend()
plt.grid(True)
plt.show()

print(f"The best model is {best_model_name} with a MAPE of {best_mape}")
print(f"Best parameters: {best_model_params}")
