In [18]:
# Install Catboost
! pip install catboost



In [19]:
# Load Libraries

import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [20]:
# Define custom function which returns single output as metric score

def calculate_smape(actual, predicted) -> float:

    # Convert actual and predicted to numpy
    # array data type if not already
    if not all([isinstance(actual, np.ndarray),
                isinstance(predicted, np.ndarray)]):
        actual, predicted = np.array(actual),
        np.array(predicted)

    return round(
        np.mean(
            np.abs(predicted - actual) /
            ((np.abs(predicted) + np.abs(actual))/2)
        )*100, 2
    )


#make scorer from custome function
smape_scorer = make_scorer(calculate_smape)

In [21]:
# Define custom function which returns single output as metric score
def smape(A, F):
    return 100/len(A) * np.sum(2 * np.abs(F - A) / (np.abs(A) + np.abs(F)))

## **Window Frames method with different models**

In [22]:
# Function to Split data to window frames

def df_to_X_y(df, window_size=3):
  df_as_np=df.to_numpy()
  X = []
  y = []
  for i in range(len(df_as_np)-window_size):
    row = df_as_np[i:i+window_size]
    X.append(row)
    label = df_as_np[i+window_size]
    y.append(label)
  return np.array(X), np.array(y)

In [23]:
# Models

model_names = [
    "Decision Tree", "Random Forest", "Linear Regression", "Gradient Boosting", "Neural Network",
    "XGBoost", "LightGBM", "CatBoost"
]
models = [
    DecisionTreeRegressor(), RandomForestRegressor(), LinearRegression(), GradientBoostingRegressor(), MLPRegressor(),
    XGBRegressor(), LGBMRegressor(verbose=-1), CatBoostRegressor(verbose=0)
]


In [24]:
for model_name, model in zip(model_names, models):

  Pred=[]
  actuals=[]
  # Loop through each dataset
  for i in os.listdir("/content/drive/MyDrive/filtered"):
    path="/content/drive/MyDrive/filtered/"+i
    # Read the dataset
    df=pd.read_csv(path)
    df=df.iloc[:-1,:]
    df.ffill(inplace=True)
    y_test=list(df.iloc[-26:, :]["Levels"].values)
    for yss in y_test:
      actuals.append(yss)
    df=df.iloc[:-26,:]
    # Split to window frames of 3 and train model
    X,y= df_to_X_y(df["Levels"], window_size=3)
    model.fit(X, y)
    lst=list(df["Levels"].tail(3).values)

    # Model Prediction for testing
    for ii in range(26):
            result=model.predict([lst])
            Pred.append(round(result[0], 2))
            lst.pop(0)
            lst.append(result[0])

  # Getting SMAPE Scores for models
  test=pd.DataFrame({"Predictions":Pred, "Actuals":actuals}).dropna()

  A = test["Actuals"]
  F = test["Predictions"]
  print(model_name+" SMAPE SCore: ", smape(A, F))

Decision Tree SMAPE SCore:  0.17204508212063682
Random Forest SMAPE SCore:  0.15550039920179773
Linear Regression SMAPE SCore:  0.13859446406155654
Gradient Boosting SMAPE SCore:  0.13920713162429618
Neural Network SMAPE SCore:  6.982312583162025
XGBoost SMAPE SCore:  0.16251925820813873
LightGBM SMAPE SCore:  0.15663260385277206
CatBoost SMAPE SCore:  0.15008457924327337


## **Feature Extraction Technique with different Models**

In [25]:
for model_name, model in zip(model_names, models):

  Pred=[]
  actuals=[]
  # Loop through datasets
  for i in os.listdir("/content/drive/MyDrive/filtered"):
    path="/content/drive/MyDrive/filtered/"+i
    # Read the dataset
    df=pd.read_csv(path)
    df=df.iloc[:-1,:]
    df.ffill(inplace=True)
    df["Date"]=pd.to_datetime(df["Date"], format="%d.%m.%Y %H:%M:%S   ")
    df.sort_values(by="Date", inplace=True)
    # Feature Extraction
    df["Day"]=df["Date"].dt.day
    df["Dayofweek"]=df["Date"].dt.dayofweek
    df["Month"]=df["Date"].dt.month
    df["Year"]=df["Date"].dt.year
    df["Quarter"]=df["Date"].dt.quarter
    df["Week"]=df["Date"].dt.isocalendar().week
    df=df.drop("Date", axis=1)
    # Split dataset to train and test
    train=df.iloc[:-26,:]
    test=df.iloc[-26:, :]
    # Split dataset to dependent and independent features
    X_train=train.drop("Levels", axis=1)
    y_train=train["Levels"]
    X_test=test.drop("Levels", axis=1)
    y_test=test["Levels"]
    # Train model
    model.fit(X_train, y_train)
    pred=model.predict(X_test)
    # Make Predictions
    for yss in y_test.values:
      actuals.append(yss)
    for yss in pred:
      Pred.append(yss)
 # Test the models
  test=pd.DataFrame({"Predictions":Pred, "Actuals":actuals}).dropna()

  A = test["Actuals"]
  F = test["Predictions"]
  print(model_name+" SMAPE SCore: ", smape(A, F))

Decision Tree SMAPE SCore:  0.13657110222850494
Random Forest SMAPE SCore:  0.12634429688212107
Linear Regression SMAPE SCore:  0.17342492768150092
Gradient Boosting SMAPE SCore:  0.12953045383309442
Neural Network SMAPE SCore:  2.4405739496895755
XGBoost SMAPE SCore:  0.13342506972787024
LightGBM SMAPE SCore:  0.12082219218344961
CatBoost SMAPE SCore:  0.13083925580895717


**Feature Extraction with Light GBM is the best performing methodology**