In [1]:
import pandas as pd
import pathlib
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport
from xgboost import XGBRegressor
root_dir = pathlib.Path(r"./datasets")
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

In [2]:
# Creating lag features for time-series data

def create_lag_features(data, column, lag_steps=3):
    for i in range(1, lag_steps + 1):
        data[f'lag_{i}'] = data[column].shift(i)

    return data


# Creating rolling mean for time-series data

def create_rolling_mean(data, column, window_size=3):
    data['rolling_mean'] = data[column].rolling(window=window_size).mean()

    return data


# Applying Fourier transformation for capturing seasonality

import numpy as np
from scipy.fft import fft


def apply_fourier_transform(data, column):
    values = data[column].values

    fourier_transform = fft(values)

    data['fourier_transform'] = np.abs(fourier_transform)

    return data


def split(data, target, train_size=.8):
    # Splitting time-series data into training and testing sets

    train_size = int(len(data) * train_size)
    data = create_lag_features(data, target)
    data = create_rolling_mean(data, target)
    data = apply_fourier_transform(data, target)
    data_train, data_test = data[:train_size], data[train_size:]
    y_train = data_train[target]
    y_test = data_test[target]
    X_train = data_train.drop(columns=[target])
    X_test = data_test.drop(columns=[target])
    return X_train, X_test, y_train, y_test


In [3]:
!ls .

README.md        eda.ipynb        model.ipynb      requirements.txt
[34mdatasets[m[m         main.py          [34mmodels[m[m


In [5]:
dfX = pd.read_excel(root_dir / "online_retail" / "online_retail.xlsx")

In [6]:
df = dfX.copy()
# df.dropna(inplace=True)
# df.drop(columns=["COMMODITY_DESC", "SOURCE_ID", "ATTRIBUTE_DESC"], inplace=True)
# df.sort_values(by=['YEAR_ID','TIMEPERIOD_ID'], inplace=True)
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [15]:
df.Description.unique().shape, df.shape

((4224,), (541909, 8))

In [9]:
X_train, X_test, y_train, y_test = split(df, "AMOUNT")


# create model instance
xgb_model = XGBRegressor(eval_metric=["rmse", "mae", "mape"], enable_categorical=True, random_state=47)
# fit model
xgb_model.fit(X_train, y_train)
# make predictions
y_pred = xgb_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape = mean_absolute_percentage_error(y_test, y_pred)
print(dict(mse=mse, rmse=rmse, mae=mae, mape=mape))
# Displaying feature importance scores
print("feature_importance", xgb_model.feature_importances_)

{'mse': 5357.06769751149, 'rmse': 73.19199203131099, 'mae': 20.18776625117554, 'mape': 0.18307692639181614}
feature_importance [0.01638273 0.01484101 0.0229726  0.07031856 0.01387662 0.39997533
 0.0455769  0.07927816 0.07821157 0.01330098 0.23668735 0.00857826]
