In [None]:
from typing import List, Dict, cast
import datetime
import pandas as pd
from data import SCRAPED_STOCK_FILE_PATH 
from gsp.model.utils import make_shift_in_groups, make_mw_in_groups, MOVING_WINDOW_AGGREGATORS_ALIAS
import logging 

# Module-level logger, named after the current module via __name__.
logger = logging.getLogger(__name__)

GLOBAL PARAMETERS - SETTINGS

In [None]:
# --- MAIN SETUP ---
# Index layouts: the full (Date, Name) index and the per-company grouping key.
MAIN_INDEX = ["Date", "Name"]
SECONDARY_INDEX = ["Name"]

# How much history to load and how many steps ahead the targets reach.
YEARS_BACK_TO_CONSIDER = 5
N_STEP_PREDICTION = 15
DAYS_BACK_TO_TEST = N_STEP_PREDICTION * 3

# Every date below is anchored to the day the notebook is executed.
TODAY = datetime.date.today()
DATA_START_DATE = TODAY - pd.DateOffset(years=YEARS_BACK_TO_CONSIDER)


# --- TRAINING & TESTING SETUP ---
# Calendar-day offset equal to the prediction horizon; it separates the end of
# training from the start of testing, and the end of testing from today.
_prediction_horizon = pd.DateOffset(days=N_STEP_PREDICTION)

TRAINING_END_DATE = TODAY - pd.DateOffset(days=DAYS_BACK_TO_TEST)
TESTING_START_DATE = TRAINING_END_DATE + _prediction_horizon
TESTING_END_DATE = TODAY - _prediction_horizon

LOAD DATA

In [None]:
# Explicit dtype for each column parsed from the scraped-stock CSV.
stock_column_dtypes = {
    "Date": "period[D]",
    "Open": "float",
    "High": "float",
    "Low": "float",
    "Close": "float",
    "Volume": "int",
    "Area": "category",
    "Name": "category",
}

# Read the CSV indexed by MAIN_INDEX, sort the index, then label-slice the
# frame so it starts at DATA_START_DATE.
stocks = (
    pd.read_csv(SCRAPED_STOCK_FILE_PATH, dtype=stock_column_dtypes, index_col=MAIN_INDEX)
    .sort_index()
    .loc[DATA_START_DATE:]
)

stocks

PREPROCESSING

In [None]:
def preprocessX(
    data: pd.DataFrame,
    shifts_list: List[int],
    windows_list: List[int],
    aggregators_list: List[MOVING_WINDOW_AGGREGATORS_ALIAS] | MOVING_WINDOW_AGGREGATORS_ALIAS,
) -> pd.DataFrame:
    """Build the model feature frame from the raw stock frame.

    Adds calendar features derived from the "Date" index level, a copy of the
    "Name" index level as a column, and the columns produced by
    `make_shift_in_groups` / `make_mw_in_groups` on "Close" (grouped by
    SECONDARY_INDEX). The categorical columns are then one-hot encoded and the
    "Open", "High", "Low" and "Adj Close" columns are dropped.

    Args:
        data: Frame whose index has "Date" and "Name" levels and which holds
            at least the "Close", "Area", "Open", "High", "Low" and
            "Adj Close" columns. The input frame is not modified.
        shifts_list: Forwarded as `shift` to `make_shift_in_groups`.
        windows_list: Forwarded as `window` to `make_mw_in_groups`.
        aggregators_list: Forwarded as `aggregator` to `make_mw_in_groups`.

    Returns:
        The encoded feature frame.
    """
    dates = data.index.get_level_values("Date")

    # `assign` works on a copy, so the caller's frame stays untouched.
    features = data.assign(
        DayOfWeek=dates.dayofweek,  # type: ignore
        Month=dates.month,  # type: ignore
        Year=dates.year,  # type: ignore
        IsMonthStart=dates.day < 5,  # type: ignore
        IsMonthEnd=(dates.days_in_month - dates.day) < 5,  # type: ignore
        Quarter=dates.quarter,  # type: ignore
        NameCat=data.index.get_level_values("Name"),
    )
    # TODO idea: product launches from those companies

    close_shifts = make_shift_in_groups(
        df=features,
        groupby=SECONDARY_INDEX,
        column="Close",
        shift=shifts_list,
    )
    close_windows = make_mw_in_groups(
        df=features,
        groupby=SECONDARY_INDEX,
        column="Close",
        window=windows_list,
        aggregator=aggregators_list,
    )
    helper_frames: List[pd.DataFrame | pd.Series] = [close_shifts, close_windows]
    with_lags = features.join(helper_frames)

    # One-hot encode; drop_first removes the first level of each feature.
    encoded = pd.get_dummies(
        with_lags,
        columns=[
            "Area",
            "NameCat",
            "DayOfWeek",
            "Month",
            "Year",
            "IsMonthStart",
            "IsMonthEnd",
            "Quarter",
        ],
        drop_first=True,
    )

    return encoded.drop(columns=["Open", "High", "Low", "Adj Close"])


def preprocessY(data: pd.DataFrame) -> pd.DataFrame:
    """Build the target frame from the raw stock frame.

    Calls `make_shift_in_groups` on the "Close" column (grouped by
    SECONDARY_INDEX) with the negative shifts -1, -2, ..., -N_STEP_PREDICTION.

    Args:
        data: Raw stock frame; a copy is handed to the helper.

    Returns:
        Whatever `make_shift_in_groups` returns for those shifts.
    """
    # Same values as [-i for i in range(1, N_STEP_PREDICTION + 1)].
    future_shifts = list(range(-1, -(N_STEP_PREDICTION + 1), -1))
    return make_shift_in_groups(
        df=data.copy(),
        groupby=SECONDARY_INDEX,
        column="Close",
        shift=future_shifts,
    )

# Shift / moving-window settings used to build the feature frame.
feature_settings = dict(
    shifts_list=[1, 2, 3, 4, 8, 16, 24, 32],
    windows_list=[4, 8, 16, 24, 32],
    aggregators_list="mean",
)

# Features and targets are both derived from the same `stocks` frame.
X = preprocessX(stocks, **feature_settings)
y = preprocessY(stocks)

In [None]:
# Display the feature frame returned by preprocessX.
X
 

In [None]:
# Display the target frame returned by preprocessY.
y

TRAINING

In [None]:
# Training data: everything up to TRAINING_END_DATE (today - DAYS_BACK_TO_TEST).
# Testing data: the TESTING_START_DATE .. TESTING_END_DATE range.
train_window = slice(None, TRAINING_END_DATE)
test_window = slice(TESTING_START_DATE, TESTING_END_DATE)

X_train, X_test = X.loc[train_window], X.loc[test_window]
y_train, y_test = y.loc[train_window], y.loc[test_window]

display(X_train, y_train, X_test, y_test)

#  --- NOTICE ---
# NOTE(review): the last sentence of the note below is cut off in the original;
# it is kept verbatim here. Confirm the intended ending with the author.
"""
There is a bug with predictions dates.

It predicts for next 15 WORKING DAYS.
When we select the data we include selecting ALL days, not only working days.
To solve it we would need not to include FREE days in calculating the
"""
