In [70]:
from typing import List, Tuple, cast, Any, Literal, Optional
from dataclasses import dataclass
import datetime
import pandas as pd
import logging 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import RegressorChain, MultiOutputRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline 
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from xgboost import XGBRegressor
from data import SCRAPED_STOCK_FILE_PATH 
from gsp.model.utils import (
    MOVING_WINDOW_AGGREGATORS_ALIAS, 
    make_shift_in_groups, 
    make_mw_in_groups, 
    get_most_recent_working_date, 
    get_nth_previous_working_date,
    show,
    get_all_missing_stock_names,
    get_minimal_stocks_existence_date,
)

# Module-level logger shared by the helper functions defined in the cells below.
logger = logging.getLogger(__name__)
# Ticker that `demo` filters on and that the final forecast plot displays.
stock_id = "NVDA"

LOAD RAW DATA

In [50]:
# Raw scraped price history; dtypes are pinned up front so that "Date" is
# parsed as a daily period and the label columns are stored as categoricals.
_stocks = pd.read_csv(
    SCRAPED_STOCK_FILE_PATH,
    dtype=dict(
        Date="period[D]",
        Open="float",
        High="float",
        Low="float",
        Close="float",
        Volume="int",
        Area="category",
        Name="category",
    ),
)

GLOBAL PARAMETERS - SETTINGS

In [51]:
# --- MAIN SETUP ---
# Index levels of the cleaned frame, and the level used for per-stock grouping.
MAIN_INDEX = ["Date", "Name"]
SECONDARY_INDEX = ["Name"]

# Number of years back that we want to consider in predictions.
YEARS_BACK_TO_CONSIDER = 3
# Forecasting task definition: how many steps ahead are predicted.
N_STEP_PREDICTION = 30
# Size of the hold-out region at the end of the data, in working days.
DAYS_BACK_TO_TEST = 3 * N_STEP_PREDICTION
# The most recent date present in the raw data is treated as "today".
TODAY = cast(pd.Period, _stocks["Date"].max()).to_timestamp().date()
# The earliest date from which stock data is considered at all.
DATA_STARTING_DATE = TODAY - datetime.timedelta(days=365 * YEARS_BACK_TO_CONSIDER)
# The earliest date for which we have data for all stocks.
DATA_EXISTENCE_DATE = get_minimal_stocks_existence_date(
    _stocks[_stocks["Date"] >= DATA_STARTING_DATE.isoformat()]
)


# --- TRAINING & TESTING SETUP ---
# Training stops DAYS_BACK_TO_TEST working days before TODAY.
TRAINING_END_DATE = get_nth_previous_working_date(n=DAYS_BACK_TO_TEST, date=TODAY)
# Testing starts one full prediction horizon after training ends (negative n
# moves forward in time, as the printed dates of this cell show).
TESTING_START_DATE = get_nth_previous_working_date(n=-N_STEP_PREDICTION, date=TRAINING_END_DATE)
# Testing ends one horizon before TODAY so every test row still has all of its
# N_STEP_PREDICTION future closes available as targets.
TESTING_END_DATE = get_nth_previous_working_date(n=N_STEP_PREDICTION, date=TODAY)

show(
    f"The furthest date to consider is {DATA_STARTING_DATE}",
    f"Data starts from {DATA_EXISTENCE_DATE} and ends at {TODAY}.",
    f"Training data starts from {DATA_EXISTENCE_DATE} and ends at {TRAINING_END_DATE}.",
    f"Testing data starts from {TESTING_START_DATE} and ends at {TESTING_END_DATE}."
)

'The furthest date to consider is 2021-05-04'

'Data starts from 2021-05-04 and ends at 2024-05-03.'

'Training data starts from 2021-05-04 and ends at 2023-12-29.'

'Testing data starts from 2024-02-09 and ends at 2024-03-22.'

CLEANING DATA

In [52]:
def clean_up_stocks(stocks: pd.DataFrame, main_index:List[str], start_date: datetime.date, existence_date: datetime.date) -> pd.DataFrame:
    """Index, sort and gap-fill the raw stock frame.

    Args:
        stocks: Raw stock rows; must contain the columns named in ``main_index``
            (including ``"Name"``, which is unstacked/stacked below).
        main_index: Columns that become the index of the result.
        start_date: Date handed to ``get_all_missing_stock_names`` for the
            missing-data report (logging only).
        existence_date: First date kept in the returned frame.

    Returns:
        A copy of ``stocks`` indexed by ``main_index``, sorted, forward-filled
        along the date axis separately for every stock name, and truncated to
        rows from ``existence_date`` onwards.
    """
    # Inspect the frame that was actually passed in, not the notebook-level
    # `_stocks`; otherwise callers that pre-filter (e.g. to a single ticker)
    # get a report about data they are not using.
    missing_stocks_data = get_all_missing_stock_names(stocks, start_date)
    logger.info(f"Missing stocks data: {missing_stocks_data}")
    logger.info(f"Data existence date: {existence_date}")
    return (
        stocks.copy()
        .set_index(main_index)
        .sort_index()
        # One column per (field, Name) so ffill never leaks values across stocks.
        .unstack("Name")
        .ffill()
        .stack("Name", future_stack=True)
        .loc[existence_date.isoformat() :]
    )  # type: ignore

PREPROCESSING

In [53]:
def preprocessX(
    data: pd.DataFrame,
    secondary_index: List[str],
    shifts_list: List[int],
    windows_list: List[int],
    aggregators_list: List[MOVING_WINDOW_AGGREGATORS_ALIAS] | MOVING_WINDOW_AGGREGATORS_ALIAS,
) -> pd.DataFrame:
    """Build the feature matrix from the cleaned stock frame.

    Adds calendar features derived from the "Date" index level, the stock name
    as a feature, per-group shifted and moving-window features of "Close", then
    one-hot encodes the categorical columns and drops the raw price/volume
    columns other than "Close".

    Args:
        data: Cleaned frame with "Date" and "Name" index levels.
        secondary_index: Grouping keys passed to the lag/window helpers.
        shifts_list: Shifts passed to ``make_shift_in_groups``.
        windows_list: Window sizes passed to ``make_mw_in_groups``.
        aggregators_list: Aggregator(s) passed to ``make_mw_in_groups``.

    Returns:
        The encoded feature frame; the input is not modified.
    """
    features = data.copy()
    dates = features.index.get_level_values("Date")  # type: ignore

    # Calendar features (column order matters for the encoded output).
    features["DayOfWeek"] = dates.dayofweek  # type: ignore
    features["Month"] = dates.month  # type: ignore
    features["Year"] = dates.year  # type: ignore
    features["IsMonthStart"] = dates.day < 5  # type: ignore
    features["IsMonthEnd"] = dates.days_in_month - dates.day < 5  # type: ignore
    features["Quarter"] = dates.quarter  # type: ignore
    features["NameCat"] = features.index.get_level_values("Name")
    # TODO: product launches from those companies

    close_shifts = make_shift_in_groups(
        df=features, groupby=secondary_index, column="Close", shift=shifts_list
    )
    close_windows = make_mw_in_groups(
        df=features, groupby=secondary_index, column="Close", window=windows_list, aggregator=aggregators_list
    )
    with_lags = features.join([close_shifts, close_windows])

    one_hot_columns = ["Area", "NameCat", "DayOfWeek", "Month", "Year", "IsMonthStart", "IsMonthEnd", "Quarter"]
    encoded = pd.get_dummies(with_lags, columns=one_hot_columns, drop_first=True)

    raw_columns = ["Open", "High", "Low", "Adj Close", "Volume"]
    return encoded.drop(columns=raw_columns)


def preprocessY(data: pd.DataFrame, secondary_index: List[str], n_step_prediction:int) -> pd.DataFrame:
    """Build the multi-step target frame from "Close".

    Requests shifts -1, -2, ..., -n_step_prediction of "Close" within each
    ``secondary_index`` group from ``make_shift_in_groups``; the helper is
    given a copy of ``data``.
    """
    lead_shifts = [-step for step in range(1, n_step_prediction + 1)]
    return make_shift_in_groups(
        df=data.copy(),
        groupby=secondary_index,
        column="Close",
        shift=lead_shifts,
    )


SPLITTING DATA

In [54]:
def split_data(X: pd.DataFrame, y: pd.DataFrame, training_end_date: datetime.date, testing_start_date: datetime.date, testing_end_date: datetime.date) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Cut features and targets into a training part and a testing part.

    Both frames are sliced by label on their index using ISO-formatted dates:
    training is everything up to ``training_end_date``; testing is the range
    ``testing_start_date`` .. ``testing_end_date``.

    Returns:
        ``(X_train, y_train, X_test, y_test)``.
    """
    train_window = slice(None, training_end_date.isoformat())
    test_window = slice(testing_start_date.isoformat(), testing_end_date.isoformat())

    return (
        X.loc[train_window],
        y.loc[train_window],
        X.loc[test_window],
        y.loc[test_window],
    )

MODELLING

In [55]:
def create_model(**hyper_params):
    """Create the multi-output regressor used by the pipeline.

    ``hyper_params`` are forwarded to the underlying ``XGBRegressor``, which is
    wrapped in a ``RegressorChain``.
    """
    # Baseline alternative kept for reference: LinearRegression(**hyper_params)
    base_regressor = XGBRegressor(**hyper_params)
    return RegressorChain(base_regressor)

TRAINING & TESTING

In [56]:
def fit_predict(model, X_train: pd.DataFrame, y_train: pd.DataFrame, X_test: pd.DataFrame) -> pd.DataFrame:
    """Fit ``model`` on the training data and predict for ``X_test``.

    Returns:
        Predictions as a frame indexed like ``X_test`` and with the same
        columns as ``y_train``.
    """
    logger.info("Fitting the model...")
    fitted = model.fit(X_train, y_train)
    logger.info("Predicting...")
    raw_predictions = fitted.predict(X_test)
    return pd.DataFrame(data=raw_predictions, index=X_test.index, columns=y_train.columns)

In [57]:
def plot_multistep(y, every=1, ax=None, palette_kwargs=None):
    """Plot every ``every``-th row of ``y`` as its own forecast line.

    Each row is drawn over a period range that starts at the row's index label
    and has one period per column of the row.

    Args:
        y: Frame with one forecast per row.
        every: Step used when selecting rows to draw.
        ax: Axes to draw on; a new figure/axes pair is created when omitted.
        palette_kwargs: Overrides for the default seaborn palette settings.

    Returns:
        The axes that were drawn on.
    """
    palette_options = dict(palette="husl", n_colors=16, desat=None)
    palette_options.update(palette_kwargs if palette_kwargs is not None else {})
    colours = sns.color_palette(**palette_options)

    if ax is None:
        _, ax = plt.subplots()
    ax.set_prop_cycle(plt.cycler("color", colours))

    for origin, forecast in y[::every].iterrows():
        # Re-index the row so the x axis shows the periods being forecast.
        forecast.index = pd.period_range(start=origin, periods=len(forecast))
        forecast.plot(ax=ax)
    return ax

In [82]:
@dataclass
class ColumnTransformerWrapper:
    """Thin wrapper around ``ColumnTransformer`` that returns a labelled DataFrame.

    The wrapped transformer is created afresh on every ``fit_transform`` call.
    """

    # (name, transformer, columns) triples forwarded to ColumnTransformer.
    transformers : List[Tuple[str, Any, List[str]]]
    # What to do with columns not listed in `transformers`.
    remainder : Literal["drop", "passthrough"] = "passthrough"

    def fit_transform(self, X: pd.DataFrame, y: Optional[Any] = None) -> pd.DataFrame: # type: ignore
        """Fit the column transformer on ``X`` and return the result as a frame.

        Args:
            X: Input frame; its index is reused for the result.
            y: Optional target forwarded to ``ColumnTransformer.fit_transform``.

        Returns:
            A frame whose columns are the transformer's output feature names,
            with the ``"remainder__"`` prefix removed from passthrough columns.
        """
        # sparse_threshold=0 forces a dense array: encoders such as
        # OneHotEncoder emit sparse matrices by default, and a sparse result
        # cannot be wrapped in a DataFrame with the index/columns given below.
        ct = ColumnTransformer(self.transformers, remainder=self.remainder, sparse_threshold=0)
        transformed = ct.fit_transform(X, y)  # type: ignore

        # Strip the prefix only where it is a prefix, so a column that merely
        # contains "remainder__" elsewhere in its name is left intact.
        feature_names = [name.removeprefix("remainder__") for name in ct.get_feature_names_out()]

        return pd.DataFrame(transformed, index=X.index, columns=feature_names)


def demo(stocks: pd.DataFrame,
    model,
    data_starting_date : datetime.date = DATA_STARTING_DATE,
    data_existence_date: datetime.date = DATA_EXISTENCE_DATE,
    data_training_end_date: datetime.date = TRAINING_END_DATE,
    data_testing_start_date: datetime.date = TESTING_START_DATE,
    data_testing_end_date: datetime.date = TESTING_END_DATE,   
    main_index : List[str]= MAIN_INDEX, 
    secondary_index : List[str] = SECONDARY_INDEX,
    n_step_prediction: int = N_STEP_PREDICTION,
    evaluate_single_problem: bool = False) -> None:
    """Scratch run of the cleaning and target-building steps for one ticker.

    Cleans the rows of ``stocks`` that belong to the notebook-level
    ``stock_id`` and builds the ``n_step_prediction``-step lead of "Close"
    for them. Nothing is returned; progress is reported through the logger.

    ``model``, the training/testing dates and ``evaluate_single_problem`` are
    accepted so the signature mirrors ``retest_pipeline``; they are not used.
    """
    # NOTE(review): fixed date being investigated; it coincides with the
    # testing end date printed by the settings cell — confirm before reuse.
    error_date = datetime.date(year=2024, month=3, day=22)
    # Negative n moves forward in time: the date one horizon after error_date.
    leading_date = get_nth_previous_working_date(n=-n_step_prediction, date=error_date)

    logger.info(f"{error_date} is the date of the error, and {leading_date} is the leading date.")

    cln = clean_up_stocks(stocks[stocks["Name"]==stock_id], main_index, data_starting_date, data_existence_date)

    # NOTE: a OneHotEncoder-based encoding of the calendar/name features via
    # ColumnTransformerWrapper was prototyped here as an alternative to the
    # pd.get_dummies call in preprocessX; it is currently disabled.

    # Horizon taken from the parameter instead of a hard-coded 30 so the demo
    # follows the same forecasting task as the rest of the notebook.
    y = make_shift_in_groups(
        df=cln, groupby=secondary_index, column="Close", shift=[-n_step_prediction]
    )
    y["dow"] = y.index.get_level_values("Date").dayofweek

    logger.info("Demo done")

# Run the scratch demo with the notebook-level settings; every keyword below
# repeats the corresponding default of `demo`, spelled out for readability.
demo(
    stocks=_stocks,
    model=create_model(),
    data_starting_date=DATA_STARTING_DATE,
    data_existence_date=DATA_EXISTENCE_DATE,
    data_training_end_date=TRAINING_END_DATE,
    data_testing_start_date=TESTING_START_DATE,
    data_testing_end_date=TESTING_END_DATE,
    main_index=MAIN_INDEX,
    secondary_index=SECONDARY_INDEX,
    n_step_prediction=N_STEP_PREDICTION,
    evaluate_single_problem=False,
)

INFO __main__ 18:17:26 | 2024-03-22 is the date of the error, and 2024-05-03 is the leading date.
INFO __main__ 18:17:27 | Missing stocks data: []
INFO __main__ 18:17:27 | Data existence date: 2021-05-04
INFO __main__ 18:19:19 | Demo done


In [None]:
def retest_pipeline(
    stocks: pd.DataFrame,
    model,
    data_starting_date : datetime.date = DATA_STARTING_DATE,
    data_existence_date: datetime.date = DATA_EXISTENCE_DATE,
    data_training_end_date: datetime.date = TRAINING_END_DATE,
    data_testing_start_date: datetime.date = TESTING_START_DATE,
    data_testing_end_date: datetime.date = TESTING_END_DATE,   
    main_index : List[str]= MAIN_INDEX, 
    secondary_index : List[str] = SECONDARY_INDEX,
    n_step_prediction: int = N_STEP_PREDICTION,
    evaluate_single_problem: bool = False
) -> Tuple[float, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Run the full clean -> preprocess -> split -> fit -> evaluate pipeline.

    Args:
        stocks: Raw stock frame.
        model: Estimator exposing ``fit`` and ``predict``.
        data_starting_date: Forwarded to ``clean_up_stocks`` as ``start_date``.
        data_existence_date: First date kept after cleaning.
        data_training_end_date: Last date of the training split.
        data_testing_start_date: First date of the testing split.
        data_testing_end_date: Last date of the testing split.
        main_index: Index columns of the cleaned frame.
        secondary_index: Grouping keys for lag/lead features.
        n_step_prediction: Number of lead steps to predict.
        evaluate_single_problem: When true, fit and score ``model`` separately
            for every stock name instead of once on all stocks.

    Returns:
        ``(error, y_test, y_pred, X, y)``. ``error`` is the mean squared error
        on the test split, or the sum of the per-stock mean squared errors in
        single-problem mode. In that mode ``y_pred`` is ordered stock by stock.
    """
    # --- CLEAN ---
    stocks_cleaned = clean_up_stocks(stocks, main_index=main_index, start_date=data_starting_date, existence_date=data_existence_date)

    # --- PREPROCESS ---
    X_processed = preprocessX(
        stocks_cleaned, secondary_index=secondary_index, shifts_list=[1,2,3], windows_list=[4, 8, 16, 24], aggregators_list="mean"
    )
    y_processed = preprocessY(stocks_cleaned, secondary_index=secondary_index, n_step_prediction=n_step_prediction)

    # --- ALIGN ---
    # Keep only rows present in both frames after dropping incomplete features.
    y, X = y_processed.align(X_processed.dropna(), join="inner", axis=0)

    # --- SPLIT ---
    X_train, y_train, X_test, y_test = split_data(X, y, training_end_date=data_training_end_date, testing_start_date=data_testing_start_date, testing_end_date=data_testing_end_date)

    # --- TRAIN & TEST ---
    if not evaluate_single_problem:
        y_pred = fit_predict(model, X_train, y_train, X_test)
        error = cast(float, mean_squared_error(y_test, y_pred))
        return error, y_test, y_pred, X, y

    error = 0.0
    # Collect the per-stock predictions and concatenate once at the end:
    # growing a frame with pd.concat inside the loop copies it on every
    # iteration and relies on concatenating an empty frame.
    per_stock_predictions: List[pd.DataFrame] = []
    train_names = X_train.index.get_level_values("Name")
    test_names = X_test.index.get_level_values("Name")
    for name in X.index.get_level_values("Name").unique():
        in_train = train_names == name
        in_test = test_names == name
        # X/y share an index after the alignment above, so one mask serves both.
        y_test_single = y_test.loc[in_test]

        y_pred_single = fit_predict(model, X_train.loc[in_train], y_train.loc[in_train], X_test.loc[in_test])
        per_stock_predictions.append(y_pred_single)
        error += cast(float, mean_squared_error(y_test_single, y_pred_single))

    y_pred = pd.concat(per_stock_predictions, axis=0) if per_stock_predictions else pd.DataFrame()
    return error, y_test, y_pred, X, y

# Full pipeline run on all stocks with a freshly created model; the returned
# `y` and `y_pred` are used by the plotting code further down in this cell.
error, y_test, y_pred, X, y = retest_pipeline(
    stocks=_stocks,
    model=create_model(),
    data_starting_date=DATA_STARTING_DATE,
    data_existence_date=DATA_EXISTENCE_DATE,
    data_training_end_date=TRAINING_END_DATE,
    data_testing_start_date=TESTING_START_DATE,
    data_testing_end_date=TESTING_END_DATE,
    main_index=MAIN_INDEX,
    secondary_index=SECONDARY_INDEX,
    n_step_prediction=N_STEP_PREDICTION,
    evaluate_single_problem=False,
)

# show(error, y_test, y_pred)

fig = plt.figure(figsize=(12, 6))
forecast_ax = fig.gca()

# Backdrop: the one-step-ahead close of the selected ticker, last 300 rows.
(
    y.loc[y.index.get_level_values("Name") == stock_id, "Close_lead_1"]
    .reset_index("Name", drop=True)
    .tail(300)
    .plot(ax=forecast_ax)
)
# Overlay every 8th multi-step forecast for the same ticker.
plot_multistep(
    y_pred.loc[y_pred.index.get_level_values("Name") == stock_id].reset_index("Name", drop=True),
    every=8,
    ax=forecast_ax,
)


# 705097.7046092672
# 602875.5865461574
# 336965.0387972004