#### Proof of concept

Идея:

* Создадим разные секретные модели. Даже не обязательно, чтобы они показывали высокую точность.
* Создадим одну модель для остатков.
* Убедимся в том, что локальные модели для остатков повышают скор соответствующих секретных моделей.
* Убедимся в том, что модель для остатков получает буст от федеративного обучения.



In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Image
from IPython.display import Markdown as md
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import datetime
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt

import warnings

# Ignore every warning raised from here on.
# NOTE(review): this blanket filter also hides warnings coming from pandas and
# statsmodels calls further down — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")


# Source of the dataset:
# https://www.kaggle.com/code/avikumart/timeseries-stock-price-analysis-forecasting/input
df = pd.read_csv("Datasets/all_stocks_5yr.csv")
df["date"] = pd.to_datetime(df["date"])

# Keep only the AAPL rows. The explicit .copy() makes `df` an independent frame,
# so the columns added to it in later cells are ordinary assignments rather than
# writes to a boolean-mask selection of the full table.
df = df[df.Name == "AAPL"].copy()

In [33]:
# Display the AAPL-only frame (the rendered output truncates the middle rows).
df

Unnamed: 0,date,open,high,low,close,volume,Name
1259,2013-02-08,67.7142,68.4014,66.8928,67.8542,158168416,AAPL
1260,2013-02-11,68.0714,69.2771,67.6071,68.5614,129029425,AAPL
1261,2013-02-12,68.5014,68.9114,66.8205,66.8428,151829363,AAPL
1262,2013-02-13,66.7442,67.6628,66.1742,66.7156,118721995,AAPL
1263,2013-02-14,66.3599,67.3771,66.2885,66.6556,88809154,AAPL
...,...,...,...,...,...,...,...
2513,2018-02-01,167.1650,168.6200,166.7600,167.7800,47230787,AAPL
2514,2018-02-02,166.0000,166.8000,160.1000,160.5000,86593825,AAPL
2515,2018-02-05,159.1000,163.8800,156.0000,156.4900,72738522,AAPL
2516,2018-02-06,154.8300,163.7200,154.0000,163.0300,68243838,AAPL


In [2]:
# Closing-price column of the selected ticker.
y = df["close"]

In [3]:
# y_true = df_test.close
# y_pred = df_test.holt

In [4]:
# Per-step signed moves produced by the most recent return_metric() call.
DAILY = None


def return_metric(y_true, y_pred):
    """Total signed price move obtained by following the predicted direction.

    For every position ``i`` except the last one, the move to the following
    observation, ``y_true[i + 1] - y_true[i]``, is counted with a plus sign
    when ``y_pred[i]`` is above ``y_true[i]``, with a minus sign when it is
    below, and is ignored when the two are equal (``np.sign`` gives 0).

    Parameters
    ----------
    y_true : pandas.Series or array-like
        Observed values; consecutive entries are treated as consecutive steps.
    y_pred : pandas.Series or array-like
        Predictions of the same length, matched to ``y_true`` by position
        (index labels are ignored).

    Returns
    -------
    The sum of the signed moves; 0 when fewer than two observations are given.

    Side effects
    ------------
    Stores the per-step signed moves in the module-level ``DAILY``.
    """
    global DAILY
    assert len(y_true) == len(y_pred)

    # Array-likes are accepted as well; Series inputs keep their own index.
    if not isinstance(y_true, pd.Series):
        y_true = pd.Series(y_true)
    predicted = np.asarray(y_pred)

    last = len(y_true) - 1
    # Explicit positional slicing: the last observation has no following value.
    current = y_true.iloc[:last]
    following = y_true.shift(-1).iloc[:last]
    direction = np.sign(predicted[:-1] - current.values)

    daily_return = (following - current) * direction
    DAILY = daily_return
    # Builtin sum keeps the original left-to-right accumulation (and NaN propagation).
    return sum(daily_return)

In [5]:
# Closing price over the whole loaded period.
fig = go.Figure()
fig.add_trace(go.Scatter(x=df.date, y=df.close, name="close"))
# Title and axis labels let the figure be read on its own.
fig.update_layout(title="AAPL close price", xaxis_title="Date", yaxis_title="Close price")
fig

In [6]:
# Split by date: rows before `d` go to the training part,
# rows in [d, test_finish) go to the test part.
d = datetime.date(2015, 9, 27)
test_finish = datetime.date(2017, 3, 27)
# .copy() gives each part its own data, so the prediction columns assigned to
# df_test in later cells are not writes to a selection of `df`.
df_train = df[df.date.dt.date < d].copy()
df_test = df[(df.date.dt.date >= d) & (df.date.dt.date < test_finish)].copy()

In [7]:
# Train and test parts of the closing-price curve.
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_train.date, y=df_train.close, name="Train"))
fig.add_trace(go.Scatter(x=df_test.date, y=df_test.close, name="Test"))
fig.update_layout(title="Train / test split", xaxis_title="Date", yaxis_title="Close price")
fig

In [8]:
# Neighbouring closes for every row: the following row's close and the preceding one's.
df["next"], df["prev"] = df["close"].shift(-1), df["close"].shift(1)

In [9]:
from sklearn.metrics import mean_absolute_error as mae

### Prev day

In [10]:
# (MAE, return metric) on the test rows when the previous close is used as the forecast.
(
    mae(df.loc[df_test.index, "close"], df.loc[df_test.index, "prev"]),
    return_metric(df.loc[df_test.index, "close"], df.loc[df_test.index, "prev"]),
)

(1.0673138297872335, 24.370000000000005)

In [11]:
# Exponentially weighted means of the close for two smoothing factors;
# the alpha=0.1 variant is also copied onto the test frame.
df["ema_0.05"] = df["close"].ewm(alpha=0.05).mean()
df["ema_0.1"] = df["close"].ewm(alpha=0.1).mean()
df_test["ema_0.1"] = df.loc[df_test.index, "ema_0.1"]

In [12]:
# (MAE, return metric) on the test rows for the alpha=0.05 exponential mean.
(
    mae(df.loc[df_test.index, "close"], df.loc[df_test.index, "ema_0.05"]),
    return_metric(df.loc[df_test.index, "close"], df.loc[df_test.index, "ema_0.05"]),
)

(4.43656863979577, -46.279999999999944)

In [13]:
# (MAE, return metric) on the test rows for the alpha=0.1 exponential mean.
(
    mae(df.loc[df_test.index, "close"], df.loc[df_test.index, "ema_0.1"]),
    return_metric(df.loc[df_test.index, "close"], df.loc[df_test.index, "ema_0.1"]),
)

(2.9081286494513785, -60.11999999999989)

In [14]:
# Actual closes together with the two simple baselines on the test rows.
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_train.date, y=df_train.close, name="Train"))
fig.add_trace(go.Scatter(x=df_test.date, y=df_test.close, name="Test"))
fig.add_trace(go.Scatter(x=df_test.date, y=df.loc[df_test.index].prev, name="prev day"))
fig.add_trace(go.Scatter(x=df_test.date, y=df.loc[df_test.index]["ema_0.1"], name="ema 0.1"))
fig.update_layout(
    title="Baseline forecasts on the test period",
    xaxis_title="Date",
    yaxis_title="Close price",
)
fig

### Holt

In [15]:
def seq_fit_predict(smoothing_level=0.8, smoothing_trend=0.2):
    """Walk-forward one-step Holt forecasts, one per row of ``df_test``.

    For each test position a damped-trend Holt model is refitted on all
    closes of ``df`` that come before that position and asked for a
    single-step forecast.

    Reads the notebook-level ``df``, ``df_train`` and ``df_test``. Positions
    are derived from the lengths of the two parts, so this assumes the
    training rows are the first ``len(df_train)`` rows of ``df`` and the test
    rows follow them directly (TODO confirm: requires ``df`` sorted by date).

    Parameters
    ----------
    smoothing_level, smoothing_trend : float
        Fixed smoothing parameters passed to ``Holt.fit``. The defaults are
        the values that used to be hard-coded here.

    Returns
    -------
    list
        The forecasts in test-row order.
    """
    closes = df.close
    first = len(df_train)
    pred = []
    for end in range(first, first + len(df_test)):
        # Refit on the history strictly before position `end`.
        model = Holt(closes.iloc[:end], damped_trend=True, initialization_method="estimated")
        fitted = model.fit(smoothing_level=smoothing_level, smoothing_trend=smoothing_trend)
        pred.append(fitted.forecast(1).iloc[-1])
    return pred


# Holt forecasts for the test rows, stored next to the actual prices.
holt_pred = seq_fit_predict()
df_test = df_test.assign(holt=holt_pred)

In [16]:
# (MAE, return metric) of the Holt forecasts on the test rows.
(
    mae(df_test["close"], df_test["holt"]),
    return_metric(df_test["close"], df_test["holt"]),
)

(1.078452891452233, 14.859999999999928)

In [17]:
# Actual closes, the two simple baselines and the Holt forecast.
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_train.date, y=df_train.close, name="Train"))
fig.add_trace(go.Scatter(x=df_test.date, y=df_test.close, name="Test"))
fig.add_trace(go.Scatter(x=df_test.date, y=df.loc[df_test.index].prev, name="prev day"))
fig.add_trace(go.Scatter(x=df_test.date, y=df.loc[df_test.index]["ema_0.1"], name="ema 0.1"))
fig.add_trace(go.Scatter(x=df_test.date, y=df_test.holt, name="Holt"))
fig.update_layout(
    title="Baselines and Holt forecast on the test period",
    xaxis_title="Date",
    yaxis_title="Close price",
)
fig

In [18]:
# Display the test frame with the prediction columns added so far.
df_test

Unnamed: 0,date,open,high,low,close,volume,Name,ema_0.1,holt
1922,2015-09-28,113.850,114.5700,112.44,112.44,52109011,AAPL,113.711504,114.802663
1923,2015-09-29,112.830,113.5100,107.86,109.06,73365384,AAPL,113.246353,112.649499
1924,2015-09-30,110.170,111.5400,108.73,110.30,66473033,AAPL,112.951718,109.108017
1925,2015-10-01,109.070,109.6200,107.31,109.58,63929100,AAPL,112.614546,109.678271
1926,2015-10-02,108.010,111.0136,107.55,110.38,58019758,AAPL,112.391091,109.280410
...,...,...,...,...,...,...,...,...,...
2293,2017-03-20,140.400,141.5000,140.23,141.46,21542038,AAPL,138.306430,140.197269
2294,2017-03-21,142.110,142.8000,139.73,139.84,39529912,AAPL,138.459787,141.404096
2295,2017-03-22,139.845,141.6000,139.76,141.42,25860165,AAPL,138.755809,140.109928
2296,2017-03-23,141.260,141.5844,140.61,140.92,20346301,AAPL,138.972228,141.291362


In [19]:
# Training design matrix: the row's own OHLCV values as features,
# the following row's close as the target.
t = df.loc[df_train.index]
cols = ["open", "high", "low", "close", "volume"]
X, y = t[cols], t["next"]

In [20]:
from sklearn import svm

# Support-vector regression with the library's default settings,
# fitted on the features exactly as they are stored in `X`.
regr = svm.SVR().fit(X, y)
df_test["svm"] = regr.predict(df_test[cols])
return_metric(df_test["close"], df_test["svm"])

35.41999999999999

In [21]:
from sklearn.linear_model import BayesianRidge

# Bayesian ridge regression with default settings on the same features and target.
regr = BayesianRidge().fit(X, y)
df_test["bayes"] = regr.predict(df_test[cols])
return_metric(df_test["close"], df_test["bayes"])

-7.189999999999941

In [22]:
# The close together with its five previous values (shift1 … shift5), one column each.
shifts = pd.DataFrame(
    {
        "close": df["close"],
        **{f"shift{lag}": df["close"].shift(lag) for lag in range(1, 6)},
    }
)

In [23]:
# Display the lag table; the leading rows contain NaN where no earlier close exists.
shifts

Unnamed: 0,close,shift1,shift2,shift3,shift4,shift5
1259,67.8542,,,,,
1260,68.5614,67.8542,,,,
1261,66.8428,68.5614,67.8542,,,
1262,66.7156,66.8428,68.5614,67.8542,,
1263,66.6556,66.7156,66.8428,68.5614,67.8542,
...,...,...,...,...,...,...
2513,167.7800,167.4300,166.9700,167.9600,171.5100,171.11
2514,160.5000,167.7800,167.4300,166.9700,167.9600,171.51
2515,156.4900,160.5000,167.7800,167.4300,166.9700,167.96
2516,163.0300,156.4900,160.5000,167.7800,167.4300,166.97


In [24]:
# Compare the index labels of the test frame and of the lag table.
df_test.index, shifts.index

(Index([1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929, 1930, 1931,
        ...
        2288, 2289, 2290, 2291, 2292, 2293, 2294, 2295, 2296, 2297],
       dtype='int64', length=376),
 Index([1259, 1260, 1261, 1262, 1263, 1264, 1265, 1266, 1267, 1268,
        ...
        2508, 2509, 2510, 2511, 2512, 2513, 2514, 2515, 2516, 2517],
       dtype='int64', length=1259))

In [25]:
from sklearn.linear_model import LinearRegression

# Linear regression of the close on its five previous values.
shift_y = shifts["close"]
shift_x = shifts.drop(columns="close")
# The leading training rows are skipped: their lag columns still contain NaN.
lin_regr = LinearRegression().fit(
    shift_x.loc[df_train.index[6:]],
    shift_y.loc[df_train.index[6:]],
)
df_test["linear"] = lin_regr.predict(shift_x.loc[df_test.index])
return_metric(df_test["close"], df_test["linear"])

16.619999999999948

In [26]:
# Fitted weights, in the column order of shift_x (shift1 … shift5).
lin_regr.coef_

array([ 1.01318236, -0.03791411,  0.0175765 , -0.04419348,  0.04932033])

In [27]:
# Actual closes and the forecasts of every model built so far.
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_train.date, y=df_train.close, name="Train"))
fig.add_trace(go.Scatter(x=df_test.date, y=df_test.close, name="Test"))
fig.add_trace(go.Scatter(x=df_test.date, y=df.loc[df_test.index].prev, name="prev day"))
fig.add_trace(go.Scatter(x=df_test.date, y=df.loc[df_test.index]["ema_0.1"], name="ema 0.1"))
fig.add_trace(go.Scatter(x=df_test.date, y=df_test.holt, name="Holt"))
fig.add_trace(go.Scatter(x=df_test.date, y=df_test.svm, name="svm"))
fig.add_trace(go.Scatter(x=df_test.date, y=df_test.bayes, name="bayes"))
fig.add_trace(go.Scatter(x=df_test.date, y=df_test.linear, name="linear"))
fig.update_layout(
    title="Model forecasts on the test period",
    xaxis_title="Date",
    yaxis_title="Close price",
)
fig

In [28]:
# TODO standardise

In [29]:
# Residuals on the test rows: actual close minus each model's forecast.
r_holt = df_test["close"].sub(df_test["holt"])
r_linear = df_test["close"].sub(df_test["linear"])
r_bayes = df_test["close"].sub(df_test["bayes"])
r_ema = df_test["close"].sub(df_test["ema_0.1"])

In [30]:
# Residual curves of the individual models on the test rows.
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_test.date, y=r_holt, name="Holt"))
fig.add_trace(go.Scatter(x=df_test.date, y=r_linear, name="linear"))
fig.add_trace(go.Scatter(x=df_test.date, y=r_bayes, name="bayes"))
fig.add_trace(go.Scatter(x=df_test.date, y=r_ema, name="ema"))
fig.update_layout(
    title="Residuals on the test period",
    xaxis_title="Date",
    yaxis_title="Close minus forecast",
)
fig