#### Proof of concept

Идея:

* Создадим разные секретные модели. Даже не обязательно, чтобы они показывали высокую точность.
* Создадим одну модель для остатков.
* Убедимся в том, что локальные модели для остатков повышают скор соответствующих секретных моделей.
* Убедимся в том, что модель для остатков получает буст от федеративного обучения.



In [53]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Image
from IPython.display import Markdown as md
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import datetime
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt

import warnings

warnings.filterwarnings("ignore")


# Data source:
# https://www.kaggle.com/code/avikumart/timeseries-stock-price-analysis-forecasting/input
df = pd.read_csv("Datasets/all_stocks_5yr.csv")
df["date"] = pd.to_datetime(df["date"])

# Keep a single ticker. The explicit .copy() makes `df` an independent frame, so
# columns added to it in later cells are not flagged by pandas as writes to a
# filtered selection of the full table.
df = df[df.Name == "AAPL"].copy()

In [54]:
# Full close-price series of the selected ticker.
# NOTE(review): `y` is reassigned to the training target before the first model
# fit further down, so this value appears to be unused.
y = df.close

In [55]:
# NOTE(review): `df_test` and its "holt" column are only created further down
# the notebook, so this cell works only after those cells have been executed.
y_true = df_test["close"]
y_pred = df_test["holt"]

In [56]:
y_pred

1922    114.802663
1923    112.649499
1924    109.108017
1925    109.678271
1926    109.280410
           ...    
2293    140.197269
2294    141.404096
2295    140.109928
2296    141.291362
2297    141.053439
Name: holt, Length: 376, dtype: float64

In [57]:
y_true

1922    112.44
1923    109.06
1924    110.30
1925    109.58
1926    110.38
         ...  
2293    141.46
2294    139.84
2295    141.42
2296    140.92
2297    140.64
Name: close, Length: 376, dtype: float64

In [58]:
y_pred

1922    114.802663
1923    112.649499
1924    109.108017
1925    109.678271
1926    109.280410
           ...    
2293    140.197269
2294    141.404096
2295    140.109928
2296    141.291362
2297    141.053439
Name: holt, Length: 376, dtype: float64

In [59]:
# NOTE(review): `cur` is first assigned in the following cell; on a fresh kernel
# this display cell raises NameError.
cur

1922    112.44
1923    109.06
1924    110.30
1925    109.58
1926    110.38
         ...  
2292    139.99
2293    141.46
2294    139.84
2295    141.42
2296    140.92
Name: close, Length: 375, dtype: float64

In [60]:
# Scratch check of the one-step price changes: pair every close (except the
# last one, which has no successor) with the close that follows it.
c = len(y_true) - 1
cur = y_true.iloc[:c]
# Named `next_close` instead of `next` so the builtin next() is not shadowed
# for the rest of the kernel session.
next_close = y_true.shift(-1).iloc[:c]

next_close - cur

1922   -3.38
1923    1.24
1924   -0.72
1925    0.80
1926    0.40
        ... 
2292    1.47
2293   -1.62
2294    1.58
2295   -0.50
2296   -0.28
Name: close, Length: 375, dtype: float64

In [61]:
# Per-step payoffs from the most recent return_metric() call (debugging aid).
DAILY = None


def return_metric(y_true, y_pred):
    """Total payoff of a sign-following strategy over consecutive observations.

    For every position ``t`` except the last, the strategy holds +1 unit when
    ``y_pred[t] > y_true[t]``, -1 unit when ``y_pred[t] < y_true[t]`` and no
    position when they are equal.  The payoff of that step is the position
    multiplied by ``y_true[t + 1] - y_true[t]``.

    Parameters
    ----------
    y_true : pandas.Series
        Observed values, one per step.
    y_pred : pandas.Series, numpy.ndarray or sequence
        Predictions matched to ``y_true`` by position.  Only the values are
        used; an index, if present, is ignored.

    Returns
    -------
    number
        Sum of the per-step payoffs (``0`` when there are fewer than two
        observations).

    Raises
    ------
    AssertionError
        If the two inputs differ in length.

    Notes
    -----
    The per-step payoff series is also stored in the module-level ``DAILY``.
    """
    global DAILY
    assert len(y_true) == len(y_pred)

    # Explicit positional slicing: the last row is dropped because it has no
    # following observation to realise a payoff against.
    cur = y_true.iloc[:-1]
    following = y_true.shift(-1).iloc[:-1]
    # np.asarray accepts Series, ndarrays and plain sequences alike.
    predicted = np.asarray(y_pred)[:-1]

    daily_return = (following - cur) * np.sign(predicted - cur.values)
    DAILY = daily_return
    return sum(daily_return)

In [62]:
# Closing-price history of the selected ticker.
fig = go.Figure(data=[go.Scatter(x=df["date"], y=df["close"], name="close")])
fig

In [63]:
# Compounding illustration: 10,000 growing by 1% per step over 365 steps.
10_000 * 1.01**365

377834.3433288728

In [64]:
# Chronological split: rows dated before `d` form the training set; rows from
# `d` up to (but excluding) `test_finish` form the test set.
d = datetime.date(2015, 9, 27)
test_finish = datetime.date(2017, 3, 27)
# .copy() gives each split its own frame, so prediction columns can be added to
# df_test in later cells without pandas treating them as writes to a selection
# of `df`.
df_train = df[df.date.dt.date < d].copy()
df_test = df[(df.date.dt.date >= d) & (df.date.dt.date < test_finish)].copy()

In [65]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_train.date, y=df_train.close, name="Train"))
fig.add_trace(go.Scatter(x=df_test.date, y=df_test.close, name="Test"))

In [66]:
# Neighbouring closes for every row: "next" is the close one row later,
# "prev" the close one row earlier (NaN at the respective end of the frame).
df["next"] = df["close"].shift(periods=-1)
df["prev"] = df["close"].shift(periods=1)

In [67]:
from sklearn.metrics import mean_absolute_error as mae

### Prev day

In [68]:
# Naive baseline on the test rows: the previous row's close is the forecast.
# Result is (mean absolute error, strategy return).
(
    mae(df.loc[df_test.index, "close"], df.loc[df_test.index, "prev"]),
    return_metric(df.loc[df_test.index, "close"], df.loc[df_test.index, "prev"]),
)

(1.0673138297872335, 24.370000000000005)

In [69]:
# Exponentially weighted means of the close for two smoothing factors; the
# alpha=0.1 variant is also copied onto the test frame.
df["ema_0.05"] = df["close"].ewm(alpha=0.05).mean()
df["ema_0.1"] = df["close"].ewm(alpha=0.1).mean()
df_test["ema_0.1"] = df.loc[df_test.index, "ema_0.1"]

In [70]:
# EMA (alpha=0.05) scored on the test rows: (mean absolute error, strategy return).
(
    mae(df.loc[df_test.index, "close"], df.loc[df_test.index, "ema_0.05"]),
    return_metric(
        df.loc[df_test.index, "close"], df.loc[df_test.index, "ema_0.05"]
    ),
)

(4.43656863979577, -46.279999999999944)

In [71]:
# EMA (alpha=0.1) scored on the test rows: (mean absolute error, strategy return).
(
    mae(df.loc[df_test.index, "close"], df.loc[df_test.index, "ema_0.1"]),
    return_metric(
        df.loc[df_test.index, "close"], df.loc[df_test.index, "ema_0.1"]
    ),
)

(2.9081286494513785, -60.11999999999989)

In [72]:
# Train/test closes together with the two baseline forecasts on the test window.
fig = go.Figure(
    data=[
        go.Scatter(x=df_train["date"], y=df_train["close"], name="Train"),
        go.Scatter(x=df_test["date"], y=df_test["close"], name="Test"),
        go.Scatter(
            x=df_test["date"], y=df.loc[df_test.index, "prev"], name="prev day"
        ),
        go.Scatter(
            x=df_test["date"], y=df.loc[df_test.index, "ema_0.1"], name="ema 0.1"
        ),
    ]
)
fig

### Holt

In [73]:
def seq_fit_predict(
    series=None,
    start=None,
    n_steps=None,
    smoothing_level=0.8,
    smoothing_trend=0.2,
):
    """Walk-forward one-step forecasts from a damped-trend Holt model.

    For each position ``i`` in ``range(start, start + n_steps)`` a new Holt
    model is fitted on ``series.iloc[:i]`` and its one-step forecast is
    recorded, so every forecast only uses rows that precede position ``i``.

    Parameters
    ----------
    series : pandas.Series, optional
        Series to forecast.  Defaults to the notebook-level ``df.close``.
    start : int, optional
        Position of the first forecast.  Defaults to ``len(df_train)``, which
        assumes the training rows are the leading rows of ``series``.
    n_steps : int, optional
        Number of consecutive forecasts.  Defaults to ``len(df_test)``.
    smoothing_level, smoothing_trend : float
        Fixed smoothing parameters passed to ``Holt.fit``.

    Returns
    -------
    list
        One forecast value per step, in order.
    """
    # Defaults are resolved at call time so the no-argument call keeps reading
    # the notebook's current frames.
    if series is None:
        series = df.close
    if start is None:
        start = len(df_train)
    if n_steps is None:
        n_steps = len(df_test)

    pred = []
    for i in range(start, start + n_steps):
        fitted = Holt(
            series.iloc[:i], damped_trend=True, initialization_method="estimated"
        ).fit(smoothing_level=smoothing_level, smoothing_trend=smoothing_trend)
        pred.append(fitted.forecast(1).iloc[-1])
    return pred


holt_pred = seq_fit_predict()
df_test["holt"] = holt_pred

In [74]:
# Holt forecasts scored on the test window: (mean absolute error, strategy return).
(
    mae(df_test["close"], df_test["holt"]),
    return_metric(df_test["close"], df_test["holt"]),
)

(1.078452891452233, 14.859999999999928)

In [75]:
# Train/test closes plus the prev-day, EMA and Holt forecasts on the test window.
fig = go.Figure(
    data=[
        go.Scatter(x=df_train["date"], y=df_train["close"], name="Train"),
        go.Scatter(x=df_test["date"], y=df_test["close"], name="Test"),
        go.Scatter(
            x=df_test["date"], y=df.loc[df_test.index, "prev"], name="prev day"
        ),
        go.Scatter(
            x=df_test["date"], y=df.loc[df_test.index, "ema_0.1"], name="ema 0.1"
        ),
        go.Scatter(x=df_test["date"], y=df_test["holt"], name="Holt"),
    ]
)
fig

In [76]:
df_test

Unnamed: 0,date,open,high,low,close,volume,Name,ema_0.1,holt
1922,2015-09-28,113.850,114.5700,112.44,112.44,52109011,AAPL,113.711504,114.802663
1923,2015-09-29,112.830,113.5100,107.86,109.06,73365384,AAPL,113.246353,112.649499
1924,2015-09-30,110.170,111.5400,108.73,110.30,66473033,AAPL,112.951718,109.108017
1925,2015-10-01,109.070,109.6200,107.31,109.58,63929100,AAPL,112.614546,109.678271
1926,2015-10-02,108.010,111.0136,107.55,110.38,58019758,AAPL,112.391091,109.280410
...,...,...,...,...,...,...,...,...,...
2293,2017-03-20,140.400,141.5000,140.23,141.46,21542038,AAPL,138.306430,140.197269
2294,2017-03-21,142.110,142.8000,139.73,139.84,39529912,AAPL,138.459787,141.404096
2295,2017-03-22,139.845,141.6000,139.76,141.42,25860165,AAPL,138.755809,140.109928
2296,2017-03-23,141.260,141.5844,140.61,140.92,20346301,AAPL,138.972228,141.291362


In [77]:
# Training rows are re-read from `df` (not `df_train`) because the "next"
# column was added to `df` after the split was taken.
t = df.loc[df_train.index]
# Single-feature setup; the wider alternative is
# ['open', 'high', 'low', 'close', 'volume'].
cols = ['close']
X = t[cols]
# Target: the close of the following row.
y = t["next"]

In [78]:
from sklearn import svm

# Support-vector regressor with default hyper-parameters, fitted on (X, y) and
# scored on the test window with the strategy return.
regr = svm.SVR().fit(X, y)
df_test["svm"] = regr.predict(df_test[cols])
return_metric(df_test["close"], df_test["svm"])

23.1400000000001

In [79]:
X

Unnamed: 0,close
1259,67.8542
1260,68.5614
1261,66.8428
1262,66.7156
1263,66.6556
...,...
1917,115.2100
1918,113.4000
1919,114.3200
1920,115.0000


In [80]:
from sklearn.linear_model import BayesianRidge

# Bayesian ridge regression with default settings, fitted on (X, y) and scored
# on the test window with the strategy return.
regr = BayesianRidge().fit(X, y)
df_test["bayes"] = regr.predict(df_test[cols])
return_metric(df_test["close"], df_test["bayes"])

4.420000000000016

In [82]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares fit on (X, y), scored on the test window with the
# strategy return.
lin_regr = LinearRegression().fit(X, y)
df_test["linear"] = lin_regr.predict(df_test[cols])
return_metric(df_test["close"], df_test["linear"])

4.420000000000016

In [83]:
lin_regr.coef_

array([0.99790193])

In [84]:
# Train/test closes together with every forecast produced so far on the test window.
fig = go.Figure(
    data=[
        go.Scatter(x=df_train["date"], y=df_train["close"], name="Train"),
        go.Scatter(x=df_test["date"], y=df_test["close"], name="Test"),
        go.Scatter(
            x=df_test["date"], y=df.loc[df_test.index, "prev"], name="prev day"
        ),
        go.Scatter(
            x=df_test["date"], y=df.loc[df_test.index, "ema_0.1"], name="ema 0.1"
        ),
        go.Scatter(x=df_test["date"], y=df_test["holt"], name="Holt"),
        go.Scatter(x=df_test["date"], y=df_test["svm"], name="svm"),
        go.Scatter(x=df_test["date"], y=df_test["bayes"], name="bayes"),
        go.Scatter(x=df_test["date"], y=df_test["linear"], name="linear"),
    ]
)
fig

In [85]:
# TODO standardise

In [86]:
# Residuals on the test window: actual close minus each model's forecast.
r_holt = df_test["close"].sub(df_test["holt"])
r_linear = df_test["close"].sub(df_test["linear"])
r_bayes = df_test["close"].sub(df_test["bayes"])
r_ema = df_test["close"].sub(df_test["ema_0.1"])

In [87]:
# Residual series of each model over the test window.
fig = go.Figure(
    data=[
        go.Scatter(x=df_test["date"], y=r_holt, name="Holt"),
        go.Scatter(x=df_test["date"], y=r_linear, name="linear"),
        go.Scatter(x=df_test["date"], y=r_bayes, name="bayes"),
        go.Scatter(x=df_test["date"], y=r_ema, name="ema"),
    ]
)
fig