#### Proof of concept

Идея:

* Создадим разные секретные модели. Даже не обязательно, чтобы они показывали высокую точность.
* Создадим одну модель для остатков.
* Убедимся в том, что локлаьные модели для остатков повышают скор соответсвующих секретных моделей.
* Убедимся в том, что модель для остатоков получает буст от федеративного обучения.



In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Image
from IPython.display import Markdown as md
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import datetime
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt

import warnings

warnings.filterwarnings("ignore")


df = pd.read_csv("Datasets/all_stocks_5yr.csv")
df["date"] = pd.to_datetime(df["date"])

# https://www.kaggle.com/code/avikumart/timeseries-stock-price-analysis-forecasting/input
df = df[df.Name == "AAPL"]

In [16]:
y = df.close

In [42]:
def return_metric(y_true, y_pred):
    assert len(y_true) == len(y_pred)
    c = len(y_true)-1
    cur = y_true[:c]
    next = y_true.shift(-1)[:c]
    # return y_pred[1:].values-cur.values
    # return (next-cur)
    return sum((next-cur)*np.sign(y_pred[1:].values-cur.values))

In [2]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=df.date, y=df.close, name="close"))
# fig.update_layout(    xaxis_title="№ month")

In [3]:
d = datetime.date(2015, 9, 27)
test_finish = datetime.date(2017, 3, 27)
df_train = df[df.date.dt.date < d]
df_test = df[(df.date.dt.date >= d) & (df.date.dt.date < test_finish)]

In [4]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_train.date, y=df_train.close, name="Train"))
fig.add_trace(go.Scatter(x=df_test.date, y=df_test.close, name="Test"))

In [5]:
df["next"] = df.close.shift(-1)
df["prev"] = df.close.shift(1)

In [6]:
from sklearn.metrics import mean_absolute_error as mae

### Prev day

In [45]:
(mae(df.loc[df_test.index].close, df.loc[df_test.index].prev),
 return_metric(df.loc[df_test.index].close, df.loc[df_test.index].prev))

(1.0673138297872335, 0.0)

In [8]:
df["ema_0.05"] = df.close.ewm(alpha=0.05).mean()
df["ema_0.1"] = df.close.ewm(alpha=0.1).mean()

In [46]:
(
    mae(df.loc[df_test.index].close, df.loc[df_test.index]["ema_0.05"]),
    return_metric(df.loc[df_test.index].close, df.loc[df_test.index]["ema_0.05"]),
)

(4.43656863979577, -39.499999999999915)

In [47]:
(
    mae(df.loc[df_test.index].close, df.loc[df_test.index]["ema_0.1"]),
    return_metric(df.loc[df_test.index].close, df.loc[df_test.index]["ema_0.1"]),
)

(2.9081286494513785, -46.239999999999895)

In [11]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_train.date, y=df_train.close, name="Train"))
fig.add_trace(go.Scatter(x=df_test.date, y=df_test.close, name="Test"))
fig.add_trace(go.Scatter(x=df_test.date, y=df.loc[df_test.index].prev, name="prev day"))
fig.add_trace(
    go.Scatter(x=df_test.date, y=df.loc[df_test.index]["ema_0.1"], name="ema 0.1")
)

### Holt

In [12]:
def seq_fit_predict():
    pred = []
    for i in range(len(df_train), len(df_train)+len(df_test)):
        fit3 = Holt(df.close.iloc[:i], damped_trend=True, initialization_method="estimated").fit(
            smoothing_level=0.8, smoothing_trend=0.2)
        fcast3 = fit3.forecast(1)
        pred.append(fcast3.iloc[-1])
    return pred


holt_pred = seq_fit_predict()
df_test['holt'] = holt_pred

In [48]:
(
    mae(df_test.close, df_test.holt),
    return_metric(df_test.close, df_test.holt)
)

(1.078452891452233, 11.379999999999939)

In [14]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_train.date, y=df_train.close, name="Train"))
fig.add_trace(go.Scatter(x=df_test.date, y=df_test.close, name="Test"))
fig.add_trace(go.Scatter(x=df_test.date,
              y=df.loc[df_test.index].prev, name="prev day"))
fig.add_trace(go.Scatter(x=df_test.date,
              y=df.loc[df_test.index]["ema_0.1"], name="ema 0.1"))
fig.add_trace(go.Scatter(x=df_test.date,
              y=df_test.holt, name="Holt"))

In [56]:
df_test

Unnamed: 0,date,open,high,low,close,volume,Name,holt
1922,2015-09-28,113.850,114.5700,112.44,112.44,52109011,AAPL,114.802663
1923,2015-09-29,112.830,113.5100,107.86,109.06,73365384,AAPL,112.649499
1924,2015-09-30,110.170,111.5400,108.73,110.30,66473033,AAPL,109.108017
1925,2015-10-01,109.070,109.6200,107.31,109.58,63929100,AAPL,109.678271
1926,2015-10-02,108.010,111.0136,107.55,110.38,58019758,AAPL,109.280410
...,...,...,...,...,...,...,...,...
2293,2017-03-20,140.400,141.5000,140.23,141.46,21542038,AAPL,140.197269
2294,2017-03-21,142.110,142.8000,139.73,139.84,39529912,AAPL,141.404096
2295,2017-03-22,139.845,141.6000,139.76,141.42,25860165,AAPL,140.109928
2296,2017-03-23,141.260,141.5844,140.61,140.92,20346301,AAPL,141.291362


In [62]:
t = df.loc[df_train.index]#.drop(['date', 'Name'], axis=1)
cols = ['open', 'high', 'low', 'close','volume']
X = t[cols]
y = t.next

In [70]:
from sklearn import svm
regr = svm.SVR()
regr.fit(X, y)
df_test['svm'] = regr.predict(df_test[cols])
return_metric(df_test.close, df_test.svm)


49.420000000000016

In [74]:
X

Unnamed: 0,open,high,low,close,volume
1259,67.7142,68.4014,66.8928,67.8542,158168416
1260,68.0714,69.2771,67.6071,68.5614,129029425
1261,68.5014,68.9114,66.8205,66.8428,151829363
1262,66.7442,67.6628,66.1742,66.7156,118721995
1263,66.3599,67.3771,66.2885,66.6556,88809154
...,...,...,...,...,...
1917,113.6700,115.3700,113.6600,115.2100,50221965
1918,113.3800,114.1800,112.5201,113.4000,50346159
1919,113.6300,114.7200,113.3000,114.3200,35756716
1920,113.2500,115.5000,112.3700,115.0000,50219475


In [71]:
from sklearn.linear_model import BayesianRidge
regr = BayesianRidge()
regr.fit(X, y)
df_test['bayes'] = regr.predict(df_test[cols])
return_metric(df_test.close, df_test.bayes)


397.1799999999997

In [75]:
from sklearn.linear_model import LinearRegression
regr = LinearRegression()
regr.fit(X, y)
df_test['linear'] = regr.predict(df_test[cols])
return_metric(df_test.close, df_test.linear)


397.3599999999997

In [79]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_train.date, y=df_train.close, name="Train"))
fig.add_trace(go.Scatter(x=df_test.date, y=df_test.close, name="Test"))
fig.add_trace(go.Scatter(x=df_test.date,              y=df.loc[df_test.index].prev, name="prev day"))
fig.add_trace(go.Scatter(x=df_test.date,
              y=df.loc[df_test.index]["ema_0.1"], name="ema 0.1"))
fig.add_trace(go.Scatter(x=df_test.date,              y=df_test.holt, name="Holt"))
fig.add_trace(go.Scatter(x=df_test.date,              y=df_test.svm, name="svm"))
fig.add_trace(go.Scatter(x=df_test.date,              y=df_test.bayes, name="bayes"))
fig.add_trace(go.Scatter(x=df_test.date,              y=df_test.linear, name="linear"))