## Regression

In [20]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from sklearn.linear_model import LinearRegression

In [21]:
# Read the intraday price file; the 'time' column is parsed as datetimes
# and used as the index of the resulting frame.
data = pd.read_csv(
    '../../resources/intraday.csv',
    index_col='time',
    parse_dates=['time'],
)

In [22]:
# Preview the first ten rows of the loaded frame.
data.head(n=10)

Unnamed: 0_level_0,Close
time,Unnamed: 1_level_1
2018-01-01 22:00:00+00:00,1.201205
2018-01-02 04:00:00+00:00,1.207055
2018-01-02 10:00:00+00:00,1.20444
2018-01-02 16:00:00+00:00,1.2058
2018-01-02 22:00:00+00:00,1.20469
2018-01-03 04:00:00+00:00,1.203825
2018-01-03 10:00:00+00:00,1.202355
2018-01-03 16:00:00+00:00,1.201445
2018-01-03 22:00:00+00:00,1.20145
2018-01-04 04:00:00+00:00,1.2043


In [23]:
# Preview the last ten rows of the loaded frame.
data.tail(n=10)

Unnamed: 0_level_0,Close
time,Unnamed: 1_level_1
2019-12-26 16:00:00+00:00,1.109655
2019-12-26 22:00:00+00:00,1.11189
2019-12-27 04:00:00+00:00,1.11386
2019-12-27 10:00:00+00:00,1.1163
2019-12-27 16:00:00+00:00,1.11758
2019-12-29 22:00:00+00:00,1.11992
2019-12-30 04:00:00+00:00,1.11994
2019-12-30 10:00:00+00:00,1.120095
2019-12-30 16:00:00+00:00,1.11992
2019-12-30 22:00:00+00:00,1.120355


In [24]:
# Show the 'Close' column on its own, converted back into a one-column frame.
data['Close'].to_frame()

Unnamed: 0_level_0,Close
time,Unnamed: 1_level_1
2018-01-01 22:00:00+00:00,1.201205
2018-01-02 04:00:00+00:00,1.207055
2018-01-02 10:00:00+00:00,1.204440
2018-01-02 16:00:00+00:00,1.205800
2018-01-02 22:00:00+00:00,1.204690
...,...
2019-12-29 22:00:00+00:00,1.119920
2019-12-30 04:00:00+00:00,1.119940
2019-12-30 10:00:00+00:00,1.120095
2019-12-30 16:00:00+00:00,1.119920


Next, we get more statistical insight into the dataset.

In [25]:
# Print the index range, column dtypes, non-null counts and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2070 entries, 2018-01-01 22:00:00+00:00 to 2019-12-30 22:00:00+00:00
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Close   2070 non-null   float64
dtypes: float64(1)
memory usage: 32.3 KB


In [26]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,Close
count,2070.0
mean,1.1504
std,0.041544
min,1.088635
25%,1.119714
50%,1.137698
75%,1.16917
max,1.25392


Since the only attribute we need is the closing price,
we reduce the data frame to that single column.


In [27]:
# Keep only the 'Close' column, as a one-column DataFrame.
data = data['Close'].to_frame()

In [28]:
# Check the first ten rows of the reduced frame.
data.head(n=10)

Unnamed: 0_level_0,Close
time,Unnamed: 1_level_1
2018-01-01 22:00:00+00:00,1.201205
2018-01-02 04:00:00+00:00,1.207055
2018-01-02 10:00:00+00:00,1.20444
2018-01-02 16:00:00+00:00,1.2058
2018-01-02 22:00:00+00:00,1.20469
2018-01-03 04:00:00+00:00,1.203825
2018-01-03 10:00:00+00:00,1.202355
2018-01-03 16:00:00+00:00,1.201445
2018-01-03 22:00:00+00:00,1.20145
2018-01-04 04:00:00+00:00,1.2043


With only the closing price left in the data frame,
we plot it: first over the full period, then for 2018 only.


In [29]:
# Line chart of the closing price over the whole sample.
close_trace = go.Scatter(x=data.index, y=data['Close'], name='Close')

fig = go.Figure(data=[close_trace])
fig.update_layout(
    title='EUR/USD',
    xaxis_title='Time',
    yaxis_title='Price',
)
fig.show()

In [30]:
# Same line chart, restricted to the rows selected by data.loc['2018'].
year_2018 = data.loc['2018']
close_2018_trace = go.Scatter(x=year_2018.index, y=year_2018['Close'], name='Close')

fig = go.Figure(data=[close_2018_trace])
fig.update_layout(
    title='EUR/USD',
    xaxis_title='Time',
    yaxis_title='Price',
)
fig.show()

We proceed to define a baseline and compute the log returns of the closing price.

In [31]:
# Log return of each row relative to the previous row: log(Close_t / Close_{t-1}).
# The first row has no predecessor, so its return is NaN.
# The ratio is taken from the 'Close' column explicitly rather than from the
# whole frame, so the cell does not depend on `data` having exactly one column
# and can be re-run after 'returns' (or other columns) have been added.
data['returns'] = np.log(data['Close'] / data['Close'].shift(1))

In [32]:
# Check the first ten rows now that the 'returns' column exists.
data.head(n=10)

Unnamed: 0_level_0,Close,returns
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-01 22:00:00+00:00,1.201205,
2018-01-02 04:00:00+00:00,1.207055,0.004858
2018-01-02 10:00:00+00:00,1.20444,-0.002169
2018-01-02 16:00:00+00:00,1.2058,0.001129
2018-01-02 22:00:00+00:00,1.20469,-0.000921
2018-01-03 04:00:00+00:00,1.203825,-0.000718
2018-01-03 10:00:00+00:00,1.202355,-0.001222
2018-01-03 16:00:00+00:00,1.201445,-0.000757
2018-01-03 22:00:00+00:00,1.20145,4e-06
2018-01-04 04:00:00+00:00,1.2043,0.002369


Predicting returns using linear regression. We start by adding a one-period lagged return column, `lag1`.

In [33]:
# Previous row's return, aligned with the current row.
data['lag1'] = data['returns'].shift(periods=1)

In [34]:
# Remove, in place, every row that contains at least one missing value
# (the leading rows where 'returns' / 'lag1' are undefined).
data.dropna(axis=0, how='any', inplace=True)

In [35]:
# Display the frame (Close, returns, lag1) after the rows with missing values were dropped.
data

Unnamed: 0_level_0,Close,returns,lag1
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-02 10:00:00+00:00,1.204440,-0.002169,0.004858
2018-01-02 16:00:00+00:00,1.205800,0.001129,-0.002169
2018-01-02 22:00:00+00:00,1.204690,-0.000921,0.001129
2018-01-03 04:00:00+00:00,1.203825,-0.000718,-0.000921
2018-01-03 10:00:00+00:00,1.202355,-0.001222,-0.000718
...,...,...,...
2019-12-29 22:00:00+00:00,1.119920,0.002092,0.001146
2019-12-30 04:00:00+00:00,1.119940,0.000018,0.002092
2019-12-30 10:00:00+00:00,1.120095,0.000138,0.000018
2019-12-30 16:00:00+00:00,1.119920,-0.000156,0.000138


In [36]:
# Scatter of each return against the previous row's return.
# The lagged return goes on the x-axis and the current return on the y-axis,
# so the plotted data agrees with the axis titles set below (the original
# passed them the other way round, mislabelling both axes).
fig = go.Figure()

fig.add_trace(go.Scatter(x=data.lag1, y=data.returns, mode='markers'))

fig.update_layout(title='EUR/USD', xaxis_title='lag1', yaxis_title='Returns')

fig.show()
