In [169]:
import datetime
from collections import OrderedDict, deque
import math
import numpy as np
import json, codecs

In [None]:
# Check if numbers are calculated correctly!

In [163]:
datafile = "EURUSD-2018-01.csv"
with open(datafile) as f:
    lines = f.readlines()

In [8]:
def parse_time(text):    
    year = int(text[0:4])
    month = int(text[4:6])
    day = int(text[6:8])
    
    hour = int(text[9:11])
    min = int(text[12:14])
    sec = int(text[15:17])
    return datetime.datetime(year, month, day, hour, min, sec)

In [63]:
time = []
price = []
for line in lines:
    line_split = line.split(",")
    price.append(0.5*(float(line_split[2]) + float(line_split[3]))) # Ask-Bid-Mitte
    time.append(parse_time(line_split[1]))

In [44]:
def map_datetime(dt):
    dt0 = dt
    return dt0.replace(minute=(dt.minute // 15) * 15, second=0)

In [45]:
# determine intial_date
initial_date = map_datetime(time[0])

In [46]:
initial_date, time[0], str(initial_date)

(datetime.datetime(2018, 1, 1, 22, 0),
 datetime.datetime(2018, 1, 1, 22, 1, 1),
 '2018-01-01 22:00:00')

In [47]:
reduced_time = list(map(lambda a: map_datetime(a), time))

In [64]:
# define buckets of prices [(t1_str, [(t1, p1), (t2, p2), (t3, p3)]), (t2_str, [(t4, p4), (t5, p5), (t6, p6)]), ...]

buckets = OrderedDict()
for t, p in zip(time, price):
    printed_time = str(map_datetime(t))
    if printed_time not in buckets:
        buckets[printed_time] = []
        
    buckets[printed_time].append((t, p))

In [65]:
def get_ohlc(bucket):
    o, c = bucket[0], bucket[-1]
    h = max(bucket, key=lambda a: a[1])
    l = min(bucket, key=lambda a: a[1])
    return o, h, l, c

In [66]:
# calculate ohlc data

ohlc = OrderedDict()
for t, bucket in buckets.items():
    ohlc[t] = get_ohlc(bucket)

In [90]:
list(ohlc.items())[0:5]

[('2018-01-01 22:00:00',
  ((datetime.datetime(2018, 1, 1, 22, 1, 1), 1.20105),
   (datetime.datetime(2018, 1, 1, 22, 9, 25), 1.20123),
   (datetime.datetime(2018, 1, 1, 22, 7, 37), 1.200465),
   (datetime.datetime(2018, 1, 1, 22, 14, 54), 1.201085))),
 ('2018-01-01 22:15:00',
  ((datetime.datetime(2018, 1, 1, 22, 15, 2), 1.2010450000000001),
   (datetime.datetime(2018, 1, 1, 22, 15, 5), 1.20121),
   (datetime.datetime(2018, 1, 1, 22, 18, 13), 1.20059),
   (datetime.datetime(2018, 1, 1, 22, 29, 58), 1.200745))),
 ('2018-01-01 22:30:00',
  ((datetime.datetime(2018, 1, 1, 22, 30, 3), 1.2007349999999999),
   (datetime.datetime(2018, 1, 1, 22, 36, 30), 1.2016550000000001),
   (datetime.datetime(2018, 1, 1, 22, 30, 21), 1.20069),
   (datetime.datetime(2018, 1, 1, 22, 44, 4), 1.201505))),
 ('2018-01-01 22:45:00',
  ((datetime.datetime(2018, 1, 1, 22, 45, 4), 1.201495),
   (datetime.datetime(2018, 1, 1, 22, 59, 3), 1.2016),
   (datetime.datetime(2018, 1, 1, 22, 50, 6), 1.201415),
   (datetime

In [92]:
closing = list(map(lambda t_v: (t_v[0], t_v[1][3][1]), ohlc.items()))

In [93]:
closing[0:5]

[('2018-01-01 22:00:00', 1.201085),
 ('2018-01-01 22:15:00', 1.200745),
 ('2018-01-01 22:30:00', 1.201505),
 ('2018-01-01 22:45:00', 1.20147),
 ('2018-01-01 23:00:00', 1.201485)]

In [123]:
# calculate 8-delayed-log-returns of closing prices
n = 8
log_returns = []
lag = deque()
last_price = None
for t, v in closing:
    if last_price is not None:
        lag.append(math.log(v / last_price))
        while len(lag) > n:
            lag.popleft()
            
        if len(lag) == n:
            log_returns.append((t, list(lag)))
        
    last_price = v

In [126]:
len(log_returns) % 96

0

In [132]:
log_returns[0:2]

[('2018-01-02 00:00:00',
  [-0.0002831174581640054,
   0.0006327401605630536,
   -2.9130556915841128e-05,
   1.2484628301531433e-05,
   -8.323379962920257e-05,
   -0.0003246780304751021,
   7.909795036377992e-05,
   0.00015817713374405878]),
 ('2018-01-02 00:15:00',
  [0.0006327401605630536,
   -2.9130556915841128e-05,
   1.2484628301531433e-05,
   -8.323379962920257e-05,
   -0.0003246780304751021,
   7.909795036377992e-05,
   0.00015817713374405878,
   0.00014982894556763034])]

In [147]:
z_score_clusters = OrderedDict()
for n, t_vs in enumerate(log_returns):
    i = n // 96
    if i not in z_score_clusters:
        z_score_clusters[i] = []
        
    z_score_clusters[i].append(t_vs[1])

In [140]:
def calc_z_scores_parameters(cluster):
    cluster0 = np.asarray(cluster)
    mean = np.mean(cluster0, axis=0)
    variance = np.var(cluster0, axis=0)
    return mean, variance

In [187]:
def z_transform(value, mean, variance):
    result = (np.asarray(value) - mean) / variance
    return result.tolist()

In [188]:
z_score_transformed = []
for n, t_vs in enumerate(log_returns):
    i = n // 96
    mean, variance = calc_z_scores_parameters(z_score_clusters[i])
    z_score_transformed.append([t_vs[0], z_transform(t_vs[1], mean, variance)])

In [189]:
z_score_transformed

[['2018-01-02 00:00:00',
  [-2646.390659110688,
   4886.651563709774,
   -560.9552471225117,
   -209.75597567873007,
   -1046.9016386631713,
   -3118.2056347238204,
   269.6761650199287,
   970.5662280906295]],
 ['2018-01-02 00:15:00',
  [4868.923618417833,
   -593.5133618082493,
   -205.616132729908,
   -1027.352257874837,
   -3096.245213457661,
   301.2866983177711,
   947.5417263297229,
   899.6779846482139]],
 ['2018-01-02 00:30:00',
  [-562.2333116961797,
   -248.94746340158142,
   -1022.9259977298724,
   -3089.6918696385833,
   330.94668543114614,
   970.9913714641291,
   875.9811838172707,
   -1432.7952996898293]],
 ['2018-01-02 00:45:00',
  [-220.7487957287268,
   -1041.478025064418,
   -3084.5431385847387,
   359.23377520014895,
   1002.1593120182105,
   900.292353150119,
   -1478.6131116193155,
   1677.028482679365]],
 ['2018-01-02 01:00:00',
  [-1006.1919184820788,
   -3040.590798009486,
   363.1742918400954,
   1034.7029374977917,
   931.3011028963771,
   -1425.954756926889

In [156]:
log_returns

[('2018-01-02 00:00:00',
  [-0.0002831174581640054,
   0.0006327401605630536,
   -2.9130556915841128e-05,
   1.2484628301531433e-05,
   -8.323379962920257e-05,
   -0.0003246780304751021,
   7.909795036377992e-05,
   0.00015817713374405878]),
 ('2018-01-02 00:15:00',
  [0.0006327401605630536,
   -2.9130556915841128e-05,
   1.2484628301531433e-05,
   -8.323379962920257e-05,
   -0.0003246780304751021,
   7.909795036377992e-05,
   0.00015817713374405878,
   0.00014982894556763034]),
 ('2018-01-02 00:30:00',
  [-2.9130556915841128e-05,
   1.2484628301531433e-05,
   -8.323379962920257e-05,
   -0.0003246780304751021,
   7.909795036377992e-05,
   0.00015817713374405878,
   0.00014982894556763034,
   -0.00012485589564956867]),
 ('2018-01-02 00:45:00',
  [1.2484628301531433e-05,
   -8.323379962920257e-05,
   -0.0003246780304751021,
   7.909795036377992e-05,
   0.00015817713374405878,
   0.00014982894556763034,
   -0.00012485589564956867,
   0.0002413740018628394]),
 ('2018-01-02 01:00:00',
  [-8

In [157]:
(-0.0002831174581640054 - 3.93862932e-05) / 1.21865512e-07

-2646.3906487670233

In [190]:
def save_data_structure(structure, file):
    json.dump(structure, codecs.open(file, 'w', encoding='utf-8'), sort_keys=True, indent=4)

In [191]:
save_data_structure(z_score_transformed, datafile+"-z_score_transformed")

In [184]:
z_score_transformed[0]

['2018-01-02 00:00:00',
 array([-2646.39065911,  4886.65156371,  -560.95524712,  -209.75597568,
        -1046.90163866, -3118.20563472,   269.67616502,   970.56622809])]