In [15]:
import pandas as pd
from sklearn import preprocessing
from collections import deque
import random
import numpy as np
import time
import tensorflow as tf

In [2]:
main_df = pd.DataFrame() # begin empty

ratios = ["BTC-USD", "LTC-USD", "BCH-USD", "ETH-USD"]  # the 4 ratios we want to consider
for ratio in ratios:  # begin iteration
    print(ratio)
    dataset = f'crypto_data/{ratio}.csv'  # get the full path to the file.
    df = pd.read_csv(dataset, names=['time', 'low', 'high', 'open', 'close', 'volume'])  # read in specific file

    # rename volume and close to include the ticker so we can still which close/volume is which:
    df.rename(columns={"close": f"{ratio}_close", "volume": f"{ratio}_volume"}, inplace=True)

    df.set_index("time", inplace=True)  # set time as index so we can join them on this shared time
    df = df[[f"{ratio}_close", f"{ratio}_volume"]]  # ignore the other columns besides price and volume

    if len(main_df)==0:  # if the dataframe is empty
        main_df = df  # then it's just the current df
    else:  # otherwise, join this data to the main one
        main_df = main_df.join(df)

main_df.fillna(method="ffill", inplace=True)  # if there are gaps in data, use previously known values
main_df.dropna(inplace=True)
print(main_df.head())  # how did we do??

BTC-USD
LTC-USD
BCH-USD
ETH-USD
            BTC-USD_close  BTC-USD_volume  LTC-USD_close  LTC-USD_volume  \
time                                                                       
1528968720    6487.379883        7.706374      96.660004      314.387024   
1528968780    6479.410156        3.088252      96.570000       77.129799   
1528968840    6479.410156        1.404100      96.500000        7.216067   
1528968900    6479.979980        0.753000      96.389999      524.539978   
1528968960    6480.000000        1.490900      96.519997       16.991997   

            BCH-USD_close  BCH-USD_volume  ETH-USD_close  ETH-USD_volume  
time                                                                      
1528968720     870.859985       26.856577      486.01001       26.019083  
1528968780     870.099976        1.124300      486.00000        8.449400  
1528968840     870.789978        1.749862      485.75000       26.994646  
1528968900     870.000000        1.680500      486.00000    

In [3]:
SEQ_LEN = 60  # how long of a preceeding sequence to collect for RNN
FUTURE_PERIOD_PREDICT = 3  # how far into the future are we trying to predict?
RATIO_TO_PREDICT = "LTC-USD"

In [4]:
def classify(current, future):
    if float(future) > float(current):
        return 1
    else:
        return 0

In [5]:
main_df['future'] = main_df[f'{RATIO_TO_PREDICT}_close'].shift(-FUTURE_PERIOD_PREDICT)  
# wartosc future z zerowego wiersza jest wartoscia LTC-USD_close z trzeciego wiersza (pierwszy z czwartego itd)

In [6]:
main_df['target'] = list(map(classify, main_df[f'{RATIO_TO_PREDICT}_close'], main_df['future']))
#  jezeli wartosc_close < future to zwraca 1, a w przeciwnym razie 0

In [7]:
print(main_df.head(10))  # dane podaja ceny co 60 minut

            BTC-USD_close  BTC-USD_volume  LTC-USD_close  LTC-USD_volume  \
time                                                                       
1528968720    6487.379883        7.706374      96.660004      314.387024   
1528968780    6479.410156        3.088252      96.570000       77.129799   
1528968840    6479.410156        1.404100      96.500000        7.216067   
1528968900    6479.979980        0.753000      96.389999      524.539978   
1528968960    6480.000000        1.490900      96.519997       16.991997   
1528969020    6477.220215        2.731950      96.440002       95.524078   
1528969080    6480.000000        2.174240      96.470001      175.205307   
1528969140    6479.990234        0.903100      96.400002       43.652802   
1528969200    6478.660156        3.258786      96.400002        8.160000   
1528969260    6478.660156        1.970352      96.400002       20.425900   

            BCH-USD_close  BCH-USD_volume  ETH-USD_close  ETH-USD_volume  \
time       

# Podział na train test dev sety

W przypadku time-series data, nie możemy shufflować i wziąć randomowych obserwacji do testów. Zamiast tego zbiorem testowym będzie ostatni 5% danych w time-seriesie.

In [8]:
times = sorted(main_df.index.values)  # get the times
print(times)
last_5pct = sorted(main_df.index.values)[-int(0.05*len(times))]  # get the last 5% of the times
# lista wartosci times, z ostatnich najpozniejszych 5%
validation_main_df = main_df[(main_df.index >= last_5pct)]  # make the validation data where the index is in the last 5%
main_df = main_df[(main_df.index < last_5pct)]  # now the main_df is all the data up to the last 5%

[1528968720, 1528968780, 1528968840, 1528968900, 1528968960, 1528969020, 1528969080, 1528969140, 1528969200, 1528969260, 1528969320, 1528969380, 1528969440, 1528969500, 1528969560, 1528969620, 1528969680, 1528969740, 1528969800, 1528969860, 1528969920, 1528969980, 1528970040, 1528970100, 1528970160, 1528970220, 1528970280, 1528970340, 1528970400, 1528970460, 1528970520, 1528970580, 1528970640, 1528970700, 1528970760, 1528970820, 1528970880, 1528970940, 1528971000, 1528971060, 1528971120, 1528971180, 1528971240, 1528971300, 1528971360, 1528971420, 1528971480, 1528971540, 1528971600, 1528971660, 1528971720, 1528971780, 1528971840, 1528971900, 1528971960, 1528972020, 1528972080, 1528972140, 1528972200, 1528972260, 1528972320, 1528972380, 1528972440, 1528972500, 1528972560, 1528972620, 1528972680, 1528972740, 1528972800, 1528972860, 1528972920, 1528972980, 1528973040, 1528973100, 1528973160, 1528973220, 1528973280, 1528973340, 1528973400, 1528973460, 1528973520, 1528973580, 1528973640, 152

In [9]:
def preprocess_df(df):
    df = df.drop("future", 1)  # don't need this anymore. 
    # wykorzystalismy to do stworzenia kolumny target, ale teraz future jest juz bezuzyteczne, wiec to dropimy

    # normalizacja wartosci i ich przeskalowanie do [0,1]
    for col in df.columns:  # go through all of the columns
        if col != "target":  # normalize all ... except for the target itself!
            df[col] = df[col].pct_change()  # pct change "normalizes" the different currencies (each crypto coin has vastly diff values, we're really more interested in the other coin's movements)
            df.dropna(inplace=True)  # remove the nas created by pct_change
            df[col] = preprocessing.scale(df[col].values)  # scale between 0 and 1.

    df.dropna(inplace=True)  # cleanup again... jic. Those nasty NaNs love to creep in.

    ################## stworzenie sekwencji

    sequential_data = []  # this is a list that will CONTAIN the sequences
    prev_days = deque(maxlen=SEQ_LEN)  # These will be our actual sequences. They are made with deque, which keeps the maximum length by popping out older values as new ones come in
    # deque iteruje po liscie tak ze dla listy [1,2,3,4,5,6] i wart 3, pierw zwrociloby [1,2,3], 
    # potem [2,3,4], potem [3,4,5] itd
    
    for i in df.values:  # iterate over the values
        prev_days.append([n for n in i[:-1]])  # store all but the target
        if len(prev_days) == SEQ_LEN:  # make sure we have 60 sequences!
            sequential_data.append([np.array(prev_days), i[-1]])  # append those bad boys!

    random.shuffle(sequential_data)  # shuffle for good measure.

    ################## jezeli przewiduje ze cena wzrosnie to chcemy kupic wiec dodajemy sekwencje do buys
    buys = []  # list that will store our buy sequences and targets
    sells = []  # list that will store our sell sequences and targets

    for seq, target in sequential_data:  # iterate over the sequential data
        if target == 0:  # if it's a "not buy"
            sells.append([seq, target])  # append to sells list
        elif target == 1:  # otherwise if the target is a 1...
            buys.append([seq, target])  # it's a buy!

    random.shuffle(buys)  # shuffle the buys
    random.shuffle(sells)  # shuffle the sells!

    lower = min(len(buys), len(sells))  # what's the shorter length?  
    # rowne wielkosci zbiorow, zeby uchronic sie przed covariate shift

    buys = buys[:lower]  # make sure both lists are only up to the shortest length.
    sells = sells[:lower]  # make sure both lists are only up to the shortest length.

    sequential_data = buys+sells  # add them together
    random.shuffle(sequential_data)  # another shuffle, so the model doesn't get confused with all 1 class then the other.
    # mieszamy zeby nie uczyl sie zle, bo najpierw same 1 a potem same 0
    #############
    X = []
    y = []

    for seq, target in sequential_data:  # going over our new sequential data
        X.append(seq)  # X is the sequences
        y.append(target)  # y is the targets/labels (buys vs sell/notbuy)

    return np.array(X), y  # return X and y...and make X a numpy array! ..import numpy as np
    

In [10]:
train_x, train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)

In [11]:
print(f"train data: {len(train_x)} validation: {len(validation_x)}")
print(f"Dont buys: {train_y.count(0)}, buys: {train_y.count(1)}")
print(f"VALIDATION Dont buys: {validation_y.count(0)}, buys: {validation_y.count(1)}")

train data: 77922 validation: 3860
Dont buys: 38961, buys: 38961
VALIDATION Dont buys: 1930, buys: 1930


In [12]:
print(train_x[0])

[[-8.06472085e-04 -7.92317097e-02  3.11084780e-01 -7.98350339e-03
   2.92385543e-03 -5.79757423e-03  9.29444172e-01 -5.44231813e-02]
 [-2.44480036e+00  1.34118654e-01 -1.68557505e+00 -1.89664549e-03
  -3.53170065e+00 -2.08625071e-03 -3.18257786e+00  1.22828681e-01]
 [-2.70926122e-03 -9.06593680e-02 -1.50078297e-01 -8.10438507e-03
   2.92385543e-03 -5.83115486e-03 -2.51539274e-01 -4.97767888e-02]
 [ 1.09632013e-03 -3.73762461e-02  4.65582546e-01 -5.82669487e-03
   2.92385543e-03 -5.88631633e-03 -1.01962138e+00 -1.15848084e-02]
 [-8.06472085e-04 -4.17274883e-02 -4.57724040e-01 -7.50388614e-03
   1.17204291e+00 -5.07961481e-03  9.65445079e-01 -5.34357879e-02]
 [-1.41728355e+00 -8.75507710e-02  3.79535435e-03 -6.92919004e-03
  -3.96037779e-01 -5.81436389e-03 -9.55733525e-01 -5.27325931e-02]
 [-2.71135959e-03  2.95278265e-02 -6.11956819e-01 -7.63504391e-03
   7.42091039e-01 -5.81737841e-03  3.63856525e-02 -2.89349363e-02]
 [-8.06472085e-04 -3.74528513e-02  6.20024313e-01 -7.82012570e-03
   

In [25]:
EPOCHS = 4  # how many passes through our data
BATCH_SIZE = 2048  # how many batches? Try smaller batch if you're getting OOM (out of memory) errors.
NAME = f"{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"  # a unique name for the model

In [26]:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.LSTM(128, activation='tanh', input_shape=(train_x.shape[1:]), return_sequences=True))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.BatchNormalization())  #normalizes activation outputs, same reason you want to normalize your input data.

model.add(tf.keras.layers.LSTM(128, activation='tanh', return_sequences=True))
model.add(tf.keras.layers.Dropout(0.1))
model.add(tf.keras.layers.BatchNormalization())

model.add(tf.keras.layers.LSTM(128, activation='tanh'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.BatchNormalization())

model.add(tf.keras.layers.Dense(32, activation='relu'))
model.add(tf.keras.layers.Dropout(0.2))

model.add(tf.keras.layers.Dense(2, activation='softmax'))

opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)

# Compile model
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy']
)

tensorboard = tf.keras.callbacks.TensorBoard(log_dir="logs/{}".format(NAME))

filepath = "RNN_Final-{epoch:02d}-{val_acc:.3f}"  # unique file name that will include the epoch and the validation acc for that epoch
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    "models/{}.model".format(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')) # saves only the best ones

In [28]:
# Train model
history = model.fit(
    train_x, train_y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(validation_x, validation_y),
    callbacks=[tensorboard, checkpoint],
)

Train on 77922 samples, validate on 3860 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [29]:
# Score model
score = model.evaluate(validation_x, validation_y, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
# Save model
model.save("models/{}".format(NAME))

Test loss: 0.6919331746397859
Test accuracy: 0.5158031087465237
