In [35]:
import pandas as pd
import os
from sklearn import preprocessing
from collections import deque
import random
import numpy as np

In [2]:
# Accumulator for the merged price/volume data; starts with no rows and no columns.
main_df = pd.DataFrame()

In [3]:
# Currency pairs to process, one entry per pair.
ratios = [
    "BTC-USD",
    "LTC-USD",
    "ETH-USD",
    "BCH-USD",
]


In [4]:
# Load every pair's CSV and merge the close/volume columns into one frame
# indexed by timestamp.
# Reset the accumulator here so this cell can be re-run on its own: joining
# onto an already-populated main_df would fail on the overlapping column names.
main_df = pd.DataFrame()
for ratio in ratios:
    print(ratio)
    dataset = f'crypto_data/{ratio}.csv'  # path to this pair's file
    # Column names are supplied explicitly, so the first line of the file is read as data.
    df = pd.read_csv(dataset, names=['time', 'low', 'high', 'open', 'close', 'volume'])

    # Prefix close/volume with the pair name so columns stay distinct after joining,
    # index by time, and keep only those two columns.
    df = df.rename(columns={"close": f"{ratio}_close", "volume": f"{ratio}_volume"})
    df = df.set_index("time")
    df = df[[f"{ratio}_close", f"{ratio}_volume"]]

    if len(main_df) == 0:
        # Nothing loaded yet: the current frame becomes the base.
        main_df = df
    else:
        # Default join is a left join on the index, so the first pair's timestamps drive the result.
        main_df = main_df.join(df)

# Forward-fill gaps with the last known value, then drop rows that still contain NaN
# (rows before a column's first known value).
# .ffill() replaces fillna(method="ffill"), which is deprecated in recent pandas.
main_df = main_df.ffill().dropna()

BTC-USD
LTC-USD
ETH-USD
BCH-USD


In [5]:
# Number of consecutive rows per input sequence (the original note reads "60min long").
SEQ_LEN = 60
# How many rows ahead the 'future' price is taken from (the original note reads "3mins").
FUTURE_PERIOD_PREDICT = 3
# Pair whose close price is used to build the future/target columns.
RATIO_TO_PREDICT = "LTC-USD"

In [27]:
def classify(current, future):
    """Return 1 when ``future`` is strictly greater than ``current``, otherwise 0.

    Both values are converted with float() before the comparison, so numeric
    strings are accepted. A NaN on either side compares as False and yields 0.
    """
    return int(float(future) > float(current))

    

In [7]:
# 'future' holds the close price FUTURE_PERIOD_PREDICT rows later; shifting by a
# negative amount pulls later rows up, leaving NaN in the final rows.
main_df['future'] = main_df[f'{RATIO_TO_PREDICT}_close'].shift(periods=-FUTURE_PERIOD_PREDICT)
print(main_df.head())

            BTC-USD_close  BTC-USD_volume  LTC-USD_close  LTC-USD_volume  \
time                                                                       
1528968720    6487.379883        7.706374      96.660004      314.387024   
1528968780    6479.410156        3.088252      96.570000       77.129799   
1528968840    6479.410156        1.404100      96.500000        7.216067   
1528968900    6479.979980        0.753000      96.389999      524.539978   
1528968960    6480.000000        1.490900      96.519997       16.991997   

            ETH-USD_close  ETH-USD_volume  BCH-USD_close  BCH-USD_volume  \
time                                                                       
1528968720      486.01001       26.019083     870.859985       26.856577   
1528968780      486.00000        8.449400     870.099976        1.124300   
1528968840      485.75000       26.994646     870.789978        1.749862   
1528968900      486.00000       77.355759     870.000000        1.680500   
1528968960 

In [8]:
# Label each row by comparing its current close with its 'future' price (1 = future is higher).
main_df['target'] = [
    classify(current_price, future_price)
    for current_price, future_price in zip(main_df[f'{RATIO_TO_PREDICT}_close'], main_df['future'])
]


In [9]:
# Sanity check: show current close, future price and the derived label side by side.
print(main_df.loc[:, [f'{RATIO_TO_PREDICT}_close', "future", "target"]].head())

            LTC-USD_close     future  target
time                                        
1528968720      96.660004  96.389999       0
1528968780      96.570000  96.519997       0
1528968840      96.500000  96.440002       0
1528968900      96.389999  96.470001       1
1528968960      96.519997  96.400002       0


In [11]:
# Collect the index values (timestamps) and put them in ascending order.
times = list(main_df.index.values)
times.sort()

In [13]:
# Timestamp at which the validation split starts: the first of the last 5% of `times`.
# max(1, ...) guards the small-data case: with fewer than 20 timestamps
# int(0.05 * len(times)) is 0, and times[-0] is times[0], which would put every
# row into the validation split.
last_5pct = times[-max(1, int(0.05 * len(times)))]

1534922100


In [15]:
# Split on the cut-off timestamp: rows at or after last_5pct become the validation
# set, everything before it stays in main_df. Both selections are evaluated against
# the full main_df before either name is rebound.
validation_main_df, main_df = (
    main_df.loc[main_df.index >= last_5pct],
    main_df.loc[main_df.index < last_5pct],
)

In [41]:
#Normalize and scale data
def preprocess_df(df):
    """Return a normalised copy of ``df`` with the 'future' column removed.

    Every column except 'target' is converted to percent change and then
    standardised with ``preprocessing.scale`` (zero mean, unit variance; not
    a 0-1 range). Rows that end up with NaN are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'future' column, a 'target' column and the
        numeric feature columns. The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The processed frame (previously nothing was returned, so the result
        of the processing was lost).
    """
    # Use the `columns=` keyword: passing the axis positionally (drop('future', 1))
    # raises TypeError on pandas >= 2.0. drop() returns a new frame, so the
    # caller's data stays untouched.
    df = df.drop(columns='future')
    for col in df.columns:
        if col != "target":  # the label must stay as-is
            # pct_change puts every series on a comparable relative scale.
            # A previous value of 0 yields +/-inf, which scale() rejects, so
            # those are turned into NaN and removed with the other gaps.
            df[col] = df[col].pct_change().replace([np.inf, -np.inf], np.nan)
            df.dropna(inplace=True)  # remove the NaN rows created above
            df[col] = preprocessing.scale(df[col].values)
    df.dropna(inplace=True)  # final clean-up in case any NaN remains
    return df
# Run the preprocessing on the training portion of the data.
# NOTE(review): the return value is discarded here, and preprocess_df rebinds its
# local `df` to the result of df.drop(...) before changing anything, so main_df
# itself appears to be left unmodified by this call. Confirm the result is captured where needed.
preprocess_df(main_df)
    


In [40]:
    # NOTE(review): this cell is indented like a function body and reads `df`, but no
    # enclosing `def` is present in the cell. It looks like the continuation of
    # preprocess_df. Confirm where it belongs before running it on its own.
    sequential_data = [] # collects [window, label] pairs
    prev_days = deque(maxlen=SEQ_LEN) # sliding window: once SEQ_LEN rows are held, appending drops the oldest one
    for i in df.values:
        prev_days.append([n for n in i[:-1]]) # every value of the row except the last column (treated as the target)
        if len(prev_days) == SEQ_LEN:  # only emit a sample once the window is full
            sequential_data.append([np.array(prev_days) , i[-1]]) # pair the window (as an array) with the current row's last column
    random.shuffle(sequential_data)  # in-place shuffle; no seed is set in the visible cells, so the order differs between runs
            
        