<a href="https://colab.research.google.com/github/Strata-Tech/Crypto_prediction/blob/main/Crypto_price_prediction_using_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

In [2]:
df=pd.read_csv('/content/drive/MyDrive/crypto_data/LTC-USD.csv',names=['time','low','high','open','close','volume'])
               


In [3]:
df.head()

Unnamed: 0,time,low,high,open,close,volume
0,1528968660,96.580002,96.589996,96.589996,96.580002,9.6472
1,1528968720,96.449997,96.669998,96.589996,96.660004,314.387024
2,1528968780,96.470001,96.57,96.57,96.57,77.129799
3,1528968840,96.449997,96.57,96.57,96.5,7.216067
4,1528968900,96.279999,96.540001,96.5,96.389999,524.539978


In [4]:
#our 4 datasets have prices for Bitcoin, Litecoin, Ethereum, and Bitcoin Cash.
#combining the 4 datasets together

main_df=pd.DataFrame()
ratios = ["BTC-USD", "LTC-USD", "BCH-USD", "ETH-USD"]  # the 4 ratios we want to consider
for ratio in ratios:
  dataset=f"/content/drive/MyDrive/crypto_data/{ratio}.csv"
  df = pd.read_csv(dataset, names=['time', 'low', 'high', 'open', 'close', 'volume']) 

  # rename volume and close to include the ticker so we can still which close/volume is which:
  df.rename(columns={"close": f"{ratio}_close", "volume": f"{ratio}_volume"}, inplace=True)

  df.set_index("time", inplace=True)  # set time as index so we can join them on this shared time
  df = df[[f"{ratio}_close", f"{ratio}_volume"]]  # ignore the other columns besides price and volume

  if len(main_df)==0:
    main_df=df

  else:
    main_df=main_df.join(df)

main_df.fillna(method="ffill", inplace=True)  # if there are gaps in data, use previously known values
main_df.dropna(inplace=True)
print(main_df.head())
  

            BTC-USD_close  BTC-USD_volume  ...  ETH-USD_close  ETH-USD_volume
time                                       ...                               
1528968720    6487.379883        7.706374  ...      486.01001       26.019083
1528968780    6479.410156        3.088252  ...      486.00000        8.449400
1528968840    6479.410156        1.404100  ...      485.75000       26.994646
1528968900    6479.979980        0.753000  ...      486.00000       77.355759
1528968960    6480.000000        1.490900  ...      486.00000        7.503300

[5 rows x 8 columns]


In [5]:
#creating a target , if 1 means future price will be higher hence should buy, 0 means lower

Knowing how far out we want to predict probably also depends how long our sequences are. If our sequence length is 3 (so...3 minutes), we probably can't easily predict out 10 minutes. If our sequence length is 300, 10 might not be as hard. I'd like to go with a sequence length of 60, and a future prediction out of 3. We could also make the prediction a regression question, using a linear activation with the output layer, but, instead, we are going to  go with a binary classification.

If price goes up in 3 minutes, then it's a buy. If it goes down in 3 minutes, not buy/sell. 

In [6]:
SEQ_LEN = 60  # how long of a preceeding sequence to collect for RNN
FUTURE_PERIOD_PREDICT = 3  # how far into the future are we trying to predict?
RATIO_TO_PREDICT = "LTC-USD"

In [7]:
def classify(current, future):
    if float(future) > float(current):
        return 1
    else:
        return 0

In [8]:
#.shift will just shift the columns for us, a negative shift will shift them "up." 
#So shifting up 3 will give us the price 3 minutes in the future, and 
#we're just assigning this to a new column.
main_df['future'] = main_df[f'{RATIO_TO_PREDICT}_close'].shift(-FUTURE_PERIOD_PREDICT)

In [9]:
#e've got the future values, we can use them to 
#make a target using the function we made above.
main_df['target'] = list(map(classify, main_df[f'{RATIO_TO_PREDICT}_close'], main_df['future']))

In [10]:
main_df.head()

Unnamed: 0_level_0,BTC-USD_close,BTC-USD_volume,LTC-USD_close,LTC-USD_volume,BCH-USD_close,BCH-USD_volume,ETH-USD_close,ETH-USD_volume,future,target
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1528968720,6487.379883,7.706374,96.660004,314.387024,870.859985,26.856577,486.01001,26.019083,96.389999,0
1528968780,6479.410156,3.088252,96.57,77.129799,870.099976,1.1243,486.0,8.4494,96.519997,0
1528968840,6479.410156,1.4041,96.5,7.216067,870.789978,1.749862,485.75,26.994646,96.440002,0
1528968900,6479.97998,0.753,96.389999,524.539978,870.0,1.6805,486.0,77.355759,96.470001,1
1528968960,6480.0,1.4909,96.519997,16.991997,869.98999,1.669014,486.0,7.5033,96.400002,0


Normalizing and creating sequences for our data


In [None]:
#Have to seperate validation/out of sample data
#data here is sequential cannot use shuffle
#Have to slice our validation while in order
#take last 5% as validation data

In [11]:
#.values gives an array
# get the times,.index references the index, .values gets a numpy array
times = sorted(main_df.index.values) 
# get the last 5% of the times
last_5pct = sorted(main_df.index.values)[-int(0.05*len(times))]  
# make the validation data where the index is in the last 5%
validation_main_df = main_df[(main_df.index >= last_5pct)] 

we need to balance and normalize this data. By balance, we want to make sure the classes have equal amounts when training, so our model doesn't just always predict one class.

Can use class weights, which allows you to weigh loss higher for lesser-frequent classifications.

In [12]:
from sklearn import preprocessing

In [25]:
def preprocess_df(df):
    df = df.drop("future", 1)  # don't need this anymore.

    for col in df.columns:  # go through all of the columns
        if col != "target":  # normalize all ... except for the target itself!
            df[col] = df[col].pct_change()  # pct change "normalizes" the different currencies (each crypto coin has vastly diff values, we're really more interested in the other coin's movements)
            df.dropna(inplace=True)  # remove the nas created by pct_change
            df[col] = preprocessing.scale(df[col].values)  # scale between 0 and 1.

    df.dropna(inplace=True)  # cleanup again... jic.


    sequential_data = []  # this is a list that will CONTAIN the sequences
    prev_days = deque(maxlen=SEQ_LEN)  # These will be our actual sequences. They are made with deque, which keeps the maximum length by popping out older values as new ones come in

    for i in df.values:  # iterate over the values
        prev_days.append([n for n in i[:-1]])  # store all but the target
        if len(prev_days) == SEQ_LEN:  # make sure we have 60 sequences!
            sequential_data.append([np.array(prev_days), i[-1]])  # append those bad boys!

    random.shuffle(sequential_data)  # shuffle for good measure.

    buys = []  # list that will store our buy sequences and targets
    sells = []  # list that will store our sell sequences and targets

    for seq, target in sequential_data:  # iterate over the sequential data
        if target == 0:  # if it's a "not buy"
            sells.append([seq, target])  # append to sells list
        elif target == 1:  # otherwise if the target is a 1...
            buys.append([seq, target])  # it's a buy!

    random.shuffle(buys)  # shuffle the buys
    random.shuffle(sells)  # shuffle the sells!

    lower = min(len(buys), len(sells))  # what's the shorter length?

    buys = buys[:lower]  # make sure both lists are only up to the shortest length.
    sells = sells[:lower]  # make sure both lists are only up to the shortest length.

    sequential_data = buys+sells  # add them together
    random.shuffle(sequential_data)  # another shuffle, so the model doesn't get confused with all 1 class then the other.

    X = []
    y = []

    for seq, target in sequential_data:  # going over our new sequential data
        X.append(seq)  # X is the sequences
        y.append(target)  # y is the targets/labels (buys vs sell/notbuy)

    return np.array(X), y  # return X and y...and make X a numpy array!
  

After scaling and normalizing our data, we need to create actual sequences

In [20]:
from collections import deque
import numpy as np
import random

In [21]:
# this is a list that will CONTAIN the sequences
sequential_data = []  
# These will be our actual sequences. They are made with deque, 
#which keeps the maximum length by popping out older values as 
#new ones come in
prev_days = deque(maxlen=SEQ_LEN) 

for i in df.values:
  #store all but the target
  # n is a list in a list till the last i
  prev_days.append([n for n in i[:-1]])
  if len(prev_days) == SEQ_LEN:  # make sure we have 60 sequences
            sequential_data.append([np.array(prev_days), i[-1]])
  
random.shuffle(sequential_data)  





Our data now has sequences,normalized, and scaled

In [23]:
#Balancing the sequence data
# list that will store our buy sequences and targets
buys = []
 # list that will store our sell sequences and targets
sells = [] 

for seq,target in sequential_data:
  #if its a "dun buy"
  if target==0:
    sells.append([seq,target])

  elif target==1:
    buys.append([seq,target])

# shuffle the buys
random.shuffle(buys) 
# shuffle the sells 
random.shuffle(sells)  

# finding out which has the shorter length("buy" vs "dun buy")
lower = min(len(buys), len(sells))

#make sure both lists are only up to the shortest length
buys = buys[:lower]
sells = sells[:lower]

#combining the 2
sequential_data = buys+sells 
random.shuffle(sequential_data)

X = []
y = []

for seq, target in sequential_data:
  X.append(seq)
  y.append(target)
return np.array(X), y


[array([[497.399994],
        [497.899994],
        [498.      ],
        [498.      ],
        [498.      ],
        [498.      ],
        [498.      ],
        [498.      ],
        [498.      ],
        [498.      ],
        [498.      ],
        [497.98999 ],
        [498.      ],
        [498.      ],
        [498.      ],
        [498.      ],
        [497.98999 ],
        [497.600006],
        [498.      ],
        [498.      ],
        [498.      ],
        [498.      ],
        [498.      ],
        [498.690002],
        [498.839996],
        [498.98999 ],
        [498.98999 ],
        [498.98999 ],
        [499.      ],
        [499.      ],
        [499.      ],
        [498.5     ],
        [497.709991],
        [497.899994],
        [497.899994],
        [497.899994],
        [497.929993],
        [498.440002],
        [498.440002],
        [498.440002],
        [498.380005],
        [498.190002],
        [498.100006],
        [498.109985],
        [498.109985],
        [4

In [27]:
train_x, train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)

In [28]:
print(f"train data: {len(train_x)} validation: {len(validation_x)}")
print(f"Dont buys: {train_y.count(0)}, buys: {train_y.count(1)}")
print(f"VALIDATION Dont buys: {validation_y.count(0)}, buys: {validation_y.count(1)}")

train data: 81812 validation: 3860
Dont buys: 40906, buys: 40906
VALIDATION Dont buys: 1930, buys: 1930
