In [2]:
import pandas as pd
import pandas_ta as ta
import os
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
import joblib


In [59]:
class openFile:
    def __init__(self, data_path, asset, interval):
        self.data_path = data_path
        self.asset = asset
        self.interval = interval
        self.data = self.open_csv()


    def open_csv(self):
        filename = self.asset+"-"+self.interval+"-raw.csv"
        file_path = os.path.join(self.data_path, filename)
        df = pd.read_csv(os.path.abspath(file_path))
        df['datetime']=pd.to_datetime(df["openT"], utc=True, unit="ms")
        df.set_index(pd.DatetimeIndex(df["datetime"]), inplace=True)
        return df


In [168]:
open_csv = openFile('../data/raw', 'BTCUSDT', '4h')
df = open_csv.data
df.head()

Unnamed: 0_level_0,openT,open,high,low,close,baseVol,closeT,quoteVol,nbTrade,takerBaseVol,takerQuoteVol,datetime
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2017-08-17 04:00:00+00:00,1502942400000,4261.48,4349.99,4261.32,4349.99,82.088865,1502956799999,353194.3,334,64.013727,275647.421911,2017-08-17 04:00:00+00:00
2017-08-17 08:00:00+00:00,1502956800000,4333.32,4485.39,4333.32,4427.3,63.619882,1502971199999,282501.2,248,58.787633,261054.051154,2017-08-17 08:00:00+00:00
2017-08-17 12:00:00+00:00,1502971200000,4436.06,4485.39,4333.42,4352.34,174.562001,1502985599999,774238.8,858,125.184133,555419.758061,2017-08-17 12:00:00+00:00
2017-08-17 16:00:00+00:00,1502985600000,4352.33,4354.84,4200.74,4325.23,225.109716,1502999999999,965291.1,986,165.036363,707808.200922,2017-08-17 16:00:00+00:00
2017-08-17 20:00:00+00:00,1503000000000,4307.56,4369.69,4258.56,4285.08,249.769913,1503014399999,1079545.0,1001,203.226685,878286.968557,2017-08-17 20:00:00+00:00


In [169]:
class getFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, length, factor):
        '''
        Custom transformer to get the features

        Argument:
        * length    :   period used to calculate the indicators
        * factor    :   factor to be used to calculate the indicators on (length*factor) lengths
        '''
        self.length=length
        self.factor=factor

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        length_lead = self.length
        length_lag = self.length*2

        # calculate features
        C_=pd.DataFrame()
        for i in range(1,self.factor):
            length_lead = int(self.length*i)
            length_lag = length_lead*2
            C_['mom'+str(i)] = ta.mom(X, length=length_lead)
            C_['rsi'+str(i)] = ta.rsi(X, length=length_lead)
            C_['trix'+str(i)] = ta.trix(X, length=length_lead).iloc[:,0]
            C_['macd'+str(i)] = ta.macd(X, fast=length_lead, slow=length_lag).iloc[:,0]

        return C_
    
class getTarget(BaseEstimator, TransformerMixin):
    def __init__(self, length):
        '''
        Custom transformer to get the target

        Argument:
        * length    :   period used to calculate the indicators
        '''
        self.length=length

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        length_lead = self.length
        length_lag = self.length*2

        # calculate target
        C_=pd.DataFrame()
        C_['ema'] = ta.ema(X, length=length_lead)
        C_['target'] = C_['ema'].pct_change(-length_lag) # if <0, price is likely to pump - if >0, price is likely to dump

        # create the target
        y_ = C_['target']

        return y_


In [175]:

#--- PREPROCESSING FOR TRAINING
length=7
factor=10

X_transform = getFeatures(length, factor)
X = X_transform.fit_transform(df.close)

y_transform = getTarget(length)
y = y_transform.fit_transform(df.close)

index_to_drop = X[(X.isna().any(axis=1)) | (y.isna())].index
X_=X.drop(index_to_drop)
y_=y.drop(index_to_drop)
y_=y_.apply(lambda x: 1 if x<-0.05 else 0) # Transform float to binary categories (0/1). 1 = price increase >5% in the next 2*length 
print("X_ shape: ", X_.shape)
print("y_ shape: ", y_.shape)



X_ shape:  (14055, 36)
y_ shape:  (14055,)


In [199]:
# Initialize KNeighborsRegressor
model = KNeighborsClassifier(n_neighbors=30)

# Initialize TimeSeriesSplit
tspl = TimeSeriesSplit(n_splits=10, test_size=1000)

# Perform cross_validation
scores = cross_val_score(model, X_.values, y_.values, cv=tspl, scoring='accuracy')

print("Cross-validation scores:", scores)
print("Mean:", scores.mean())  # Take the mean of the negative scores
print("std:", scores.std())  # Take the mean of the negative scores

Cross-validation scores: [0.9   0.814 0.896 0.709 0.747 0.862 0.904 0.903 0.931 0.919]
Mean: 0.8584999999999999
std: 0.07271210353166797


In [206]:
# test1
print('test1: n_splits=10')
tscv = TimeSeriesSplit(n_splits=10)
for train, test in tscv.split(X_):
    print("%s %s" % (len(train), len(test)))

# test2
print('\ntest2: n_splits=1, test_size=1000')
tscv = TimeSeriesSplit(n_splits=2, test_size=1000)
for train, test in tscv.split(X_):
    print("%s %s" % (len(train), len(test)))

test1: n_splits=10
1285 1277
2562 1277
3839 1277
5116 1277
6393 1277
7670 1277
8947 1277
10224 1277
11501 1277
12778 1277

test2: n_splits=1, test_size=1000
12055 1000
13055 1000


In [201]:
1000/6

166.66666666666666