In [1]:
import pandas as pd
import pandas_ta as ta
import numpy as np
import os
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, cross_validate
import joblib
from sklearn.metrics import accuracy_score, make_scorer


# Define Class

## openFile() - get raw data

In [2]:
class openFile:
    def __init__(self, data_path, asset, interval):
        self.data_path = data_path
        self.asset = asset
        self.interval = interval
        self.data = self.open_csv()


    def open_csv(self):
        filename = self.asset+"-"+self.interval+"-raw.csv"
        file_path = os.path.join(self.data_path, filename)
        df = pd.read_csv(os.path.abspath(file_path))
        df['datetime']=pd.to_datetime(df["openT"], utc=True, unit="ms")
        df.set_index(pd.DatetimeIndex(df["datetime"]), inplace=True)
        return df

## getFeatures & getTarget

In [3]:

class getFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, length, factor):
        '''
        Custom transformer to get the features

        Argument:
        * length    :   period used to calculate the indicators
        * factor    :   factor to be used to calculate the indicators on (length*factor) lengths
        '''
        self.length=length
        self.factor=factor

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        length_lead = self.length
        length_lag = self.length*2

        # calculate features
        C_=pd.DataFrame()
        for i in range(1,self.factor):
            length_lead = int(self.length*i)
            length_lag = length_lead*2
            C_['mom'+str(i)] = ta.mom(X, length=length_lead)
            C_['rsi'+str(i)] = ta.rsi(X, length=length_lead)
            C_['trix'+str(i)] = ta.trix(X, length=length_lead).iloc[:,0]
            C_['macd'+str(i)] = ta.macd(X, fast=length_lead, slow=length_lag).iloc[:,0]

        return C_
    
class getTarget(BaseEstimator, TransformerMixin):
    def __init__(self, ema_length, diff_length, pct_threshold=0.2, pct_threshold_activate=True):
        '''
        Custom transformer to get the target.

        Argument:
        * ema_length    :   period used to calculate the exponential moving average
        * diff_length   :   period used to calculate the pct_change in the future
        * pct_threshold :   quantile to be considered for the '1' label
        * pct_threshold_activate    :   True to return a binary 0/1 list
        '''
        self.ema_length=ema_length
        self.diff_length=diff_length
        self.pct_threshold_activate = pct_threshold_activate
        self.pct_threshold = pct_threshold

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        # calculate target
        C_=pd.DataFrame()
        C_['ema'] = ta.ema(X, length=self.ema_length)
        C_['target'] = C_['ema'].pct_change(-self.diff_length) # if <0, price is likely to pump - if >0, price is likely to dump

        # create the target
        y_ = C_['target']
        if self.pct_threshold_activate:
            threshold = y_.quantile(self.pct_threshold) # define the quantile which will be used for the 1 threshold
            y_=y_.apply(lambda x: 1 if x<threshold else 0)

        return y_, threshold

## Entry price calculation

In [4]:
def get_entry_score(y_pred, price_serie):
    '''
    This function calculation the average entry price in the market with a standard daily dca vs a machine learning dca.
    The purpose is to assess the performance of the model.

    y_pred: pandas Series with datime index and binary prediction
    price_serie: pandas Series with datetime index and price action
    '''
    dca_daily_entry_price = price_serie.mean() # average entry price for a daily buy order

    index_1 = np.where(y_pred==1)[0]
    
    if len(index_1)>0:
        dca_tdbot_entry_price = price_serie.iloc[index_1].mean() # average entry price for a buy order on the basis of the trading bot signal
        entry_ratio = dca_tdbot_entry_price / dca_daily_entry_price - 1 # If <0, this ratio means that the entry price was better than doing a random daily investment
    else:
        dca_tdbot_entry_price = np.nan
        entry_ratio = np.nan
    
    return entry_ratio


# Create code

In [5]:
# OPEN DATA
open_csv = openFile('../data/raw', 'BTCUSDT', '4h')
df = open_csv.data
df.head()

Unnamed: 0_level_0,openT,open,high,low,close,baseVol,closeT,quoteVol,nbTrade,takerBaseVol,takerQuoteVol,datetime
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2017-08-17 04:00:00+00:00,1502942400000,4261.48,4349.99,4261.32,4349.99,82.088865,1502956799999,353194.3,334,64.013727,275647.421911,2017-08-17 04:00:00+00:00
2017-08-17 08:00:00+00:00,1502956800000,4333.32,4485.39,4333.32,4427.3,63.619882,1502971199999,282501.2,248,58.787633,261054.051154,2017-08-17 08:00:00+00:00
2017-08-17 12:00:00+00:00,1502971200000,4436.06,4485.39,4333.42,4352.34,174.562001,1502985599999,774238.8,858,125.184133,555419.758061,2017-08-17 12:00:00+00:00
2017-08-17 16:00:00+00:00,1502985600000,4352.33,4354.84,4200.74,4325.23,225.109716,1502999999999,965291.1,986,165.036363,707808.200922,2017-08-17 16:00:00+00:00
2017-08-17 20:00:00+00:00,1503000000000,4307.56,4369.69,4258.56,4285.08,249.769913,1503014399999,1079545.0,1001,203.226685,878286.968557,2017-08-17 20:00:00+00:00


In [6]:
# PREPROCESSING FOR TRAINING
f_length=7
f_factor=10
t_ema_length = 7
t_diff_length = 14
t_pct_threshold=0.1 # threshold triggering the target

X_transform = getFeatures(f_length, f_factor)
X = X_transform.fit_transform(df.close)

y_transform = getTarget(t_ema_length, t_diff_length, t_pct_threshold, True)
y, threshold = y_transform.fit_transform(df.close)

index_to_drop = X[(X.isna().any(axis=1)) | (y.isna())].index
X=X.drop(index_to_drop)
y=y.drop(index_to_drop)
price_serie = df.close.drop(index_to_drop)

print("X shape: ", X.shape)
print("y shape: ", y.shape)
print("price_serie shape: ", price_serie.shape)
print("pct_threshold shape: ", t_pct_threshold)
print("threshold shape: ", threshold)

X shape:  (14106, 36)
y shape:  (14106,)
price_serie shape:  (14106,)
pct_threshold shape:  0.1
threshold shape:  -0.05908447588419685


In [7]:
# CROSS VALIDATION - ROI

# Initialize TimeSeriesSplit
tspl = TimeSeriesSplit(n_splits=10)

scores=[]

for train, test in tspl.split(X):
    print("%s %s - %s %s" % (len(train), len(test), train[-1], test[-1]))
    # Initialize KNeighborsRegressor
    model = KNeighborsClassifier(n_neighbors=30)

    # split train/test data
    X_train = X.iloc[train]
    y_train = y.iloc[train]
    X_test = X.iloc[test]
    y_test = y.iloc[test]
    price_test = price_serie.iloc[test]

    # train
    model.fit(X_train.values, y_train)

    # pred
    y_test_pred = model.predict(X_test.values)

    ratio = get_entry_score(y_test_pred, price_test)
    scores.append(ratio)

print("Cross-validation scores:", scores)
print("Mean:", np.mean(scores))  # Take the mean of the negative scores
print("std:", np.std(scores))  # Take the mean of the negative scores

1286 1282 - 1285 2567
2568 1282 - 2567 3849
3850 1282 - 3849 5131
5132 1282 - 5131 6413
6414 1282 - 6413 7695
7696 1282 - 7695 8977
8978 1282 - 8977 10259
10260 1282 - 10259 11541
11542 1282 - 11541 12823
12824 1282 - 12823 14105
Cross-validation scores: [0.0522073210495233, 0.24739743680790593, 0.12987510613191433, -0.03936658502178014, 0.06836952363862747, 0.027224706824707123, 0.11745007504393845, -0.04593816855902255, -0.0768093223494496, -0.019287625929229613]
Mean: 0.04611224676371347
std: 0.0941685035984869


In [8]:
# CROSS VALIDATION
# Initialize KNeighborsRegressor
model = KNeighborsClassifier(n_neighbors=30)

# Initialize TimeSeriesSplit
tspl = TimeSeriesSplit(n_splits=10, test_size=1000)

# Perform cross_validation
scores = cross_val_score(model, X.values, y.values, cv=tspl, scoring='accuracy')

print("Cross-validation scores:", scores)
print("Mean:", scores.mean())  # Take the mean of the negative scores
print("std:", scores.std())  # Take the mean of the negative scores

Cross-validation scores: [0.922 0.863 0.919 0.772 0.814 0.915 0.937 0.9   0.969 0.946]
Mean: 0.8956999999999999
std: 0.058549210071528725


In [9]:
# TRAINING
test_size=1000

# split train/test
X_train = X.iloc[:-test_size]
y_train = y.iloc[:-test_size]
X_test = X.iloc[-test_size:]
y_test = y.iloc[-test_size:]
print("X_train shape: ", X_train.shape)
print("y_train shape: ", y_train.shape)
print("X_test shape: ", X_test.shape)
print("y_test shape: ", y_test.shape)

# Initialize KNeighborsRegressor
params = {'n_neighbors': 30}
print(params)
model = KNeighborsClassifier(**params)
#print("X_train_shape: ", X_train.shape)
model.fit(X_train.values, y_train)

# pred
y_test_pred = model.predict(X_test.values)
y_test_pred = pd.Series(y_test_pred, index=y_test.index)

# accuracy
acc=accuracy_score(y_test, y_test_pred)
print("accuracy: ", acc)

X_train shape:  (13106, 36)
y_train shape:  (13106,)
X_test shape:  (1000, 36)
y_test shape:  (1000,)
{'n_neighbors': 30}
accuracy:  0.946


# Sandpit

## Quick data analysis

In [10]:
# target distribution
y.value_counts()

target
0    12702
1     1404
Name: count, dtype: int64

In [11]:
size = len(y)

y_temp = y.iloc[-size:]

dca_daily = df.loc[y_temp.index].close.mean()
dca_bot = df.loc[y_temp[y_temp==1].index].close.mean()
print(dca_daily)
print(dca_bot)

21510.545256628386
18924.680341880343


## TimeSeriesSplit

In [12]:
# test1
print('test1: n_splits=10')
tscv = TimeSeriesSplit(n_splits=10)
for train, test in tscv.split(X):
    print("%s %s - %s %s" % (len(train), len(test), train[-1], test[-1]))

# test2
print('\ntest2: n_splits=1, test_size=1000')
tscv = TimeSeriesSplit(n_splits=10, test_size=1000)
for train, test in tscv.split(X):    
    test=test
    break
    print("%s %s - %s %s" % (len(train), len(test), train[-1], test[-1]))

test1: n_splits=10
1286 1282 - 1285 2567
2568 1282 - 2567 3849
3850 1282 - 3849 5131
5132 1282 - 5131 6413
6414 1282 - 6413 7695
7696 1282 - 7695 8977
8978 1282 - 8977 10259
10260 1282 - 10259 11541
11542 1282 - 11541 12823
12824 1282 - 12823 14105

test2: n_splits=1, test_size=1000
