## Necessary Imports

In [1]:
import glob
import pickle
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Initialise plotly's offline notebook mode before any figure is drawn.
init_notebook_mode(connected=True)

## Inputs

In [2]:
# Folder that holds the historical price data.
# Alternative data set: 'data/historical_price_data/ADABTC'
data_folder_name = 'data/historical_price_data/BTCUSDT'

bar_type = 'time'            # bar type; possible values: dollar, time, ticks, volume
threshold = 60               # threshold for the chosen bar type
volatility_threshold = 10    # threshold in bars for volatility (standard deviation of returns)
v_bars_duration = 1          # threshold in bars for the vertical barriers of the triple-barrier method
barrier_conf = [1, 1]        # [0] = stop-loss limit, [1] = profit-taking limit
min_return = 0               # minimum return used by the triple-barrier method
visualize_plot = True        # the plotting cells only run when this flag is True

## Loading Necessary Classes

In [3]:
#loading and preprocessing
from load_data.loadData import LoadData
from preprocessing.preProcessData import PreProcessData
from csticks.createcandleStick import createCandleStick

#labeling
from labelling.labelgenerator import LabelGenerator


# Features
from feature.featureExtraction import FeatureExtraction
from feature.featureExtractionVisual import FVisual
from feature.featureVerifyVisual import VerifyFeature

## Object Creation

In [4]:
# Helper objects shared by the cells below.
ld_data = LoadData()             # used to load the raw price data from a folder
pp_data = PreProcessData()       # preprocessing helper
cstk_ob = createCandleStick()    # used to build the bars
lbl_ob = LabelGenerator()        # used for volatility, barriers and labels

## Load Data

In [5]:
# Load the raw price data, using a pickle file in the working directory as a cache.
# The cache name is derived from the last component of data_folder_name, so pointing
# the notebook at another symbol cannot silently reuse a stale cache. For the
# BTCUSDT folder the name is still 'MAIN_df_BTCUSDT'. (Assumes '/'-separated paths,
# as used in the Inputs cell.)
pickle_filename_main_df = 'MAIN_df_' + data_folder_name.rstrip('/').split('/')[-1]

try:
    # NOTE: unpickling can execute arbitrary code; only load cache files that this
    # notebook wrote itself.
    MAIN_df = pd.read_pickle(pickle_filename_main_df)

except (OSError, IOError) as e:
    # Cache miss: report the reason, rebuild from the raw files and store the result.
    print(e)
    MAIN_df = ld_data.load_data_dir(data_folder_name)
    # Assign a Series (not a one-column DataFrame) as the new 'Price' column.
    MAIN_df['Price'] = MAIN_df['Close']
    MAIN_df.to_pickle(pickle_filename_main_df)

MAIN_df.head()

[Errno 2] No such file or directory: 'MAIN_df_BTCUSDT'


Unnamed: 0,Date,Open,High,Low,Close,Volume,Price
0,2017-08-17 04:00:00.000000,4261.48,4261.48,4261.48,4261.48,1.775183,4261.48
1,2017-08-17 04:01:00.000000,4261.48,4261.48,4261.48,4261.48,0.0,4261.48
2,2017-08-17 04:02:00.000000,4280.56,4280.56,4280.56,4280.56,0.261074,4280.56
3,2017-08-17 04:03:00.000000,4261.48,4261.48,4261.48,4261.48,0.012008,4261.48
4,2017-08-17 04:04:00.000000,4261.48,4261.48,4261.48,4261.48,0.140796,4261.48


In [5]:
# NOTE(review): this cell repeats the load step of the cell above (it even carries the
# same execution count); consider deleting one of the two.
#
# The cache name is derived from the last component of data_folder_name, so pointing
# the notebook at another symbol cannot silently reuse a stale cache. For the
# BTCUSDT folder the name is still 'MAIN_df_BTCUSDT'.
pickle_filename_main_df = 'MAIN_df_' + data_folder_name.rstrip('/').split('/')[-1]

try:
    # NOTE: unpickling can execute arbitrary code; only load cache files that this
    # notebook wrote itself.
    MAIN_df = pd.read_pickle(pickle_filename_main_df)

except (OSError, IOError) as e:
    # Cache miss: report the reason, rebuild from the raw files and store the result.
    print(e)
    MAIN_df = ld_data.load_data_dir(data_folder_name)
    # Row-wise mean over the listed columns; with only 'Close' listed this equals Close.
    MAIN_df['Price'] = MAIN_df.loc[:, ['Close']].mean(axis=1)
    MAIN_df.to_pickle(pickle_filename_main_df)

MAIN_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Price
0,2017-08-17 04:00:00.000000,4261.48,4261.48,4261.48,4261.48,1.775183,4261.48
1,2017-08-17 04:01:00.000000,4261.48,4261.48,4261.48,4261.48,0.0,4261.48
2,2017-08-17 04:02:00.000000,4280.56,4280.56,4280.56,4280.56,0.261074,4280.56
3,2017-08-17 04:03:00.000000,4261.48,4261.48,4261.48,4261.48,0.012008,4261.48
4,2017-08-17 04:04:00.000000,4261.48,4261.48,4261.48,4261.48,0.140796,4261.48


## Create Bars

In [6]:
# Build the bars, or load them from the pickle cache when it exists.
# The cache name encodes the bar type, the threshold and the raw-data cache name, so a
# change of data set is reflected here as well instead of a hard-coded symbol.
# With the BTCUSDT defaults this is still 'cstk_time_60MAIN_df_BTCUSDT'.
pickle_filename = "cstk_" + bar_type + "_" + str(threshold) + pickle_filename_main_df

try:
    # NOTE: unpickling can execute arbitrary code; only load cache files that this
    # notebook wrote itself.
    cstk_df = pd.read_pickle(pickle_filename)
except (OSError, IOError) as e:
    # Cache miss: report the reason, build the bars and store them.
    print(e)
    # NOTE(review): the meaning of the trailing 0 argument is defined by createBars — confirm.
    cstk_df = cstk_ob.createBars(MAIN_df, bar_type, threshold, 0)
    cstk_df.to_pickle(pickle_filename)

[Errno 2] No such file or directory: 'cstk_time_60MAIN_df_BTCUSDT'


In [7]:
# Preview the first bars.
cstk_df.head()

Unnamed: 0_level_0,DateStop,Open,High,Low,Close,Volume,Price
DateStart,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-08-17 04:00:00,2017-08-17 04:00:00,4261.48,4261.48,4261.48,4261.48,1.775183,4261.48
2017-08-17 04:01:00,2017-08-17 04:01:00,4261.48,4261.48,4261.48,4261.48,0.0,4261.48
2017-08-17 04:02:00,2017-08-17 04:02:00,4280.56,4280.56,4280.56,4280.56,0.261074,4280.56
2017-08-17 04:03:00,2017-08-17 04:03:00,4261.48,4261.48,4261.48,4261.48,0.012008,4261.48
2017-08-17 04:04:00,2017-08-17 04:04:00,4261.48,4261.48,4261.48,4261.48,0.140796,4261.48


In [8]:
# Price series of the bars; input for the volatility, barrier and label steps below.
close=cstk_df['Price']

In [9]:
# Preview the price series.
close.head()

DateStart
2017-08-17 04:00:00    4261.48
2017-08-17 04:01:00    4261.48
2017-08-17 04:02:00    4280.56
2017-08-17 04:03:00    4261.48
2017-08-17 04:04:00    4261.48
Name: Price, dtype: float64

## Volatility

In [10]:
# Returns and volatility of the price series; volatility_threshold is the
# threshold in bars configured in the Inputs cell.
ret,vol=lbl_ob.get_volatility(close,volatility_threshold)

In [11]:
# Preview the volatility series.
vol.head()

DateStart
2017-08-17 04:01:00         NaN
2017-08-17 04:02:00    0.003166
2017-08-17 04:03:00    0.004691
2017-08-17 04:04:00    0.003632
2017-08-17 04:05:00    0.002976
Name: Price, dtype: float64

## Downsampling (To be Done)

In [12]:
# No downsampling is applied yet: every bar timestamp is passed on to the barrier steps.
timestamps=close.index

## Vertical Barriers

In [13]:
# Vertical barrier for each timestamp; v_bars_duration is the threshold in bars
# configured in the Inputs cell.
v_bars = lbl_ob.vertical_barrier(close, timestamps,v_bars_duration)

In [14]:
# Preview the vertical barriers (start timestamp -> barrier timestamp).
v_bars.head()

DateStart
2017-08-17 04:00:00   2017-08-17 04:01:00
2017-08-17 04:01:00   2017-08-17 04:02:00
2017-08-17 04:02:00   2017-08-17 04:03:00
2017-08-17 04:03:00   2017-08-17 04:04:00
2017-08-17 04:04:00   2017-08-17 04:05:00
Name: DateStart, dtype: datetime64[ns]

## Triple-Barrier Method

In [15]:
# Run the triple-barrier step of the label generator.
# sltp takes the stop-loss / profit-taking limits from the Inputs cell, trgt the
# volatility series and t1 the vertical barriers computed above.
events = lbl_ob.triple_barrier(
    close,
    timestamps,
    sltp=barrier_conf,
    trgt=vol,
    min_ret=min_return,
    num_threads=16,
    t1=v_bars,
    side=None,
)

2019-02-07 01:26:37.992921 100.0% triple_barrier_single done after 2.11 minutes. Remaining 0.0 minutes..


In [16]:
# Preview the events (target, barrier type that was hit, time of the hit).
events.head()

Unnamed: 0_level_0,trgt,type,time
DateStart,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-08-17 04:02:00,0.003166,sl,2017-08-17 04:03:00
2017-08-17 04:03:00,0.004691,t1,2017-08-17 04:04:00
2017-08-17 04:04:00,0.003632,t1,2017-08-17 04:05:00
2017-08-17 04:05:00,0.002976,t1,2017-08-17 04:06:00
2017-08-17 04:06:00,0.002514,t1,2017-08-17 04:07:00


## Create Labels

In [17]:
# Turn the triple-barrier events into labelled rows.
labels = lbl_ob.get_labels(close, events)

In [18]:
# Preview the labels (return, label, barrier time, barrier type).
labels.head()

Unnamed: 0_level_0,ret,label,t1,type
DateStart,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-08-17 04:02:00,-0.004457,-1.0,2017-08-17 04:03:00,sl
2017-08-17 04:03:00,0.0,0.0,2017-08-17 04:04:00,t1
2017-08-17 04:04:00,0.0,0.0,2017-08-17 04:05:00,t1
2017-08-17 04:05:00,0.0,0.0,2017-08-17 04:06:00,t1
2017-08-17 04:06:00,0.0,0.0,2017-08-17 04:07:00,t1


In [19]:
# Class balance of the labels.
labels.label.value_counts()

 1.0    119289
-1.0    113499
 0.0     49526
Name: label, dtype: int64

In [20]:
# Split the labelled rows by class; these subsets feed the marker traces of the plot below.
positive = labels.loc[labels['label'] == 1.0]
negative = labels.loc[labels['label'] == -1.0]
neutral = labels.loc[labels['label'] == 0.0]

In [21]:
# Keep only the rows whose label is not 0; these are mapped onto the features later on.
non_zero_labels = labels.loc[labels['label'] != 0.0]

In [22]:
# Class balance after removing the 0 labels.
non_zero_labels.label.value_counts()

 1.0    119289
-1.0    113499
Name: label, dtype: int64

## Visualizations

In [23]:
# Price, return and volatility stacked in three rows that share the x axis.
# (`tools` is imported together with the other plotly imports at the top of the notebook.)
if visualize_plot:
    trace0 = go.Scattergl(
        x=close.index,
        y=close,
        name='Price',
    )
    trace1 = go.Scattergl(
        x=ret.index,
        y=ret,
        name='Return',
    )
    trace2 = go.Scattergl(
        x=vol.index,
        y=vol,
        name='Volatility',
    )

    fig = tools.make_subplots(rows=3, cols=1, shared_xaxes=True)

    # One trace per row, all in the single column.
    fig.append_trace(trace0, 1, 1)
    fig.append_trace(trace1, 2, 1)
    fig.append_trace(trace2, 3, 1)

    fig['layout'].update(title='Plots')
    # Explicit .html extension: plotly appends it anyway, but emits a warning when it is
    # missing. The resulting file name is unchanged.
    plot(fig, filename='multiple-subplots-shared-yaxes.html')

This is the format of your plot grid:
[ (1,1) x1,y1 ]
[ (2,1) x1,y2 ]
[ (3,1) x1,y3 ]




Your filename `multiple-subplots-shared-yaxes` didn't end with .html. Adding .html to the end of your file.



In [24]:
# Returns of the labelled events together with the horizontal barriers and the label classes.
if visualize_plot:
    trace0 = go.Scattergl(
        x=labels.index,
        y=labels['ret'],
        name='Return',
    )
    # barrier_conf is [stop-loss, profit-taking] (see the Inputs cell), so index 1 scales
    # the upper barrier and index 0 the lower one. The indices were swapped before, which
    # only went unnoticed because both limits are 1.
    trace1 = go.Scattergl(
        x=events.index,
        y=barrier_conf[1]*events['trgt'],
        name='Profit-taking',
    )
    trace2 = go.Scattergl(
        x=events.index,
        y=-barrier_conf[0]*events['trgt'],
        name='Stop-loss',
    )
    # Markers sit at the event start: a +1 label (positive return afterwards) is an entry
    # worth buying, a -1 label (negative return afterwards) one worth selling. The legend
    # names were inverted before.
    trace3 = go.Scattergl(
        x=positive.index,
        y=positive['ret'],
        name='Buy',
        mode='markers',
        marker=dict(color='rgba(0,255,0,0.8)'),
    )
    trace4 = go.Scattergl(
        x=negative.index,
        y=negative['ret'],
        name='Sell',
        mode='markers',
        marker=dict(color='rgba(255,0,0,0.8)'),
    )
    trace5 = go.Scattergl(
        x=neutral.index,
        y=neutral['ret'],
        name='Hold',
        mode='markers',
        marker=dict(color='rgba(0,0,255,0.8)'),
    )

    data = [trace0, trace1, trace2, trace3, trace4, trace5]

    layout = dict(
        title='Triple-Barrier Visualized',
        xaxis=dict(title='DateTimeIndex', ticklen=5, zeroline=False),
        yaxis=dict(title='Returns', ticklen=5, zeroline=False),
    )
    fig = dict(data=data, layout=layout)
    plot(fig)
             

# Feature Extraction

In [25]:
# Period passed to the period-based indicator calls below
# (moving averages, Bollinger bands, AD oscillator).
period = 14

In [26]:
# Feature helpers: fe_ob computes the indicators used below.
# fe_vis and fe_verify are created here but not used in the cells shown.
fe_ob = FeatureExtraction()
fe_vis = FVisual()
fe_verify = VerifyFeature()

## Single Features

In [27]:
# Features
start_time = time.time()
sma_df = fe_ob.simple_moving_avg(MAIN_df,period,dropna=False)
ema_df = fe_ob.exp_moving_avg(MAIN_df,period,dropna=False)
dema_df = fe_ob.double_exp_mov_avg(MAIN_df,period,dropna=False)
tema_df = fe_ob.triple_exp_moving_avg(MAIN_df,period,dropna=False)
get_five_sma = fe_ob.get_five_features(MAIN_df,mode='sma',dropna=False)
get_five_ema = fe_ob.get_five_features(MAIN_df,mode='ema',dropna=False)
BB = fe_ob.bollinger_bands(MAIN_df,period,dropna=False)
rsi_df = fe_ob.rsi(MAIN_df,dropna=False)
vpt_df = fe_ob.vpt(MAIN_df,dropna=False)
emv_df = fe_ob.emv(MAIN_df,dropna=False)
williamsr_df = fe_ob.willamsr(MAIN_df)
roc_df = fe_ob.roc(MAIN_df,dropna=False)

adl_df = fe_ob.ad_oscillaor(MAIN_df,period) # check divisions



print("--- %s seconds ---" % (time.time() - start_time))


invalid value encountered in double_scalars



--- 281.2479238510132 seconds ---


## Features of Full DataFrame

In [28]:
# Column names of the full feature table, in the order in which the table is built below.
# 'Close' and 'WilliamsR' are spelled as in that table (they were 'close' and 'WillamsR'
# before and would not have matched any column).
cols_all = ['Date', 'sma', 'ema', 'dema', 'tema', 'rsi', 'bb_up', 'bb_dn', 'Close',
            'sma5', 'sma10', 'sma15', 'sma20', 'sma25',
            'ema5', 'ema10', 'ema15', 'ema20', 'ema25',
            'VPT', 'EMV', 'WilliamsR', 'ROC', 'ADL'
            ]

In [29]:
# Assemble all indicators into one table; the key order fixes the column order.
full_fdf = pd.DataFrame({
    'Date': pd.to_datetime(MAIN_df['Date']),
    # single-period indicators
    'sma': sma_df['Close'],
    'ema': ema_df['Close'],
    'dema': dema_df['Close'],
    'tema': tema_df['Close'],
    'rsi': rsi_df['RSI'],
    # BB[0] supplies the upper band column, BB[1] the lower one
    'bb_up': BB[0]['Close'],
    'bb_dn': BB[1]['Close'],
    'Close': MAIN_df['Close'],
    # the five simple and the five exponential averages
    **{name: get_five_sma[name] for name in ('sma5', 'sma10', 'sma15', 'sma20', 'sma25')},
    **{name: get_five_ema[name] for name in ('ema5', 'ema10', 'ema15', 'ema20', 'ema25')},
    # remaining indicators
    'VPT': vpt_df['VPT'],
    'EMV': emv_df['EMV'],
    'WilliamsR': williamsr_df['WilliamsR'],
    'ROC': roc_df['ROC'],
    'ADL': adl_df['ADL'],
})

In [30]:
# Index the feature table by date. The guard makes the cell safe to re-run: once 'Date'
# is the index it is no longer a column, and a second set_index call would raise KeyError.
if 'Date' in full_fdf.columns:
    full_fdf = full_fdf.set_index('Date')

In [31]:
# Preview the first 20 rows; the leading rows are mostly NaN for the windowed indicators.
full_fdf.head(20)

Unnamed: 0_level_0,sma,ema,dema,tema,rsi,bb_up,bb_dn,Close,sma5,sma10,...,ema5,ema10,ema15,ema20,ema25,VPT,EMV,WilliamsR,ROC,ADL
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-08-17 04:00:00,,,,,,,,4261.48,,,...,,,,,,,0.0,-100.0,,
2017-08-17 04:01:00,,,,,,,,4261.48,,,...,,,,,,0.0,0.0,-0.0,,
2017-08-17 04:02:00,,,,,,,,4280.56,,,...,,,,,,0.0,0.0,0.0,,
2017-08-17 04:03:00,,,,,,,,4261.48,,,...,,,,,,-0.001164,0.0,-100.0,,
2017-08-17 04:04:00,,,,,,,,4261.48,4265.296,,...,4264.306667,,,,,0.0,0.0,-100.0,,
2017-08-17 04:05:00,,,,,,,,4261.48,4265.296,,...,4263.364444,,,,,0.0,0.0,-100.0,,
2017-08-17 04:06:00,,,,,,,,4261.48,4265.296,,...,4262.736296,,,,,0.0,0.0,-100.0,,
2017-08-17 04:07:00,,,,,,,,4261.48,4261.48,,...,4262.317531,,,,,0.0,0.0,-100.0,,
2017-08-17 04:08:00,,,,,,,,4261.48,4261.48,,...,4262.038354,,,,,0.0,0.0,-100.0,,
2017-08-17 04:09:00,,,,,,,,4261.48,4261.48,4263.388,...,4261.852236,4262.33146,,,,0.0,0.0,-100.0,,


In [32]:
# Save the full feature table.
# NOTE(review): despite the file name, no labels have been added at this point (the
# label column is only attached to `df` further down), and the symbol in the name is
# hard-coded rather than derived from data_folder_name.
full_fdf.to_csv('./full_features_mapped_labels_BTCUSDT.csv')

## Map all features to the dates of the clean (non-zero) labels

In [33]:
# Number of labelled rows (all classes, including 0).
len(labels)

282314

In [34]:
# Row positions in full_fdf of the non-zero label timestamps (index assumed sorted).
# searchsorted returns insertion points, so on its own it would
#   * point one past the end for timestamps later than the last feature row, which
#     makes the following .iloc lookup fail, and
#   * point at the next later row for timestamps that are missing from full_fdf,
#     selecting a row that does not belong to that label.
# Only exact timestamp matches are kept. When every label timestamp exists in full_fdf
# the result is identical to the plain searchsorted call.
positions = full_fdf.index.searchsorted(non_zero_labels.index)
in_range = positions < len(full_fdf)
exact = np.zeros(len(positions), dtype=bool)
exact[in_range] = full_fdf.index[positions[in_range]] == non_zero_labels.index[in_range]
a = positions[exact]

In [35]:
# Select the feature rows at the label positions and drop every row that still has a NaN.
df=full_fdf.iloc[a].dropna()

In [36]:
# Attach the label column; values are aligned on the datetime index.
df = df.assign(label=non_zero_labels['label'])

In [37]:
# Preview the labelled feature table.
df.head()

Unnamed: 0_level_0,sma,ema,dema,tema,rsi,bb_up,bb_dn,Close,sma5,sma10,...,ema10,ema15,ema20,ema25,VPT,EMV,WilliamsR,ROC,ADL,label
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-08-17 04:42:00,4289.889286,4291.454755,4302.800859,4305.824487,75.143827,4319.111516,4260.667056,4300.38,4300.38,4296.766,...,4295.184198,4290.600049,4286.854865,4283.884997,0.0,3447862.0,-15.955056,0.913539,191.218432,1.0
2017-08-17 04:48:00,4304.532857,4302.181738,4313.300291,4314.100865,100.0,4314.485506,4294.580208,4310.07,4310.07,4306.194,...,4305.604489,4301.331977,4297.335642,4293.871307,0.0,3473889.0,-97.123596,0.225329,300.480762,-1.0
2017-08-17 04:49:00,4303.935,4300.825507,4309.286185,4307.676524,100.0,4315.786662,4292.083338,4292.01,4306.458,4305.357,...,4303.132763,4300.16673,4296.828438,4293.728129,-0.000986,-7194855.0,-15.955056,-0.194634,285.085461,1.0
2017-08-17 04:50:00,4304.627143,4302.058106,4310.458946,4309.012048,34.918919,4316.713938,4292.540348,4310.07,4306.458,4306.326,...,4304.394079,4301.404639,4298.089539,4294.985196,0.004594,-7194855.0,-0.0,0.225329,301.728422,1.0
2017-08-17 04:51:00,4305.572857,4303.599692,4312.216461,4311.149621,60.576293,4318.283924,4292.86179,4313.62,4307.168,4307.65,...,4306.071519,4302.931559,4299.568631,4296.418643,0.001028,2048069.0,-40.434084,0.30788,318.335116,-1.0


In [38]:
# Size of the labelled feature table as (rows, columns).
print(df.shape)

(232522, 24)


In [39]:
# Save the labelled feature table.
# NOTE(review): the symbol in the file name is hard-coded and does not follow data_folder_name.
df.to_csv('./downsampled_features_mapped_labels_BTCUSDT.csv')