In [5]:
import os
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit

# The CSV lives in a `data` directory that is a sibling of the current working directory.
DATA_PATH = os.path.join(
    os.path.dirname(os.getcwd()),  # parent of the working directory
    'data',
    'Bitcoin_071911_081921.csv',
)

# Load the raw file as-is (no dtype or date handling yet).
df = pd.read_csv(DATA_PATH)

# Plain-text preview of the first ten rows.
print(df.head(n=10))

           Date     Price      Open      High       Low    Vol. Change %
0  Aug 19, 2021  45,694.6  44,725.9  45,767.1  43,986.7  74.22K    2.17%
1  Aug 18, 2021  44,723.8  44,691.1  46,004.9  44,230.0  22.36K    0.07%
2  Aug 17, 2021  44,691.6  45,907.5  47,140.6  44,441.3  84.14K   -2.84%
3  Aug 16, 2021  45,996.3  46,991.6  48,002.4  45,672.1   2.41K   -2.12%
4  Aug 15, 2021  46,991.3  47,082.6  47,344.1  45,564.1  58.11K   -0.19%
5  Aug 14, 2021  47,081.5  47,809.6  48,090.9  46,117.2  64.29K   -1.52%
6  Aug 13, 2021  47,809.1  44,404.0  47,836.0  44,242.4  75.27K    7.67%
7  Aug 12, 2021  44,403.4  45,562.3  46,213.8  43,814.7  81.07K   -2.55%
8  Aug 11, 2021  45,564.3  45,593.2  46,736.5  45,367.9  75.43K   -0.06%
9  Aug 10, 2021  45,593.8  46,281.8  46,663.4  44,681.2  80.55K   -1.49%


In [11]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt 

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from plotly import tools
# import plotly.plotly as py
import chart_studio.plotly as py
from plotly.offline import init_notebook_mode, iplot
# Initialise plotly's offline notebook mode once, before any of the iplot() calls further down.
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import gc

import matplotlib.pyplot as plt
import seaborn as sns

#import os
#print(os.listdir("../input"))

# from subprocess import check_output
# print(check_output(["ls", "../input"]).decode("utf8"))

In [14]:
import datetime, pytz
#define a conversion function for the native timestamps in the csv file
def dateparse (time_in_secs):
    """Convert a Unix epoch timestamp in seconds to a timezone-aware UTC datetime.

    Args:
        time_in_secs: Seconds since the epoch, as a number or a numeric string
            (anything accepted by ``float()``).

    Returns:
        datetime.datetime: The corresponding instant with ``tzinfo`` set to UTC.

    Raises:
        ValueError: If ``time_in_secs`` cannot be converted to ``float``.
    """
    # Pass tz= so the conversion itself happens in UTC. Calling fromtimestamp()
    # without tz yields *local* wall-clock time, and stamping that as UTC afterwards
    # shifts every value by the machine's UTC offset.
    return datetime.datetime.fromtimestamp(float(time_in_secs), tz=datetime.timezone.utc)


# Re-read the CSV with column 0 parsed into datetimes.
# `parse_dates=[0]` is sufficient on its own; the former `date_parser=pd.to_datetime`
# argument was redundant and is deprecated since pandas 2.0.
data = pd.read_csv(DATA_PATH, parse_dates=[0])
print(data.head(10))

        Date     Price      Open      High       Low    Vol. Change %
0 2021-08-19  45,694.6  44,725.9  45,767.1  43,986.7  74.22K    2.17%
1 2021-08-18  44,723.8  44,691.1  46,004.9  44,230.0  22.36K    0.07%
2 2021-08-17  44,691.6  45,907.5  47,140.6  44,441.3  84.14K   -2.84%
3 2021-08-16  45,996.3  46,991.6  48,002.4  45,672.1   2.41K   -2.12%
4 2021-08-15  46,991.3  47,082.6  47,344.1  45,564.1  58.11K   -0.19%
5 2021-08-14  47,081.5  47,809.6  48,090.9  46,117.2  64.29K   -1.52%
6 2021-08-13  47,809.1  44,404.0  47,836.0  44,242.4  75.27K    7.67%
7 2021-08-12  44,403.4  45,562.3  46,213.8  43,814.7  81.07K   -2.55%
8 2021-08-11  45,564.3  45,593.2  46,736.5  45,367.9  75.43K   -0.06%
9 2021-08-10  45,593.8  46,281.8  46,663.4  44,681.2  80.55K   -1.49%


In [27]:
# First thing is to fix the data for bars/candles where there are no trades.
#
# Every result is assigned back to `data`. Calling fillna(..., inplace=True) on the
# temporary Series produced by .str.replace(...).astype(...) only modified that
# temporary and left `data` untouched.
#
# The price columns are deliberately kept as comma-formatted strings here, because the
# plotting cells further down strip the commas and cast to float64 themselves.

# Volume/trades are a single event, so a missing value is filled with zero.
data['Vol.'] = data['Vol.'].fillna(0)
data['Price'] = data['Price'].fillna('0')  # string zero keeps the column usable with .str

# OHLC (open high low close) data and the daily change are a continuous timeseries,
# so fill those forwards from the preceding row.
# NOTE(review): the head() output shows rows ordered newest-first, so the "preceding
# row" is the following calendar day — confirm this fill direction is intended.
for column in ['Open', 'High', 'Low', 'Change %']:
    data[column] = data[column].ffill()

data.head()

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,2021-08-19,45694.6,44725.9,45767.1,43986.7,74.22K,2.17%
1,2021-08-18,44723.8,44691.1,46004.9,44230.0,22.36K,0.07%
2,2021-08-17,44691.6,45907.5,47140.6,44441.3,84.14K,-2.84%
3,2021-08-16,45996.3,46991.6,48002.4,45672.1,2.41K,-2.12%
4,2021-08-15,46991.3,47082.6,47344.1,45564.1,58.11K,-0.19%


In [28]:
# Date window to keep (both ends inclusive).
start = pd.Timestamp(year=2011, month=7, day=19)
end = pd.Timestamp(year=2021, month=8, day=19)

# Keep only rows inside the window, bucket them into weeks anchored on Monday
# ('W-MON'), take the first value of every column within each bucket, and turn the
# bucket label back into a regular 'Date' column.
weekly_rows = (
    data[data['Date'].between(start, end)]
    .groupby([pd.Grouper(key='Date', freq='W-MON')])
    .first()
    .reset_index()
)
weekly_rows.head()

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,2011-07-25,14.1,14.0,14.7,13.8,31.58K,0.50%
1,2011-08-01,13.1,13.4,13.6,12.9,21.75K,-1.95%
2,2011-08-08,7.8,7.9,8.2,7.1,34.04K,-1.27%
3,2011-08-15,11.1,10.8,11.9,10.7,38.16K,3.24%
4,2011-08-22,10.9,11.3,11.5,10.5,37.00K,-3.63%


In [32]:
# We use Plotly to create the plots https://plot.ly/python/
# Weekly Open / High / Price lines with quick-zoom buttons and a range slider.
# The price columns hold comma-formatted strings, so the thousands separators are
# stripped before casting to float64.
trace1 = go.Scatter(
    x = weekly_rows['Date'],
    y = weekly_rows['Open'].str.replace(',', '').astype('float64'),
    mode = 'lines',
    name = 'Open'
)

trace2 = go.Scatter(
    x = weekly_rows['Date'],
    y = weekly_rows['High'].str.replace(',', '').astype('float64'),
    mode = 'lines',
    name = 'High'
)
trace3 = go.Scatter(
    x = weekly_rows['Date'],
    y = weekly_rows['Price'].str.replace(',', '').astype('float64'),
    mode = 'lines',
    # Legend label matches the plotted column (previously 'Weighted Avg').
    name = 'Price'
)

layout = dict(
    title='Historical Bitcoin Prices (2011-2021) with the Slider ',
    xaxis=dict(
        rangeselector=dict(
            buttons=list([
                #change the count to desired amount of months.
                dict(count=1,
                     label='1m',
                     step='month',
                     stepmode='backward'),
                dict(count=6,
                     label='6m',
                     step='month',
                     stepmode='backward'),
                dict(count=12,
                     label='1y',
                     step='month',
                     stepmode='backward'),
                dict(count=36,
                     label='3y',
                     step='month',
                     stepmode='backward'),
                dict(step='all')
            ])
        ),
        rangeslider=dict(
            visible = True
        ),
        type='date'
    )
)

# Keep the traces in their own variable: rebinding `data` here would overwrite the
# price DataFrame that the cleaning and weekly-sampling cells read from, so those
# cells could no longer be re-run.
traces = [trace1, trace2, trace3]
fig = dict(data=traces, layout=layout)
iplot(fig, filename = "Time Series with Rangeslider")

In [35]:
# Weekly traded volume with quick-zoom buttons and a range slider.
#
# 'Vol.' holds strings such as "74.22K": split each value into its numeric part and
# the multiplier implied by an optional K/M suffix (no suffix -> multiplier 1).
volume_text = weekly_rows['Vol.']
volume_number = volume_text.replace(r'[KM]+$', '', regex=True).astype('float64')
volume_scale = (
    volume_text.str.extract(r'[\d\.]+([KM]+)', expand=False)
    .fillna(1)
    .replace(['K', 'M'], [10**3, 10**6])
    .astype(int)
)

trace1 = go.Scatter(
    x = weekly_rows['Date'],
    y = volume_number * volume_scale,
    mode = 'lines',
    # Legend label describes the plotted series (previously 'Bitcoin Price (Open)').
    name = 'Volume'
)

layout = dict(
    title='Historical Bitcoin Volume (USD) (2011-2021) with the slider',
    xaxis=dict(
        rangeselector=dict(
            buttons=list([
                dict(count=1,
                     label='1m',
                     step='month',
                     stepmode='backward'),
                dict(count=6,
                     label='6m',
                     step='month',
                     stepmode='backward'),
                dict(count=12,
                     label='1y',
                     step='month',
                     stepmode='backward'),
                dict(count=36,
                     label='3y',
                     step='month',
                     stepmode='backward'),
                dict(step='all')
            ])
        ),
        rangeslider=dict(
            visible = True
        ),
        type='date'
    )
)

# Use a dedicated name for the trace list so the `data` DataFrame is not overwritten.
traces = [trace1]
fig = dict(data=traces, layout=layout)
iplot(fig, filename = "Time Series with Rangeslider")

In [36]:
#BTC Volume vs USD visualization
#
# 'Vol.' holds strings such as "74.22K": split each value into its numeric part and
# the multiplier implied by an optional K/M suffix (no suffix -> multiplier 1).
volume_text = weekly_rows['Vol.']
volume_number = volume_text.replace(r'[KM]+$', '', regex=True).astype('float64')
volume_scale = (
    volume_text.str.extract(r'[\d\.]+([KM]+)', expand=False)
    .fillna(1)
    .replace(['K', 'M'], [10**3, 10**6])
    .astype(int)
)

trace = go.Scattergl(
    y = volume_number * volume_scale,
    x = weekly_rows['Price'].str.replace(',', '').astype('float64'),
    mode = 'markers',
    marker = dict(
        color = '#FFBAD2',
        line = dict(width = 1)
    )
)

# Axis titles use the nested title=dict(text=..., font=...) form; the flat `titlefont`
# attribute is deprecated and rejected by recent plotly releases.
axis_title_font = dict(
    family='Courier New, monospace',
    size=18,
    color='#7f7f7f'
)
layout = go.Layout(
    title='BTC Volume v/s USD',
    xaxis=dict(
        # The x values come from the 'Price' column (label was 'Weighted Price').
        title=dict(text='Price', font=axis_title_font)
    ),
    yaxis=dict(
        title=dict(text='Volume BTC', font=axis_title_font)
    ))

# Use a dedicated name for the trace list so the `data` DataFrame is not overwritten.
traces = [trace]
fig = go.Figure(data=traces, layout=layout)
iplot(fig, filename='compare_webgl')

In [None]:
# Time Series forecasting with XGBoost

In [42]:
import seaborn as sns
import matplotlib.pyplot as plt
from fbprophet import Prophet
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Select the 'fivethirtyeight' matplotlib style sheet for the figures that follow.
plt.style.use('fivethirtyeight')

In [None]:
# Load the 1-minute Coinbase file. Column 0 is run through dateparse(), which expects
# epoch seconds and returns timezone-aware datetimes.
data = pd.read_csv('../input/coinbaseUSD_1-min_data_2014-12-01_to_2018-11-11.csv',
                   parse_dates=[0], date_parser=dateparse)

# Refer to the parsed time column by position. The previous code used 'Date' in two
# places and 'Timestamp' in a third, so it failed on a file with only one of the two.
# NOTE(review): confirm which header this CSV actually uses.
time_col = data.columns[0]

# Drop the timezone so the column is plain datetime64.
data[time_col] = data[time_col].dt.tz_localize(None)

# Down-sample to hourly buckets (first observation per hour); the bucket label becomes
# the index, which is what the former reset_index()/set_index() round-trip produced.
data = data.groupby([pd.Grouper(key=time_col, freq='H')]).first()

# Keep only the weighted price and forward-fill hours without trades. ffill() returns
# a new frame, avoiding an in-place fill on a column of a sliced DataFrame.
data = data[['Weighted_Price']].ffill()