# Getting Data

## Loading Libraries

In [1]:
import time
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup # for web scraping
import requests    # for requesting html
from requests.adapters import HTTPAdapter
from urllib3.util import Retry
import os
import shutil
from tqdm.auto import tqdm
import re
from csv import reader

## Scraping data source 1

In [2]:
# Names of the raw features scraped from source 1
# source1: https://bitinfocharts.com/bitcoin/
raw_values = np.array([
    'transactions', 'size', 'sentbyaddress', 'difficulty', 'hashrate',
    'mining_profitability', 'sentinusd', 'transactionfees',
    'median_transaction_fee', 'confirmationtime', 'transactionvalue',
    'marketcap', 'tweets', 'google_trends', 'mediantransactionvalue',
    'activeaddresses', 'top100cap', 'fee_to_reward', 'price',
])

# Technical indicators which are derived from the raw values for each period
technical_indicators = np.array([
    'sma', 'ema', 'wma', 'std', 'mom', 'var', 'trx', 'rsi', 'roc',
])

# Look-back periods, in days, kept as strings
period_in_days = np.array(['3', '7', '14', '30', '90'])

In [3]:
# Build one bitinfocharts comparison URL per raw feature; the raw feature
# name is reused unchanged as the column name.
url_prefix = 'https://bitinfocharts.com/comparison/'+'bitcoin'+'-'
URLs = [url_prefix+raw_name+'.html#alltime' for raw_name in raw_values]
feature_names = [raw_name for raw_name in raw_values]


In [4]:
# Pair each feature name with the URL it is scraped from
details = {'Features': feature_names, 'URLs': URLs}
details_df = pd.DataFrame(details)

# Empty frame with one column per feature ...
features = pd.DataFrame(columns=details_df.Features)
print('Building URLs ...')
# ... plus one '<feature>Date' column per feature. The column list is
# snapshotted first because the loop appends columns while it runs.
original_columns = list(features.columns)
for feature_name in tqdm(original_columns):
    date = feature_name + 'Date'
    features[date] = date

Building URLs ...


  0%|          | 0/19 [00:00<?, ?it/s]

In [5]:
# Download every series listed in details_df and left-join them on 'Date'.
df_merge = None
print('Requesting data..............')

# One session serves all requests; the with-block closes it afterwards.
with requests.Session() as s:
    # https://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html
    retry = Retry(connect=10, backoff_factor=3)
    # https://www.kite.com/python/docs/requests.adapters.HTTPAdapter
    adapter = HTTPAdapter(max_retries=retry)
    # Mount the retrying adapter for https:// as well: adapters are selected by
    # URL prefix, so mounting only 'http://' never applies it to https:// URLs.
    s.mount('http://', adapter)
    s.mount('https://', adapter)

    for i in tqdm(range(len(details_df))):
        url = details_df.URLs[i]
        col = details_df.Features[i]

        page = s.get(url)
        # Surface HTTP errors here rather than as a parsing failure further down
        page.raise_for_status()

        # The series is read from the script tag at index 4, where it appears
        # as the data argument of `new Dygraph(...)`.
        script_text = BeautifulSoup(page.content, 'html').find_all('script')[4].string
        # https://stackoverflow.com/questions/65403953/webscraping-js-elements-with-soup-isnt-working
        # https://stackoverflow.com/questions/27881366/regular-expressions-and777
        match = re.search(r'd = new Dygraph\(document.getElementById\(\"container\"\)\,(.*)', script_text or '')
        if match is None:
            raise ValueError('Dygraph data not found for %r at %s' % (col, url))

        # Keep what precedes the '{labels:' options object, minus its last two characters
        data = match.group(1).split('{labels:')[0][:-2]
        # Strip the '[new Date("' prefixes, turn '"),' into ';', drop the brackets
        # and split on '],' so every element is one '<date>;<value>' string.
        data = re.sub(r'\[new Date\(\"', '', data)
        data = re.sub(r'\"\)\,', ';', data)
        data = re.sub(r'\]\]', '', data)
        data = re.sub(r'\[', '', data).split('],')

        df = pd.DataFrame(list(reader(data)))
        df.columns = ['name']
        # `n` is passed by keyword: pandas 2.0 no longer accepts it positionally
        df[['Date', col]] = df.name.str.split(';', n=1, expand=True)
        df = df.drop(columns=['name']).set_index('Date')

        # https://stackoverflow.com/questions/29370057/select-dataframe-rows-between-two-dates
        # Label slice on the 'Date' strings; reset_index() returns a new frame,
        # so no slice of df is modified in place.
        df_new = df.loc['2010/07/17':'2022/07/17'].reset_index()

        if df_merge is None:
            df_merge = df_new
        else:
            df_merge = pd.merge(df_merge, df_new, on='Date', how='left')

df_merged = pd.DataFrame(df_merge)

Requesting data..............


  0%|          | 0/19 [00:00<?, ?it/s]

In [6]:
# Persist the merged source-1 table as a comma-separated file, without the row index
df_merged.to_csv(
    path_or_buf='btc_data_raw.csv',
    sep=',',
    index=False,
)

In [7]:
# Load the source-1 table back from the CSV file
data_raw_1 = pd.read_csv(filepath_or_buffer='btc_data_raw.csv')

In [8]:
# Fill gaps by interpolating down the rows. Only numeric columns are passed to
# interpolate(): object columns (such as a column of date strings) cannot be
# interpolated, and recent pandas versions deprecate calling interpolate() on them.
numeric_cols = data_raw_1.select_dtypes(include='number').columns
if len(numeric_cols):
    data_raw_1[numeric_cols] = data_raw_1[numeric_cols].interpolate(axis=0)

In [9]:
# The 'price' series from source 1 is called 'avg_price' from here on
data_raw_1 = data_raw_1.rename(columns={'price': 'avg_price'})

## Scraping data source 2

In [10]:
# Fear and Greed Index
# source2: https://alternative.me/crypto/fear-and-greed-index/
# (BeautifulSoup, requests, pandas and csv.reader are already imported in the first cell.)
url = 'https://api.alternative.me/fng/?limit=1500&format=csv&date_format=us'
text = requests.get(url)
# Fail here on an HTTP error instead of parsing an error page
text.raise_for_status()
# Name the parser explicitly: the next cell reads `soup.p`, which relies on the
# parser wrapping the plain-text response in a <p> element (lxml does this;
# NOTE(review): confirm lxml is the parser this notebook was run with).
soup = BeautifulSoup(text.content, 'lxml')

In [11]:
from datetime import datetime

# Drop the first 4 and the last 5 lines of the response text before CSV parsing
# (presumably non-record wrapper lines — TODO confirm against the API payload).
data_lst = soup.p.text.split('\n')[4:-5]
df_fg = pd.DataFrame(list(reader(data_lst)))
df_fg.columns = ['Date', 'Value', 'Classification']
# Parse the 'MM-DD-YYYY' dates and re-format them as 'YYYY/MM/DD' strings in one pass
df_fg['Date'] = pd.to_datetime(df_fg['Date'], format='%m-%d-%Y').dt.strftime('%Y/%m/%d')
# Sort ascending on the re-formatted date strings
df_fg = df_fg.sort_values(by='Date')
data2 = df_fg

In [12]:
# Wrap data2 in a DataFrame bound to the name used for source 2 in the following cells.
data_raw_2 = pd.DataFrame(data2)

In [13]:
# Remove the 'Classification' column
data_raw_2 = data_raw_2.drop(columns=['Classification'])

In [14]:
# Preview the first two rows
data_raw_2.iloc[:2]

Unnamed: 0,Date,Value
1499,2018/07/15,32
1498,2018/07/16,36


In [15]:
# Give the index value column its feature name
data_raw_2 = data_raw_2.rename(columns={'Value': 'fear_gear_index'})

In [16]:
# Cast the index values to int
data_raw_2 = data_raw_2.astype({'fear_gear_index': int})

## Scraping data source 3

In [17]:
# source3 : https://investpy.readthedocs.io/_api/crypto.html
!pip install investpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting investpy
  Downloading investpy-1.0.8.tar.gz (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 6.1 MB/s 
[?25hCollecting Unidecode>=1.1.1
  Downloading Unidecode-1.3.4-py3-none-any.whl (235 kB)
[K     |████████████████████████████████| 235 kB 33.6 MB/s 
Building wheels for collected packages: investpy
  Building wheel for investpy (setup.py) ... [?25l[?25hdone
  Created wheel for investpy: filename=investpy-1.0.8-py3-none-any.whl size=4481592 sha256=57930fec62055efbeb9c5c33a575614ad691d5dee1faf4747c831d85723bea07
  Stored in directory: /root/.cache/pip/wheels/96/a8/a5/0d33c72eaf00b41df7b9dc1e15d2b7c7154b3f1379ed350211
Successfully built investpy
Installing collected packages: Unidecode, investpy
Successfully installed Unidecode-1.3.4 investpy-1.0.8


In [18]:
import investpy

# Fetch the bitcoin history from investpy, move the index into a regular
# column and drop the 'Volume' and 'Currency' columns.
data_raw_3 = (
    investpy.crypto.get_crypto_historical_data(
        crypto='bitcoin',
        from_date='01/01/2010',
        to_date='31/05/2022',
    )
    .reset_index()
    .drop(columns=['Volume', 'Currency'])
)

In [19]:
# Colab-only: enable Colab's custom widget manager.
from google.colab import output
output.enable_custom_widget_manager()

In [20]:
# Colab-only: disable Colab's custom widget manager.
# NOTE(review): this follows a cell that enables it — confirm both cells are still needed.
from google.colab import output
output.disable_custom_widget_manager()

In [21]:
# Re-wrap data_raw_3 in a DataFrame under the same name.
data_raw_3 = pd.DataFrame(data_raw_3)

In [22]:
import datetime as dt

# Format the 'Date' values as 'YYYY/MM/DD' strings. (`.dt` below is the pandas
# datetime accessor; the `dt` module alias is not used in this cell.)
data_raw_3 = data_raw_3.assign(Date=data_raw_3['Date'].dt.strftime('%Y/%m/%d'))

## Scraping data source 4

In [23]:
#source4:  https://data.nasdaq.com/data/BCHAIN/MIREV-bitcoin-miners-revenue
!pip install Quandl

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Quandl
  Downloading Quandl-3.7.0-py2.py3-none-any.whl (26 kB)
Collecting inflection>=0.3.1
  Downloading inflection-0.5.1-py2.py3-none-any.whl (9.5 kB)
Installing collected packages: inflection, Quandl
Successfully installed Quandl-3.7.0 inflection-0.5.1


In [24]:
import os

import quandl

# The API key is read from the environment so that the secret is not stored in
# the notebook. Set QUANDL_API_KEY before running this cell.
quandl_api_key = os.environ.get('QUANDL_API_KEY')
if not quandl_api_key:
    raise RuntimeError('Set the QUANDL_API_KEY environment variable before running this cell.')

# Download the BCHAIN/MIREV dataset and move its index into a regular column
data_raw_4 = quandl.get("BCHAIN/MIREV", authtoken=quandl_api_key)
data_raw_4 = data_raw_4.reset_index()
# Format the 'Date' values as 'YYYY/MM/DD' strings
data_raw_4['Date'] = data_raw_4['Date'].dt.strftime('%Y/%m/%d')

In [25]:
# Give the value column its feature name
data_raw_4 = data_raw_4.rename(columns={'Value': 'miners_revenue'})

In [26]:
# Re-wrap data_raw_4 in a DataFrame under the same name.
data_raw_4 = pd.DataFrame(data_raw_4)

## Other features

In [27]:
# Start from an empty frame; the following cells add columns computed from data_raw_1.
data_raw_5 = pd.DataFrame()

In [28]:
# Copy the 'Date' column over from the source-1 table
data_raw_5 = data_raw_5.assign(Date=data_raw_1['Date'])

In [29]:
# coins_in_supply = marketcap / avg_price, computed row by row
market_cap = data_raw_1['marketcap']
average_price = data_raw_1['avg_price']
data_raw_5['coins_in_supply'] = market_cap.div(average_price)

In [30]:
# Remove the 'marketcap' column from the source-1 table
data_raw_1 = data_raw_1.drop(columns=['marketcap'])

### Note:
Market capitalization is the product of the average bitcoin price and the number of BTC coins in supply. The new feature `coins_in_supply` is therefore derived as `marketcap / avg_price`, and `marketcap` itself is then dropped from the dataset.

## Merging Data

In [31]:
# Inner-join the source-1 and source-3 tables on 'Date'
data_raw_24 = data_raw_1.merge(data_raw_3, how='inner', on='Date')

In [32]:
# Inner-join the source-4 table on 'Date'
data_raw_24 = data_raw_24.merge(data_raw_4, how='inner', on='Date')

In [33]:
# Inner-join the derived features (data_raw_5) on 'Date'
data_raw_24 = data_raw_24.merge(data_raw_5, how='inner', on='Date')

In [34]:
# Second dataset: data_raw_24 inner-joined with the source-2 table on 'Date'
data_raw_25 = data_raw_24.merge(data_raw_2, how='inner', on='Date')

## Target Shifting

In [35]:
def target_shift(df, col):
    """Return ``df`` with a ``next_day_BTC_price`` column taken from the next row of ``col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is left unmodified.
    col : pandas.Series or array-like
        Values to shift. They are attached to ``df`` with the usual pandas
        column-assignment rules (a Series is aligned on the index).

    Returns
    -------
    pandas.DataFrame
        A new frame without the last row of ``df`` (which has no following
        value), where row ``i`` of ``next_day_BTC_price`` holds the value of
        ``col`` at row ``i + 1``.

    Notes
    -----
    The function shifts by row position; it assumes the rows are already in
    the intended (chronological) order.
    """
    # assign() works on a copy, so the caller's frame does not gain the column
    with_target = df.assign(next_day_BTC_price=col)
    # Value of row i + 1 for every row i; taken positionally, so no placeholder
    # fill value is needed for the (dropped) last row and the dtype is kept.
    next_values = with_target['next_day_BTC_price'].iloc[1:].values
    # Copy the slice so later writes to the result do not target a view
    shifted = with_target.iloc[:-1].copy()
    shifted['next_day_BTC_price'] = next_values
    return shifted

In [36]:
# Add the next-row 'Close' value as the target column
data_raw_24 = target_shift(df=data_raw_24, col=data_raw_24['Close'])

In [37]:
# Add the next-row 'Close' value as the target column
data_raw_25 = target_shift(df=data_raw_25, col=data_raw_25['Close'])

## Saving data to file

In [42]:
# Colab-only: enable Colab's custom widget manager.
from google.colab import output
output.enable_custom_widget_manager()

In [41]:
# Colab-only: disable Colab's custom widget manager.
# NOTE(review): this follows a cell that enables it — confirm both cells are still needed.
from google.colab import output
output.disable_custom_widget_manager()

In [38]:
# Colab-only: mount Google Drive at /content/drive.
# NOTE(review): the CSV files saved in the next cell use relative paths, not
# /content/drive — confirm where they are meant to be written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [43]:
# Write both final tables as comma-separated files, without the row index
for frame, csv_path in (
    (data_raw_24, 'btc_raw_24.csv'),
    (data_raw_25, 'btc_raw_25.csv'),
):
    frame.to_csv(csv_path, sep=',', index=False)