In [2]:
import requests
from random import sample
from datetime import datetime, timedelta
import json
import pandas as pd
from tqdm import tqdm

In [3]:
def readBlock(hash, printouts=False):
    # api-endpoint
    # URL = "https://blockchain.info/rawblock/000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f?format=hex"
    # URL = "https://blockchain.info/rawblock/00000000000000001e8d6829a8a21adc5d38d0a473b144b6765798e61f98bd1d?format=hex"
    URL = "https://blockchain.info/rawblock/" + hash + "?format=hex"

    # sending get request and saving the response as response object
    r = requests.get(url = URL) #, params = PARAMS)
    
    # print(r.content)
    if printouts: print('text: ' + r.text)
    if printouts: print('encoding: ' + r.encoding)
    # print('type: ' + type(r.text))

    blockHex = r.text
    # head = r.text[:size2]
    global curr
    curr = 0

    def read(size):
        global curr
        prev = curr
        curr += size * 2
    #     if size == 1:
    #         print("prev: ", prev)
    #         print("curr: ", curr)
    #         print("head sect: ", blockHex[prev:curr])
        return blockHex[prev:curr]

    def readInt(size=4):
        tempArr = bytearray(bytes.fromhex(read(size)))
    #     if size == 1:
    #         print("tempArr: ", tempArr)
        return int.from_bytes(tempArr, "little")

    def readHash(size=32):
        tempArr = bytearray(bytes.fromhex(read(32)))
        tempArr.reverse()
        return tempArr

    def readVarInt():
        size = readInt(1)
    #     print("readvarint size: ", size)
        if size < 253:
            return size
        elif size == 253:
            return readInt(2)
        elif size == 254:
            return readInt(4)
        elif size == 255:
            return readInt(8)
        return -1

    def readTxInput():
        tx_input = {}
        tx_input["prev_out"] = readHash().hex()
        tx_input["txOutId"] = readInt(4)
        tx_input["scriptLen"] = readVarInt()
        tx_input["script"] = read(tx_input["scriptLen"])
        tx_input["sequence"] = readInt(4)
        return tx_input

    def readTxInputs(vin_sz):
        outputs = []
        for i in range(0, vin_sz):
            outputs.append(readTxInput())
        return outputs
        
    def readTxOutput():
        tx_output = {}
        tx_output["value"] = readInt(8)
        tx_output["scriptLen"] = readVarInt()
        tx_output["pubkey"] = read(tx_output["scriptLen"])
        return tx_output
        
    def readTxOutputs(vout_sz):
        outputs = []
        for i in range(0, vout_sz):
            outputs.append(readTxOutput())
        return outputs
        
    def readTx(n_tx):
        outputs = []
        for i in range(0, n_tx):
            tx = {}
            tx["ver"] = readInt()
            tx["vin_sz"] = readVarInt()
            tx["inputs"] = readTxInputs(tx["vin_sz"])
            tx["vout_sz"] = readVarInt()
            tx["outputs"] = readTxOutputs(tx["vout_sz"])
            tx["lock_time"] = readInt()
            outputs.append(tx)
        return outputs

    returnObj = {}

    if printouts: print("===HEADER===")

    returnObj["ver"] = readInt()
    if printouts: print("ver: ", returnObj["ver"])

    returnObj["prev_block"] = readHash().hex()
    if printouts: print("prev_block: ", returnObj["prev_block"])

    returnObj["mrkl_root"] = readHash().hex()
    if printouts: print("mrkl_root: ", returnObj["mrkl_root"])

    returnObj["time"] = readInt()
    if printouts: print("time: ", returnObj["time"])
    
    URL2 = "https://min-api.cryptocompare.com/data/v2/histoday?fsym=BTC&tsym=USD&limit=1&aggregate=1&toTs=" + str(returnObj["time"])
    r2 = requests.get(url=URL2).json()
    #print("r2: ", r2)
    returnObj["price"] = r2["Data"]["Data"][-1]["close"]
#     returnObj["price"] = json.loads(r2).get("close")

    returnObj["bits"] = readInt()
    if printouts: print("bits: ", returnObj["bits"])

    returnObj["nonce"] = readInt()
    if printouts: print("nonce: ", returnObj["nonce"])

    if printouts: print("===TRANSACTIONS===")

    returnObj["n_tx"] = readVarInt()
    if printouts: print("n_tx: ", returnObj["n_tx"])

    returnObj["tx"] = readTx(returnObj["n_tx"])
    if printouts: print("tx: ", returnObj["tx"])
        
    returnObjJson = json.dumps(returnObj)
    if printouts: print(json.dumps(returnObj, indent=2, sort_keys=False))

    return json.loads(returnObjJson)

In [8]:
def getUnixTimestampBetweenDates(start_y, start_m, start_d):
  return str(int(datetime(start_y, start_m, start_d).timestamp() * 1000))

1551416400000


In [9]:
def getBitcoinData(start_y, start_m, start_d):
  
  date = getUnixTimestampBetweenDates(start_y, start_m, start_d)

  # key: unix time in millisecondm, value: (num_transaction, high, low, avg, block_height)
  final_data = {}

  # get block data for the day
  api_url = f'https://blockchain.info/blocks/{date}?format=json'
  response = requests.get(api_url)
  blocks = response.json()
  sampled_blocks = sample(blocks, 10)
  num_transactions = 0
  avg = 0

  for block in tqdm(sampled_blocks):
    # get data for each randomly sampled block
    block_hash = block["hash"]
    result = readBlock(block_hash)
    num_transactions += result["n_tx"]
    avg += result["price"]
  
  avg /= 10
  num_blocks = len(blocks)
  num_transactions *= num_blocks // 10

  data_seconds = str(int(date) // 1000)
  URL2 = "https://min-api.cryptocompare.com/data/v2/histoday?fsym=BTC&tsym=USD&limit=1&aggregate=1&toTs=" + data_seconds
  r2 = requests.get(url=URL2).json()
  low = r2["Data"]["Data"][-1]["low"]
  high = r2["Data"]["Data"][-1]["high"]
  final_data[date] = (num_transactions, high, low, round(avg, 4), num_blocks)
  
  return final_data  

In [10]:
def bitcoin_data_to_csv(start_y, start_m, start_d, csv_path):
  bc_data = getBitcoinData(start_y, start_m, start_d)
  df = pd.DataFrame.from_dict(bc_data, orient='index', columns=['n_tx', 'high', 'low', 'avg', "height"])
  df.to_csv(csv_path)

In [22]:
def gen_bitcoin_data_daily(start_y, start_m, start_d):
  for m in range(start_m, 13):
    for d in range(start_d, 31):
      if m == 2 and d > 28:
        # Feb
        break
      bitcoin_data_to_csv(start_y, m, d, f'data/{start_y}_{m}_{d}_bitcoin.csv')
      print(f'{start_y}/{m}/{d} DONE.')



In [24]:
# Jeffrey runs this
gen_bitcoin_data_daily(2019, 1, 1)

 30%|███       | 3/10 [00:07<00:17,  2.44s/it]

In [None]:
# Simon runs this
gen_bitcoin_data_daily(2020, 1, 1)

In [None]:
# Lawrence runs this
gen_bitcoin_data_daily(2021, 1, 1)