In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from datetime import datetime
from orderedset import OrderedSet
from time import sleep

## Get transactions from 1 address
Seems like this API limits this to 10 000 txs.

In [2]:
def get_transactions_from_address(address,start_block,end_block):# api-endpoint
  URL = "https://api.etherscan.io/api"
  
  # parameters
  address = "0x4ddc2d193948926d02f9b1fe9e1daa0718270ed5"  
  api_key = "8FHTNPR8GQPAEVSRHBVDZ4TWEJQCCUB1NX"
  # defining a params dict for the parameters to be sent to the API
  PARAMS = {'module':'account','action':'txlist','address':address,'startblock':start_block,'end_block':end_block,'apikey':api_key}
  
  # sending get request and saving the response as response object
  r = requests.get(url = URL, params = PARAMS)
  
  # extracting data in json format
  data = r.json()

  if len(data["result"]) > 0:
      # Load response into Dataframe and print it
      df = pd.DataFrame(data['result'])

      # Throw away some columns
      df = df.loc[df['isError'] == "0"]      

      df = df[["timeStamp","from","to","value","gas",'isError',"gasUsed"]]

      # Convert unix timestamp into something readable
      df['timeStamp'] = df['timeStamp'].apply(lambda ts: datetime.fromtimestamp(int(ts)).strftime("%c"))
      if len(df) == 10000:
        print("[Warning] 10k txs")
  
  return df 


## Load addresses with highest balances
I copied these addresses manually btw, seems like there is no API call for it

In [3]:
## Get addresses with highest balance
addresses = []
with open("top_1000_addresses.txt",'r') as f:
    addresses = list(f.read().split("\n"))

## BFS walk for 1 month period

In [4]:
end_block = 12204609
start_block = 12003173
end_block = start_block+int((end_block-start_block)/4)

# Only collect 100 million txs
collect_max = 1e8
collected = 0

output_file = "randomwalk_{}_{}.txt".format(start_block,end_block)
f = open(output_file,'w')

queue = OrderedSet(addresses[:100])
seen_addresses = set()

while len(queue) > 0 and collected < collect_max:
    address = queue.pop()
    
    df = get_transactions_from_address(address,start_block,end_block)
    collected += len(df) # Count new txs

    # Add all unseen addresses to queue
    seen_addresses.add(address)
    new_addresses = set([x for x in list(df["from"])+list(df["to"]) if x not in seen_addresses])
    for new in new_addresses:
        queue.add(new)

    # Record this address
    sleep(0.25) # Wait 0.20s because we are limited to 5tx/s

    # Append dataframe in CSV format to output file
    f.write(df.to_csv())
    print("Q = {}, Txs: {} of {}, New txs: {}, seen addr: {}".format(len(queue),collected,collect_max,len(df),len(seen_addresses)))

Q = 3248, Txs: 6096 of 100000000.0, New txs: 6096, seen addr: 1
Q = 3247, Txs: 12192 of 100000000.0, New txs: 6096, seen addr: 2
Q = 3246, Txs: 18288 of 100000000.0, New txs: 6096, seen addr: 3
Q = 3245, Txs: 24384 of 100000000.0, New txs: 6096, seen addr: 4
Q = 3244, Txs: 30480 of 100000000.0, New txs: 6096, seen addr: 5
Q = 3243, Txs: 36576 of 100000000.0, New txs: 6096, seen addr: 6
Q = 3242, Txs: 42672 of 100000000.0, New txs: 6096, seen addr: 7
Q = 3241, Txs: 48768 of 100000000.0, New txs: 6096, seen addr: 8
Q = 3240, Txs: 54864 of 100000000.0, New txs: 6096, seen addr: 9
Q = 3239, Txs: 60960 of 100000000.0, New txs: 6096, seen addr: 10
Q = 3238, Txs: 67056 of 100000000.0, New txs: 6096, seen addr: 11
