In [1]:
!pip install OrderedSet
!pip install numpy
!pip install pandas
!pip install matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from datetime import datetime
from orderedset import OrderedSet
from time import sleep
import sys



## Get transactions from 1 address
Seems like this API limits this to 10 000 txs.

In [2]:
def get_transactions_from_address(address,start_block,end_block):
  URL = "https://api.etherscan.io/api"
  
  # parameters
  api_key = "8FHTNPR8GQPAEVSRHBVDZ4TWEJQCCUB1NX"
  # defining a params dict for the parameters to be sent to the API
  PARAMS = {'module':'account','action':'txlist','address':address,'startblock':start_block,'endblock':end_block,'apikey':api_key}
  
  # sending get request and saving the response as response object
  try:
    r = requests.get(url = URL, params = PARAMS)
  except Exception:
    # If connection error, skip this address
    print("[Error] Error connecting to API, skipping address...")
    return pd.DataFrame({"from":[],"to":[],"value":[]}),0  
  
  # extracting data in json format
  data = r.json()
  if data["status"] != "0":
    print("{}".format(data["message"]))

  if len(data["result"]) > 0:
      # Load response into Dataframe
      try:
        df = pd.DataFrame(data['result'])
      except Exception:
        print("Error reading dataFrame from values, written to error.txt")
        with open("error.txt",'w') as f:
          f.write(data["result"])
          return pd.DataFrame({"from":[],"to":[],"value":[]}),0

      # Throw away some weird values
      df = df.loc[df['isError'] == "0"]
      df = df.loc[df["value"] != "0"]

      full_txn_count = len(df)

      # df = df[["timeStamp","from","to","value","gas",'isError',"gasUsed"]]
      df = df[["from","to","value"]]

      # Convert unix timestamp into something readable
      # df['timeStamp'] = df['timeStamp'].apply(lambda ts: datetime.fromtimestamp(int(ts)).strftime("%c"))

      # Group txs by (from,to) and sum value
      df['value'] = df['value'].apply(lambda x: int(x))   # Convert value str to int
      df = df.groupby(['from', 'to']).agg({'value':'sum'})
      df = df.reset_index(level=df.index.names)

      if len(df) == 10000:
        print("[Warning] 10k txs")
      return df,full_txn_count
  
  return pd.DataFrame({"from":[],"to":[],"value":[]}),0

## Load addresses with highest balances
I copied these addresses manually btw, seems like there is no API call for it

In [3]:
## Get addresses with highest balance
addresses = []
with open("top_1000_addresses.txt",'r') as f:
    addresses = list(f.read().split("\n"))

## BFS walk for 1 month period

In [4]:
# Starting Jan 17, a snapshot each 8 months
# Block numbers from https://blockchair.com/ethereum/
snapshot_blocklimits = []
snapshot_blocklimits.append((2912407,3100153))    # Jan '17
snapshot_blocklimits.append((4467005,4652925))    # Nov '17
snapshot_blocklimits.append((6249399,6430272))    # Sep '18
snapshot_blocklimits.append((8062293,8261511))    # Jul '19
snapshot_blocklimits.append((9976964,10176689))    # May '20
snapshot_blocklimits.append((11948960,12150244))    # Mar '21

# Only collect 100 million txs
collect_max = 1e6
collected = 0

# Get start and end block
snapshot = 2
start_block = snapshot_blocklimits[snapshot][0]
end_block = snapshot_blocklimits[snapshot][1]

output_file = "Ntxs_bf_blocks{}_{}.txt".format(start_block,end_block)
f = open(output_file,'w',buffering=1)

queue = OrderedSet(addresses[:100])
seen_addresses = set()

while len(queue) > 0 and collected < collect_max:
    address = queue.pop()
    
    df, n = get_transactions_from_address(address,start_block,end_block)
    collected += n # Count new txs

    # Add all unseen addresses to queue
    seen_addresses.add(address)
    new_addresses = set([x for x in list(df["from"])+list(df["to"]) if x not in seen_addresses])
    for new in new_addresses:
        if len(queue) < 10000:
            queue.add(new)

    # sleep(0.25) # Wait 0.20s because we are limited to 5tx/s

    # Append dataframe in CSV format to output file
    f.write(df.to_csv(header=False,index=False))
    print("Txs: {} of {}, New txs: {}, seen addr: {}, Q = {} ({} new), ".format(collected,collect_max,len(df),len(seen_addresses),len(queue),len(new_addresses)))

seen addr: 6, Q = 94 (0 new), 
Txs: 0 of 1000000.0, New txs: 0, seen addr: 7, Q = 93 (0 new), 
Txs: 0 of 1000000.0, New txs: 0, seen addr: 8, Q = 92 (0 new), 
Txs: 0 of 1000000.0, New txs: 0, seen addr: 9, Q = 91 (0 new), 
Txs: 0 of 1000000.0, New txs: 0, seen addr: 10, Q = 90 (0 new), 
Txs: 0 of 1000000.0, New txs: 0, seen addr: 11, Q = 89 (0 new), 
Txs: 0 of 1000000.0, New txs: 0, seen addr: 12, Q = 88 (0 new), 
Txs: 0 of 1000000.0, New txs: 0, seen addr: 13, Q = 87 (0 new), 
Txs: 0 of 1000000.0, New txs: 0, seen addr: 14, Q = 86 (0 new), 
Txs: 0 of 1000000.0, New txs: 0, seen addr: 15, Q = 85 (0 new), 
Txs: 0 of 1000000.0, New txs: 0, seen addr: 16, Q = 84 (0 new), 
Txs: 0 of 1000000.0, New txs: 0, seen addr: 17, Q = 83 (0 new), 
Txs: 0 of 1000000.0, New txs: 0, seen addr: 18, Q = 82 (0 new), 
Txs: 0 of 1000000.0, New txs: 0, seen addr: 19, Q = 81 (0 new), 
Txs: 0 of 1000000.0, New txs: 0, seen addr: 20, Q = 80 (0 new), 
Txs: 0 of 1000000.0, New txs: 0, seen addr: 21, Q = 79 (0 new)

ValueError: DataFrame constructor not properly called!