In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns

import csv
import re

In [5]:
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.firefox import GeckoDriverManager

# ** Chrome
# driver = webdriver.Chrome(ChromeDriverManager().install()) # If you don't have the chromedriver installed, it will install it for you
# driver = webdriver.Chrome() # If you have the chromedriver installed, you can use this line


# ** Firefox
# driver = webdriver.Firefox(GeckoDriverManager().install()) # If you don't have the geckodriver installed, it will install it for you
driver = webdriver.Firefox() # If you have the geckodriver installed, you can use this line

In [4]:
SCRAPE_URL = "https://etherscan.io/txs"
SCRAPE_BLOCKS = 10
TRANSACTIONS_CSV = "../data/transactions.csv"
TRANSACTIONS_CLEANED_CSV = "../data/transactions_cleaned.csv"
SAMPLE_PROPORTION = 0.1
RANDOM_SEED = 42

In [6]:
driver.get(SCRAPE_URL)

In [4]:
def parse_transactions(page_source: str) -> list[dict[str, str]]:
    soup = BeautifulSoup(page_source, "html.parser")
    table = soup.find("table", {"class": "table table-hover table-align-middle mb-0"})
    body = table.find("tbody")
    rows = body.find_all("tr")
    transactions : list[dict[str, str]] = []
    for row in rows:
        cells = row.find_all("td")
        if len(cells) > 0:
            transaction = {
                "Txn Hash": cells[1].text.strip(),
                "Method": cells[2].text.strip(),
                "Block": cells[3].text.strip(),
                "Timestamp": cells[4].text.strip(),
                "Age": cells[5].text.strip(),
                "From": cells[7].text.strip(),
                "To": cells[9].text.strip(),
                "Value": cells[10].text.strip(),
                "Txn Fee": cells[11].text.strip(),
            }
            transactions.append(transaction)
    return transactions

In [7]:
last_block = None
first_block = None
transactions : list[dict[str, str]] = []

while True:
    transactions += parse_transactions(driver.page_source)

    if first_block is None:
        last_block = int(transactions[0]["Block"])
        first_block = last_block - SCRAPE_BLOCKS + 1

    if int(transactions[-1]["Block"]) < first_block:
        break

    driver.find_element(By.LINK_TEXT, "Next").click()

driver.close()

In [17]:
print(f"Scraped {len(transactions)} transactions from blocks {first_block} to {last_block}")
print(transactions[0])
print(transactions[-1])

Scraped 4700 transactions from blocks 19334507 to 19334516
{'Txn Hash': '0x87ad207685a04a5630df4dc60f691b3e6679896f1def7d3c5a8af8600be9e1c0', 'Method': 'Sell To Uniswap', 'Block': '19334516', 'Timestamp': '2024-02-29 16:59:47', 'Age': '11 secs ago', 'From': '0xD11c274E...100D7A541', 'To': '0x: Exchange Proxy', 'Value': '0.581130968 ETH', 'Txn Fee': '0.00967144'}
{'Txn Hash': '0xd57fcd1d504476c3d9daad2f5cacb5fb142aba92c9b71a1c54cdcc6f2f14e6f0', 'Method': 'Transfer', 'Block': '19334506', 'Timestamp': '2024-02-29 16:57:47', 'Age': '5 mins ago', 'From': '0x3E56a0C8...15B4f89D0', 'To': 'Tether: USDT Stablecoin', 'Value': '0 ETH', 'Txn Fee': '0.00316241'}


In [10]:
with open(TRANSACTIONS_CSV, "w", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=transactions[0].keys(), lineterminator='\n')
    writer.writeheader()
    writer.writerows(transactions)