In [1]:
import os
# Point the Google client libraries at a service-account key file.
# setdefault() keeps a credentials path that is already exported in the
# environment instead of overwriting it; the literal below is only a fallback.
# NOTE(review): the fallback is a machine-specific absolute path - prefer
# exporting GOOGLE_APPLICATION_CREDENTIALS before starting the notebook.
os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    "/home/tdelatte/new-projects/ethereum-analytics/key/ethereum-analytics-309308-6c01508bc0b8.json",
)

In [2]:
from google.cloud import bigquery
# Notebook-level BigQuery client; load_data_from_bigquery() issues its queries through it.
# NOTE(review): presumably authenticates via the GOOGLE_APPLICATION_CREDENTIALS
# variable set in the first cell - confirm.
client = bigquery.Client()

In [3]:
import pandas as pd

In [4]:
def load_data_from_bigquery(QUERY):
    """Run ``QUERY`` on BigQuery and return the result as a DataFrame.

    The query is submitted through the notebook-level ``client`` and the
    finished job is converted with ``to_dataframe()``.

    Args:
        QUERY: SQL text to execute.

    Returns:
        The query result converted by ``query_job.to_dataframe()``.
    """
    # client.query() submits the job (API request); to_dataframe() fetches the rows.
    return client.query(QUERY).to_dataframe()

In [6]:
# SQL sent to BigQuery; the result has one row per Ethereum address.
#   * ethereum_balance - up to 100,000 addresses whose eth_balance / 10^18
#     exceeds 100 (presumably a wei -> ether conversion).
#   * top_tokens       - up to 100,000 token addresses ranked by transfer count.
#   * token_balances   - a double-entry ledger over token_transfers (+value for
#     to_address, -value for from_address), summed per (address, token) and
#     kept only where the net balance is positive; the zero address is excluded.
# The final SELECT returns, per address: ether_balance, the number of distinct
# tokens, and the largest per-token count of distinct transfer hashes.
# NOTE(review): the LIMIT in ethereum_balance has no ORDER BY, so which
# addresses are selected is not guaranteed to be stable between runs.
# NOTE(review): balances are read from `crypto_ethereum` while transfers are
# read from `ethereum_blockchain` - confirm that mixing the two is intended.
# NOTE(review): the saved sample output shows ether_balance values below 100,
# so it was presumably produced with a different threshold - confirm.
QUERY = """

        WITH
          ethereum_balance AS (
          SELECT
            address AS ethereum_address,
            (eth_balance / POWER(10, 18)) AS eth_balance
          FROM
            `bigquery-public-data.crypto_ethereum.balances`
          WHERE
            (eth_balance / POWER(10, 18)) > 100
          LIMIT
            100000),
          
          top_tokens AS (
          SELECT
            token_address,
            COUNT(1) AS transfer_count
          FROM
            `bigquery-public-data.ethereum_blockchain.token_transfers` AS token_transfers
          GROUP BY
            token_address
          ORDER BY
            transfer_count DESC
          LIMIT
            100000),
         
         token_balances AS (
          WITH
            double_entry_book AS (
            SELECT
              token_address,
              to_address AS ethereum_address,
              CAST(value AS float64) AS value,
              block_timestamp,
              transaction_hash
            FROM
              `bigquery-public-data.ethereum_blockchain.token_transfers`
            UNION ALL
            SELECT
              token_address,
              from_address AS ethereum_address,
              -CAST(value AS float64) AS value,
              block_timestamp,
              transaction_hash
            FROM
              `bigquery-public-data.ethereum_blockchain.token_transfers` )
          SELECT
            a.ethereum_address,
            b.token_address,
            SUM(value) AS balance,
            COUNT(DISTINCT transaction_hash) as unique_transfers
          FROM
            ethereum_balance a
          JOIN
            double_entry_book b
          ON
            a.ethereum_address = b.ethereum_address
          JOIN
            top_tokens c
          ON
            c.token_address = b.token_address
          WHERE
            a.ethereum_address != '0x0000000000000000000000000000000000000000'
          GROUP BY
            1,
            2
          HAVING
            balance > 0 )
            
            
        SELECT
          ethereum_address,
          MAX(eth_balance) AS ether_balance,
          COUNT(DISTINCT token_address) AS unique_tokens,
          MAX(unique_transfers) AS unique_transfers
        FROM
          ethereum_balance a
        JOIN
          token_balances b
        USING
          (ethereum_address)
        GROUP BY
          1

    """

In [7]:
# Execute QUERY on BigQuery; eth_dataset is the working frame for the cells below.
eth_dataset = load_data_from_bigquery(QUERY)

In [8]:
eth_dataset.head(10)

Unnamed: 0,ethereum_address,ether_balance,unique_tokens,unique_transfers
0,0x6e9b2bfda8872bbf89773e78b9be39430f9a6d06,2.472982,3,2
1,0xb128b09799ec75df861de33110b724e93a77d366,4.9088,1,1
2,0x616c393cd7d6e0690c4f7fc0ca829b8cc61bc2bd,1.000846,36,2
3,0x8d618ceca1ccce08377ba17c2bc041401c209733,8.91308,9,1
4,0x4a42ca8148527d989ab962bba0a04c7e3c7fe2fd,1.020033,5,15
5,0x19b1b65f1a3f7d073c13c327ae44faa87a139dc5,31.07349,22,14
6,0x2963308fd8aa8677fa916b13b661a5e716f9c6aa,1.823519,1,1
7,0x3ebf2c174da33333d5bf9dfa40a68b0d35977392,23.672707,3,1
8,0xc568719dd395e3b3e55b7ccf7178e323698ccb99,2.999139,6,1
9,0x137dc727d786e569a29d8af5ed2ee1e414e3edb6,1.1,6,1


In [9]:
from etherscan import Etherscan
# Read the Etherscan API key from the environment (None when the variable is unset).
api_key = os.environ.get("ETHERSCAN_API_KEY")
eth = Etherscan(api_key)  # Etherscan client used by add_mined_blocks()

In [10]:
# Build the Etherscan client from an environment-provided key; secrets must
# never be written into the notebook itself.
# NOTE: a literal API key used to be embedded in this cell - treat that key as
# leaked and revoke it on Etherscan.
api_key = os.environ.get("ETHERSCAN_API_KEY")
if not api_key:
    # Fail fast: without a key every later lookup would fail.
    raise RuntimeError(
        "ETHERSCAN_API_KEY is not set; export it before running this notebook."
    )
eth = Etherscan(api_key)

In [11]:
# Start every address at zero mined blocks; add_mined_blocks() increments this column.
eth_dataset["mined_blocks"] = 0

In [12]:
eth_dataset.head()

Unnamed: 0,ethereum_address,ether_balance,unique_tokens,unique_transfers,mined_blocks
0,0x6e9b2bfda8872bbf89773e78b9be39430f9a6d06,2.472982,3,2,0
1,0xb128b09799ec75df861de33110b724e93a77d366,4.9088,1,1,0
2,0x616c393cd7d6e0690c4f7fc0ca829b8cc61bc2bd,1.000846,36,2,0
3,0x8d618ceca1ccce08377ba17c2bc041401c209733,8.91308,9,1,0
4,0x4a42ca8148527d989ab962bba0a04c7e3c7fe2fd,1.020033,5,15,0


In [36]:
# First ten rows (by position) of the full dataset, kept as a small sample.
dataset_sample = eth_dataset.iloc[:10]

In [37]:
dataset_sample.head()

Unnamed: 0,ethereum_address,ether_balance,unique_tokens,unique_transfers,mined_blocks
0,0x6e9b2bfda8872bbf89773e78b9be39430f9a6d06,2.472982,3,2,0
1,0xb128b09799ec75df861de33110b724e93a77d366,4.9088,1,1,0
2,0x616c393cd7d6e0690c4f7fc0ca829b8cc61bc2bd,1.000846,36,2,0
3,0x8d618ceca1ccce08377ba17c2bc041401c209733,8.91308,9,1,0
4,0x4a42ca8148527d989ab962bba0a04c7e3c7fe2fd,1.020033,5,15,0


In [35]:
def add_mined_blocks(df, client=None):
    """Add each address's mined-block count to ``df["mined_blocks"]``.

    For every value in ``df["ethereum_address"]`` the client is asked for the
    blocks mined by that address, and the number of returned entries is added
    to that row's ``mined_blocks`` value.

    Args:
        df: DataFrame with an ``ethereum_address`` column. A ``mined_blocks``
            column is created (filled with 0) when it is missing.
        client: Object exposing ``get_mined_blocks_by_address(address=...)``.
            Defaults to the notebook-level ``eth`` Etherscan client.

    Returns:
        The same ``df`` object, updated in place.
    """
    if client is None:
        client = eth  # notebook-level Etherscan client
    if "mined_blocks" not in df.columns:
        df["mined_blocks"] = 0

    mined_counts = []
    for eth_address in df["ethereum_address"]:
        try:
            mined = len(client.get_mined_blocks_by_address(address=eth_address))
        except Exception:
            # Best effort: a failed lookup counts as zero mined blocks.
            # NOTE(review): presumably the client raises for addresses that
            # never mined a block - confirm, and consider logging other errors.
            mined = 0
        mined_counts.append(mined)

    # Write the counts back through the frame itself (positionally); rows
    # yielded by iterrows() are copies, so mutating them would not update df.
    df["mined_blocks"] = df["mined_blocks"].to_numpy() + mined_counts
    return df

In [41]:
# Runs on the full eth_dataset (not dataset_sample); the returned frame is the cell's output.
add_mined_blocks(eth_dataset)

Unnamed: 0,ethereum_address,ether_balance,unique_tokens,unique_transfers,mined_blocks
0,0x6e9b2bfda8872bbf89773e78b9be39430f9a6d06,2.472982,3,2,0
1,0xb128b09799ec75df861de33110b724e93a77d366,4.908800,1,1,0
2,0x616c393cd7d6e0690c4f7fc0ca829b8cc61bc2bd,1.000846,36,2,0
3,0x8d618ceca1ccce08377ba17c2bc041401c209733,8.913080,9,1,0
4,0x4a42ca8148527d989ab962bba0a04c7e3c7fe2fd,1.020033,5,15,0
...,...,...,...,...,...
4583,0x67d53445193368ed9131bf8c04f4b9cd9c9a94c0,8.079077,6,1,0
4584,0x9b6cfd7bafcd5da386686e950c8282ab5f26a1fc,2.444885,3,1,0
4585,0xc1138fa3f2c059feb57a668ee10e7e83f885cf22,36.858552,10,1,0
4586,0x0d189327d350dc81982a4793ad9fcf2389e40185,19.993280,3,12,0


In [42]:
eth_dataset.head(10)

Unnamed: 0,ethereum_address,ether_balance,unique_tokens,unique_transfers,mined_blocks
0,0x6e9b2bfda8872bbf89773e78b9be39430f9a6d06,2.472982,3,2,0
1,0xb128b09799ec75df861de33110b724e93a77d366,4.9088,1,1,0
2,0x616c393cd7d6e0690c4f7fc0ca829b8cc61bc2bd,1.000846,36,2,0
3,0x8d618ceca1ccce08377ba17c2bc041401c209733,8.91308,9,1,0
4,0x4a42ca8148527d989ab962bba0a04c7e3c7fe2fd,1.020033,5,15,0
5,0x19b1b65f1a3f7d073c13c327ae44faa87a139dc5,31.07349,22,14,0
6,0x2963308fd8aa8677fa916b13b661a5e716f9c6aa,1.823519,1,1,0
7,0x3ebf2c174da33333d5bf9dfa40a68b0d35977392,23.672707,3,1,0
8,0xc568719dd395e3b3e55b7ccf7178e323698ccb99,2.999139,6,1,0
9,0x137dc727d786e569a29d8af5ed2ee1e414e3edb6,1.1,6,1,0


In [44]:
eth_dataset["mined_blocks"].value_counts()

0    4588
Name: mined_blocks, dtype: int64

In [None]:
# Load the externally sourced address labels (path is relative to the notebook's working directory).
# Later cells reference an `Entity` column and, presumably, an `Address` column - confirm the schema.
labeled_dataset = pd.read_csv("../data/external/eth_addresses.csv")

In [None]:
# Sanity check: do addresses labeled as "Miners" actually have mined_blocks != 0?



In [None]:
# Drop rows whose Entity value is the literal string "Entity"
# (presumably header rows repeated inside the CSV - confirm).
is_real_entity = labeled_dataset["Entity"] != "Entity"
labeled_dataset = labeled_dataset.loc[is_real_entity]

In [None]:
# Derive `labels` from the loaded label table, renaming the address column so it
# matches eth_dataset's join key. rename() returns a new frame, so
# labeled_dataset itself is left untouched and the cell is safe to re-run.
labels = labeled_dataset.rename(columns={'Address': 'ethereum_address'})

In [None]:
# Default label for every address queried from BigQuery.
eth_dataset["Label"] = "Unknown"

In [None]:
# Inner-join the queried addresses with their labels on the address column only.
# Naming the key explicitly prevents pandas from also joining on any other
# column name the two frames happen to share.
labeled_dataset = pd.merge(eth_dataset, labels, on="ethereum_address")

In [None]:
labeled_dataset.Entity.value_counts()

In [None]:
labeled_dataset.head(20)

In [None]:
# Manually add crowdsourced labels from Etherscan.
# Etherscan cannot be scraped: the site is behind Cloudflare protection.

