In [2]:
import pandas as pd
import pickle

import os
# Point the Google client libraries at a service-account key file.
# setdefault keeps a value that is already exported in the environment, so the
# notebook is not tied to this developer-specific location on other machines.
# NOTE(review): absolute, user-specific path -- prefer exporting the variable
# outside the notebook and dropping this fallback.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/home/tdelatte/new-projects/ethereum-analytics/key/ethereum-analytics-309308-6c01508bc0b8.json",
)

In [3]:
from google.cloud import bigquery
# Notebook-level BigQuery client; load_data_from_bigquery reads this global.
# Built with no arguments -- presumably it picks up the credentials referenced
# by GOOGLE_APPLICATION_CREDENTIALS (TODO confirm).
client = bigquery.Client()

In [4]:
def load_data_from_bigquery(QUERY):
    """Run a SQL statement on BigQuery and return its result as a DataFrame.

    Uses the notebook-level ``client``: the query is submitted with
    ``client.query`` and the resulting job is materialised with
    ``to_dataframe()``.

    Parameters
    ----------
    QUERY : str
        SQL text to execute.

    Returns
    -------
    Whatever ``to_dataframe()`` yields for the finished query job.
    """
    return client.query(QUERY).to_dataframe()

In [5]:
# SQL that builds one row per address with its ETH balance and token activity.
#   ethereum_balance : addresses whose eth_balance / 10^18 exceeds 100
#                      (presumably wei -> ether; TODO confirm), capped at 100000
#                      rows. There is no ORDER BY, so if the cap is hit the
#                      selected subset is arbitrary.
#   top_tokens       : the 100000 token addresses with the most transfers.
#   token_balances   : double-entry book over token_transfers (+value for the
#                      receiver, -value for the sender), summed per
#                      (address, token); keeps positive balances only and
#                      excludes the all-zero address.
#   final SELECT     : per address -> MAX eth balance, number of distinct
#                      tokens, and MAX(unique_transfers) across its tokens.
# NOTE(review): balances come from `crypto_ethereum` while transfers come from
# `ethereum_blockchain` -- confirm both datasets are meant to be mixed.
QUERY = """

        WITH
          ethereum_balance AS (
          SELECT
            address AS ethereum_address,
            (eth_balance / POWER(10, 18)) AS eth_balance
          FROM
            `bigquery-public-data.crypto_ethereum.balances`
          WHERE
            (eth_balance / POWER(10, 18)) > 100
          LIMIT
            100000),
          
          top_tokens AS (
          SELECT
            token_address,
            COUNT(1) AS transfer_count
          FROM
            `bigquery-public-data.ethereum_blockchain.token_transfers` AS token_transfers
          GROUP BY
            token_address
          ORDER BY
            transfer_count DESC
          LIMIT
            100000),
         
         token_balances AS (
          WITH
            double_entry_book AS (
            SELECT
              token_address,
              to_address AS ethereum_address,
              CAST(value AS float64) AS value,
              block_timestamp,
              transaction_hash
            FROM
              `bigquery-public-data.ethereum_blockchain.token_transfers`
            UNION ALL
            SELECT
              token_address,
              from_address AS ethereum_address,
              -CAST(value AS float64) AS value,
              block_timestamp,
              transaction_hash
            FROM
              `bigquery-public-data.ethereum_blockchain.token_transfers` )
          SELECT
            a.ethereum_address,
            b.token_address,
            SUM(value) AS balance,
            COUNT(DISTINCT transaction_hash) as unique_transfers
          FROM
            ethereum_balance a
          JOIN
            double_entry_book b
          ON
            a.ethereum_address = b.ethereum_address
          JOIN
            top_tokens c
          ON
            c.token_address = b.token_address
          WHERE
            a.ethereum_address != '0x0000000000000000000000000000000000000000'
          GROUP BY
            1,
            2
          HAVING
            balance > 0 )
            
            
        SELECT
          ethereum_address,
          MAX(eth_balance) AS ether_balance,
          COUNT(DISTINCT token_address) AS unique_tokens,
          MAX(unique_transfers) AS unique_transfers
        FROM
          ethereum_balance a
        JOIN
          token_balances b
        USING
          (ethereum_address)
        GROUP BY
          1

    """

In [6]:
# Execute QUERY on BigQuery (network call) and keep the result as the base
# per-address feature table.
eth_dataset = load_data_from_bigquery(QUERY)

In [7]:
# Preview the first 10 rows returned by the query.
eth_dataset.head(10)

Unnamed: 0,ethereum_address,ether_balance,unique_tokens,unique_transfers
0,0x3f5ce5fbfe3e9af3971dd833d26ba9b5c936f0be,207125.988752,951,422637
1,0x564286362092d8e7936f0549571a803b203aaced,20414.897265,501,109404
2,0x0681d8db095565fe8a346fa0277bffde9c0edbbf,18194.648099,561,105544
3,0x000000000000000000000000000000000000dead,12526.214168,1065,7130
4,0x1062a747393198f70f71ec65a582423dba7e5ab3,305.445934,402,443465
5,0x8d12a197cb00d4747a1fe03395095ce2a5cc6819,17957.839269,9904,117342
6,0x2a0c0dbecc7e4d658f48e01e3fa353f44050c208,19334.010498,1721,54180
7,0xb8001c3ec9aa1985f6c747e25c28324e4a361ec1,1493.333208,56,26962
8,0xeee28d484628d41a82d01e21d12e2e78d69920da,327.132661,360,294350
9,0xfbb1b73c4f0bda4f67dca266ce6ef42f520fbb98,13683.606121,936,330486


In [10]:
# (rows, columns) of the query result; 4 columns are expected from the final SELECT.
eth_dataset.shape

(26294, 4)

In [8]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file --
# only load pickles that come from a trusted source.
# The context manager closes the file handle as soon as loading finishes
# (the bare open() call previously left it open).
with open("../data/external/df.p", "rb") as features_file:
    other_features = pickle.load(features_file)

In [9]:
# Align the join key name with eth_dataset so both frames share "ethereum_address".
other_features.rename(columns={"address": "ethereum_address"}, inplace=True)

In [11]:
# Inner join: keep only addresses present in both the BigQuery result and the
# externally computed feature table.
eth_dataset_all_features = eth_dataset.merge(
    other_features,
    how="inner",
    on="ethereum_address",
)

In [12]:
# Preview the merged frame: BigQuery columns followed by the external features.
eth_dataset_all_features.head(10)

Unnamed: 0,ethereum_address,ether_balance,unique_tokens,unique_transfers,outgoing_txns,incoming_txns,total_eth_sent,avg_eth_sent,total_usd_sent,avg_usd_sent,...,monthly_eth_recd,monthly_usd_recd,contracts_created,contract_txns_sent,incoming_avg_time_btwn_txns,incoming_std_time_btwn_txns,outgoing_avg_time_btwn_txns,outgoing_std_time_btwn_txns,num_tokens_used,eth_balance
0,0x3f5ce5fbfe3e9af3971dd833d26ba9b5c936f0be,207125.988752,951,422637,1539893,4642263,34038552.260613725,22.104491845,20475490000.0,13296.695869,...,2150256.562206332,1294688000.0,0,0,8.642773,366.106421,26.053777,146.503212,519,299076.571853705
1,0x564286362092d8e7936f0549571a803b203aaced,20414.897265,501,109404,615240,678,5823039.74567252,9.464663783,3502765000.0,5693.331568,...,537952.716734639,322896100.0,0,0,39191.246677,60944.567004,43.327104,1286.006034,261,22316.802499451
2,0x0681d8db095565fe8a346fa0277bffde9c0edbbf,18194.648099,561,105544,647655,727,6159822.80837053,9.510963103,3714534000.0,5735.358548,...,570567.985719681,342511700.0,0,0,36676.988981,53315.681075,41.152388,1296.313,310,28796.708749983
3,0x1062a747393198f70f71ec65a582423dba7e5ab3,305.445934,402,443465,3909,1974,41965.969,10.73573011,12622830.0,3229.171752,...,2218.526085996,661455.2,0,0,22581.909782,382395.273862,11690.055527,159279.799192,215,39.602766588
4,0xeee28d484628d41a82d01e21d12e2e78d69920da,327.132661,360,294350,53893,12809,1767690.47107525,32.800001319,495967800.0,9202.823802,...,97844.079789312,27327010.0,0,0,3342.928873,98345.97843,848.53212,22128.860782,225,279.975975298
5,0xfbb1b73c4f0bda4f67dca266ce6ef42f520fbb98,13683.606121,936,330486,5488707,235407,25691539.593409784,4.680799976,9921043000.0,1807.53744,...,75253.575290014,507883.5,13,1326134,434.158199,26845.844343,18.729696,406.728531,503,977374.381627778
6,0x0d0707963952f2fba59dd06f2b425ace40b492fe,1289.716674,587,245377,479582,492072,2437018.042751326,5.081546102,1066724000.0,2224.278521,...,314641.354177898,137594300.0,0,0,35.793619,841.981905,36.758222,91.156553,281,47503.015072597
7,0xd551234ae421e3bcba99a0da6d736074f22192ff,18430.043516,544,105056,659575,758,6328579.94042313,9.594936043,3699694000.0,5609.209408,...,584776.092066935,341223700.0,0,0,35160.766182,45676.789368,40.408411,761.332809,290,27673.806295508
8,0x4b01721f0244e7c5b5f63c20942850e447f5a5ee,119.078152,563,32577,344182,132596,233295.913862957,0.677827178,125073200.0,363.392711,...,8234.385503747,4334415.0,0,0,547.629458,20316.274963,210.980914,11439.296393,278,931.285963325
9,0xa30d8157911ef23c46c0eb71889efe6a648a41f7,761.29415,320,31655,51831,2880,604662.82149981,11.666045832,465739800.0,8985.739086,...,42213.917721753,33998810.0,0,0,11318.52935,115429.319359,700.289331,23258.894463,279,8761.294943221


In [13]:
# Remove the "eth_balance" column that arrived with the external features; the
# merged frame already has "ether_balance" from the BigQuery result.
eth_dataset_all_features.drop(columns=["eth_balance"], inplace=True)

In [26]:
# Load the address -> entity labels; later cells use its "Address" and "Entity" columns.
labels = pd.read_csv("../data/processed/eth_addresses_labels.csv")

In [29]:
# Class distribution of the raw labels.
labels["Entity"].value_counts()

DeFi           1046
Exchange        249
ICO Wallets     161
Mining          108
Dex              81
Name: Entity, dtype: int64

In [32]:
# Restrict the labels to the two classes of interest: exchanges and miners.
is_target_entity = labels["Entity"].isin(["Exchange", "Mining"])
labels = labels[is_target_entity]

In [34]:
# Class distribution after filtering; only "Exchange" and "Mining" should remain.
labels["Entity"].value_counts()

Exchange    249
Mining      108
Name: Entity, dtype: int64

In [35]:
# Use the same join key name as the feature frame.
labels.rename(columns={"Address": "ethereum_address"}, inplace=True)

In [36]:
# (rows, columns) of the filtered label table.
labels.shape

(357, 2)

In [37]:
# Left join: every feature row is kept; addresses without a label get a
# missing "Entity".
labeled_dataset = eth_dataset_all_features.merge(
    labels,
    how="left",
    on="ethereum_address",
)

In [38]:
# How many labeled addresses survived the join (unlabeled rows are not counted).
labeled_dataset["Entity"].value_counts()

Exchange    52
Mining      11
Name: Entity, dtype: int64

In [39]:
# Preview the first 50 rows of the labeled frame ("Entity" is the last column).
labeled_dataset.head(50)

Unnamed: 0,ethereum_address,ether_balance,unique_tokens,unique_transfers,outgoing_txns,incoming_txns,total_eth_sent,avg_eth_sent,total_usd_sent,avg_usd_sent,...,monthly_eth_recd,monthly_usd_recd,contracts_created,contract_txns_sent,incoming_avg_time_btwn_txns,incoming_std_time_btwn_txns,outgoing_avg_time_btwn_txns,outgoing_std_time_btwn_txns,num_tokens_used,Entity
0,0x3f5ce5fbfe3e9af3971dd833d26ba9b5c936f0be,207125.988752,951,422637,1539893,4642263,34038552.260613725,22.104491845,20475490000.0,13296.7,...,2150256.562206332,1294688000.0,0,0,8.642773,366.1064,26.05378,146.5032,519,Exchange
1,0x564286362092d8e7936f0549571a803b203aaced,20414.897265,501,109404,615240,678,5823039.74567252,9.464663783,3502765000.0,5693.332,...,537952.716734639,322896100.0,0,0,39191.25,60944.57,43.3271,1286.006,261,Exchange
2,0x0681d8db095565fe8a346fa0277bffde9c0edbbf,18194.648099,561,105544,647655,727,6159822.80837053,9.510963103,3714534000.0,5735.359,...,570567.985719681,342511700.0,0,0,36676.99,53315.68,41.15239,1296.313,310,Exchange
3,0x1062a747393198f70f71ec65a582423dba7e5ab3,305.445934,402,443465,3909,1974,41965.969,10.73573011,12622830.0,3229.172,...,2218.526085996,661455.2,0,0,22581.91,382395.3,11690.06,159279.8,215,Exchange
4,0xeee28d484628d41a82d01e21d12e2e78d69920da,327.132661,360,294350,53893,12809,1767690.47107525,32.800001319,495967800.0,9202.824,...,97844.079789312,27327010.0,0,0,3342.929,98345.98,848.5321,22128.86,225,Exchange
5,0xfbb1b73c4f0bda4f67dca266ce6ef42f520fbb98,13683.606121,936,330486,5488707,235407,25691539.593409784,4.680799976,9921043000.0,1807.537,...,75253.575290014,507883.5,13,1326134,434.1582,26845.84,18.7297,406.7285,503,Exchange
6,0x0d0707963952f2fba59dd06f2b425ace40b492fe,1289.716674,587,245377,479582,492072,2437018.042751326,5.081546102,1066724000.0,2224.279,...,314641.354177898,137594300.0,0,0,35.79362,841.9819,36.75822,91.15655,281,Exchange
7,0xd551234ae421e3bcba99a0da6d736074f22192ff,18430.043516,544,105056,659575,758,6328579.94042313,9.594936043,3699694000.0,5609.209,...,584776.092066935,341223700.0,0,0,35160.77,45676.79,40.40841,761.3328,290,Exchange
8,0x4b01721f0244e7c5b5f63c20942850e447f5a5ee,119.078152,563,32577,344182,132596,233295.913862957,0.677827178,125073200.0,363.3927,...,8234.385503747,4334415.0,0,0,547.6295,20316.27,210.9809,11439.3,278,Exchange
9,0xa30d8157911ef23c46c0eb71889efe6a648a41f7,761.29415,320,31655,51831,2880,604662.82149981,11.666045832,465739800.0,8985.739,...,42213.917721753,33998810.0,0,0,11318.53,115429.3,700.2893,23258.89,279,Exchange


In [40]:
# (rows, columns) of the labeled frame after the left join.
labeled_dataset.shape

(6007, 29)

In [41]:
from etherscan import Etherscan
# Read the Etherscan API key from the environment. .get() yields None when the
# variable is unset, in which case the client below is constructed with None.
api_key = os.environ.get("ETHERSCAN_API_KEY")
eth = Etherscan(api_key) # notebook-level Etherscan client, used by add_mined_blocks

In [42]:
# The API key must come from the environment, never from the notebook source.
# Indexing os.environ fails fast with a KeyError when ETHERSCAN_API_KEY is unset.
# NOTE(review): the key that used to be hard-coded in this cell has been
# exposed and should be revoked.
eth = Etherscan(os.environ["ETHERSCAN_API_KEY"])

In [43]:
# Create the column that add_mined_blocks fills in. 0 is the default and is
# also what remains when a lookup fails or returns nothing.
labeled_dataset["mined_blocks"] = 0

In [44]:
# Confirm the new "mined_blocks" column was appended as the last column.
labeled_dataset.head()

Unnamed: 0,ethereum_address,ether_balance,unique_tokens,unique_transfers,outgoing_txns,incoming_txns,total_eth_sent,avg_eth_sent,total_usd_sent,avg_usd_sent,...,monthly_usd_recd,contracts_created,contract_txns_sent,incoming_avg_time_btwn_txns,incoming_std_time_btwn_txns,outgoing_avg_time_btwn_txns,outgoing_std_time_btwn_txns,num_tokens_used,Entity,mined_blocks
0,0x3f5ce5fbfe3e9af3971dd833d26ba9b5c936f0be,207125.988752,951,422637,1539893,4642263,34038552.260613725,22.104491845,20475490000.0,13296.695869,...,1294688000.0,0,0,8.642773,366.106421,26.053777,146.503212,519,Exchange,0
1,0x564286362092d8e7936f0549571a803b203aaced,20414.897265,501,109404,615240,678,5823039.74567252,9.464663783,3502765000.0,5693.331568,...,322896100.0,0,0,39191.246677,60944.567004,43.327104,1286.006034,261,Exchange,0
2,0x0681d8db095565fe8a346fa0277bffde9c0edbbf,18194.648099,561,105544,647655,727,6159822.80837053,9.510963103,3714534000.0,5735.358548,...,342511700.0,0,0,36676.988981,53315.681075,41.152388,1296.313,310,Exchange,0
3,0x1062a747393198f70f71ec65a582423dba7e5ab3,305.445934,402,443465,3909,1974,41965.969,10.73573011,12622830.0,3229.171752,...,661455.2,0,0,22581.909782,382395.273862,11690.055527,159279.799192,215,Exchange,0
4,0xeee28d484628d41a82d01e21d12e2e78d69920da,327.132661,360,294350,53893,12809,1767690.47107525,32.800001319,495967800.0,9202.823802,...,27327010.0,0,0,3342.928873,98345.97843,848.53212,22128.860782,225,Exchange,0


In [45]:
def add_mined_blocks(df, client=None):
    """Fill ``mined_blocks`` with the number of blocks mined by each address.

    For every row, ``client.get_mined_blocks_by_address`` is called with the
    row's ``ethereum_address`` and the length of the result is stored in the
    ``mined_blocks`` column. Lookups are best-effort: when a call raises, the
    row is skipped and keeps its current value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``ethereum_address`` column. It is modified in place; a
        ``mined_blocks`` column initialised to 0 is added when missing.
    client : object, optional
        Anything exposing ``get_mined_blocks_by_address(address=...)``.
        Defaults to the notebook-level ``eth`` client.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    if client is None:
        client = eth  # notebook-level Etherscan client

    if "mined_blocks" not in df.columns:
        df["mined_blocks"] = 0

    # Resolve the target column by name. The previous hard-coded position 4
    # only matched "mined_blocks" in a 5-column frame; in the wide feature
    # frame it pointed at a different feature column.
    mined_col = df.columns.get_loc("mined_blocks")

    failed_lookups = 0
    # enumerate() gives true positions, so .iat stays correct even when the
    # frame's index is not 0..n-1.
    for pos, eth_address in enumerate(df["ethereum_address"]):
        if pos % 500 == 0:
            print(f"We are at the {pos}th row!")
        try:
            mined = len(client.get_mined_blocks_by_address(address=eth_address))
        except Exception:
            # Best-effort: skip this address. Unlike a bare ``except``, this
            # lets KeyboardInterrupt stop the long-running loop.
            failed_lookups += 1
            continue
        if mined:
            df.iat[pos, mined_col] = mined

    if failed_lookups:
        print(f"{failed_lookups} of {len(df)} lookups failed; mined_blocks left unchanged for those rows.")
    return df

In [46]:
# Long-running: one Etherscan request per row.
# add_mined_blocks modifies and returns the frame it receives, so
# labeled_complete_dataset and labeled_dataset refer to the same object.
labeled_complete_dataset = add_mined_blocks(labeled_dataset)

We are at the 0th row!
We are at the 500th row!
We are at the 1000th row!
We are at the 1500th row!
We are at the 2000th row!
We are at the 2500th row!
We are at the 3000th row!
We are at the 3500th row!
We are at the 4000th row!
We are at the 4500th row!
We are at the 5000th row!
We are at the 5500th row!
We are at the 6000th row!


In [50]:
# Sanity check on the distribution of mined block counts.
# NOTE(review): the recorded output shows every row at 0, i.e. no count ended
# up in this column -- verify add_mined_blocks before trusting this feature.
labeled_complete_dataset["mined_blocks"].value_counts()

0    6007
Name: mined_blocks, dtype: int64

In [48]:
# Persist the labeled frame. No index= argument is passed, so the row index is
# presumably written as an extra first column (pandas default; TODO confirm).
# NOTE(review): the last cell of the notebook writes to this same path.
labeled_complete_dataset.to_csv("../data/processed/labeled_dataset_complete.csv")

In [49]:
# Preview the frame that was just written to disk.
labeled_complete_dataset.head(10)

Unnamed: 0,ethereum_address,ether_balance,unique_tokens,unique_transfers,outgoing_txns,incoming_txns,total_eth_sent,avg_eth_sent,total_usd_sent,avg_usd_sent,...,monthly_usd_recd,contracts_created,contract_txns_sent,incoming_avg_time_btwn_txns,incoming_std_time_btwn_txns,outgoing_avg_time_btwn_txns,outgoing_std_time_btwn_txns,num_tokens_used,Entity,mined_blocks
0,0x3f5ce5fbfe3e9af3971dd833d26ba9b5c936f0be,207125.988752,951,422637,1539893,4642263,34038552.260613725,22.104491845,20475490000.0,13296.695869,...,1294688000.0,0,0,8.642773,366.106421,26.053777,146.503212,519,Exchange,0
1,0x564286362092d8e7936f0549571a803b203aaced,20414.897265,501,109404,615240,678,5823039.74567252,9.464663783,3502765000.0,5693.331568,...,322896100.0,0,0,39191.246677,60944.567004,43.327104,1286.006034,261,Exchange,0
2,0x0681d8db095565fe8a346fa0277bffde9c0edbbf,18194.648099,561,105544,647655,727,6159822.80837053,9.510963103,3714534000.0,5735.358548,...,342511700.0,0,0,36676.988981,53315.681075,41.152388,1296.313,310,Exchange,0
3,0x1062a747393198f70f71ec65a582423dba7e5ab3,305.445934,402,443465,3909,1974,41965.969,10.73573011,12622830.0,3229.171752,...,661455.2,0,0,22581.909782,382395.273862,11690.055527,159279.799192,215,Exchange,0
4,0xeee28d484628d41a82d01e21d12e2e78d69920da,327.132661,360,294350,53893,12809,1767690.47107525,32.800001319,495967800.0,9202.823802,...,27327010.0,0,0,3342.928873,98345.97843,848.53212,22128.860782,225,Exchange,0
5,0xfbb1b73c4f0bda4f67dca266ce6ef42f520fbb98,13683.606121,936,330486,5488707,235407,25691539.593409784,4.680799976,9921043000.0,1807.53744,...,507883.5,13,1326134,434.158199,26845.844343,18.729696,406.728531,503,Exchange,0
6,0x0d0707963952f2fba59dd06f2b425ace40b492fe,1289.716674,587,245377,479582,492072,2437018.042751326,5.081546102,1066724000.0,2224.278521,...,137594300.0,0,0,35.793619,841.981905,36.758222,91.156553,281,Exchange,0
7,0xd551234ae421e3bcba99a0da6d736074f22192ff,18430.043516,544,105056,659575,758,6328579.94042313,9.594936043,3699694000.0,5609.209408,...,341223700.0,0,0,35160.766182,45676.789368,40.408411,761.332809,290,Exchange,0
8,0x4b01721f0244e7c5b5f63c20942850e447f5a5ee,119.078152,563,32577,344182,132596,233295.913862957,0.677827178,125073200.0,363.392711,...,4334415.0,0,0,547.629458,20316.274963,210.980914,11439.296393,278,Exchange,0
9,0xa30d8157911ef23c46c0eb71889efe6a648a41f7,761.29415,320,31655,51831,2880,604662.82149981,11.666045832,465739800.0,8985.739086,...,33998810.0,0,0,11318.52935,115429.319359,700.289331,23258.894463,279,Exchange,0


In [27]:
# NOTE(review): the recorded output (26529, 5) does not match the 6007-row
# frame built above; it appears to come from an earlier run -- re-execute.
labeled_complete_dataset.shape

(26529, 5)

In [14]:
# Sanity check: do the addresses labeled "Mining" actually have mined_blocks > 0?

In [35]:
# NOTE(review): eth_dataset_complete is not defined anywhere in the visible
# notebook; this cell presumably relies on state from an earlier session --
# confirm before running top to bottom.
# Sets every row's Entity to the placeholder class "Unknown".
eth_dataset_complete["Entity"] = "Unknown"

In [39]:
# Preview only: the result without "Label" is displayed, not assigned.
eth_dataset_complete.drop(columns=["Label"])

Unnamed: 0,ethereum_address,ether_balance,unique_tokens,unique_transfers,mined_blocks,Entity
0,0x8d12a197cb00d4747a1fe03395095ce2a5cc6819,17960.597685,9900,117342,0,Unknown
1,0x2a0c0dbecc7e4d658f48e01e3fa353f44050c208,19345.149872,1718,54180,0,Unknown
2,0x0d0707963952f2fba59dd06f2b425ace40b492fe,443.735122,587,245377,0,Unknown
3,0xab5c66752a9e8167967685f1450532fb96d5d24f,331.840176,402,443233,0,Unknown
4,0x1ce7ae555139c5ef5a57cc8d814a867ee6ee33d8,747.701162,978,18360,0,Unknown
...,...,...,...,...,...,...
26524,0x4d5a77b869312a8e2b3daa67c01c59753f6254ae,281.630258,47,12,0,Unknown
26525,0xd2ad5f590192d227cb7e6ff15743119fe26118a0,1000.954777,47,4,0,Unknown
26526,0x603f39c81560019c8360f33ba45bc1e4caecb33e,176.012588,47,7,0,Unknown
26527,0x003e93083a2d294cb8c4421048108330c37b5874,669.846139,47,2,0,Unknown


In [45]:
# Drop the "Label" column for good by rebinding the name to the reduced frame.
eth_dataset_complete = eth_dataset_complete.drop(columns=["Label"])

In [48]:
# (rows, columns) after removing "Label".
eth_dataset_complete.shape

(26529, 6)

In [57]:
# NOTE(review): the recorded output (19137, 2) does not match the labeled_dataset
# built earlier in this notebook; it appears to come from a different session.
labeled_dataset.shape

(19137, 2)

In [59]:
# NOTE(review): labeled_dataset_complete is never defined in the visible
# notebook (the frame built above is named labeled_complete_dataset), so this
# cell presumably fails on a fresh kernel. It also writes to the same path as
# the earlier export -- confirm which frame is meant to be saved.
labeled_dataset_complete.to_csv("../data/processed/labeled_dataset_complete.csv")