In [2]:
import pandas as pd
import pickle

import os
# Point the Google client libraries at a service-account key file.
# setdefault() respects a GOOGLE_APPLICATION_CREDENTIALS value that is already
# exported in the environment, so the notebook also runs on machines other than
# the author's; the path below is only used as a fallback.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/home/tdelatte/new-projects/ethereum-analytics/key/ethereum-analytics-309308-6c01508bc0b8.json",
)

In [3]:
from google.cloud import bigquery
# Notebook-wide BigQuery client, used by load_data_from_bigquery().
# No arguments are passed here; presumably the credentials are picked up from the
# GOOGLE_APPLICATION_CREDENTIALS variable set in the first cell — confirm.
client = bigquery.Client()

In [4]:
def load_data_from_bigquery(QUERY, bq_client=None):
    """Run a SQL query through a BigQuery client and return the result frame.

    Parameters
    ----------
    QUERY : str
        SQL text handed to the client's ``query()`` method.
    bq_client : optional
        Object exposing ``query(sql)`` whose return value has ``to_dataframe()``.
        When omitted, the notebook-level ``client`` is used, which keeps
        existing one-argument calls working unchanged.

    Returns
    -------
    Whatever ``to_dataframe()`` returns for the submitted query job.
    """
    # The global is only looked up when no client is injected, so the function
    # can be exercised without a live BigQuery connection.
    active_client = client if bq_client is None else bq_client
    query_job = active_client.query(QUERY)  # API request
    return query_job.to_dataframe()

In [5]:
# BigQuery SQL that builds one row per address with token-related features.
#   ethereum_balance : addresses from crypto_ethereum.balances whose
#                      eth_balance / 10^18 exceeds 100 (presumably wei -> ether),
#                      capped at 100000 rows.
#   top_tokens       : token addresses ranked by number of transfers, top 100000.
#   token_balances   : "double entry" ledger (+value for to_address, -value for
#                      from_address) summed per (address, token); only positive
#                      balances are kept, with a count of distinct transaction
#                      hashes per (address, token). The zero address is excluded.
#   final SELECT     : per address, the ether balance, the number of distinct
#                      tokens held, and the MAX (not the sum) of the per-token
#                      unique_transfers counts.
# NOTE(review): the LIMIT in ethereum_balance has no ORDER BY, so the query does
#   not pin down which rows are kept if more than 100000 qualify — confirm.
# NOTE(review): balances are read from the crypto_ethereum dataset while
#   transfers are read from ethereum_blockchain — confirm both are intended.
# NOTE(review): token values are cast to float64, which may lose precision for
#   very large amounts — confirm this is acceptable for the balance > 0 test.
QUERY = """

        WITH
          ethereum_balance AS (
          SELECT
            address AS ethereum_address,
            (eth_balance / POWER(10, 18)) AS eth_balance
          FROM
            `bigquery-public-data.crypto_ethereum.balances`
          WHERE
            (eth_balance / POWER(10, 18)) > 100
          LIMIT
            100000),
          
          top_tokens AS (
          SELECT
            token_address,
            COUNT(1) AS transfer_count
          FROM
            `bigquery-public-data.ethereum_blockchain.token_transfers` AS token_transfers
          GROUP BY
            token_address
          ORDER BY
            transfer_count DESC
          LIMIT
            100000),
         
         token_balances AS (
          WITH
            double_entry_book AS (
            SELECT
              token_address,
              to_address AS ethereum_address,
              CAST(value AS float64) AS value,
              block_timestamp,
              transaction_hash
            FROM
              `bigquery-public-data.ethereum_blockchain.token_transfers`
            UNION ALL
            SELECT
              token_address,
              from_address AS ethereum_address,
              -CAST(value AS float64) AS value,
              block_timestamp,
              transaction_hash
            FROM
              `bigquery-public-data.ethereum_blockchain.token_transfers` )
          SELECT
            a.ethereum_address,
            b.token_address,
            SUM(value) AS balance,
            COUNT(DISTINCT transaction_hash) as unique_transfers
          FROM
            ethereum_balance a
          JOIN
            double_entry_book b
          ON
            a.ethereum_address = b.ethereum_address
          JOIN
            top_tokens c
          ON
            c.token_address = b.token_address
          WHERE
            a.ethereum_address != '0x0000000000000000000000000000000000000000'
          GROUP BY
            1,
            2
          HAVING
            balance > 0 )
            
            
        SELECT
          ethereum_address,
          MAX(eth_balance) AS ether_balance,
          COUNT(DISTINCT token_address) AS unique_tokens,
          MAX(unique_transfers) AS unique_transfers
        FROM
          ethereum_balance a
        JOIN
          token_balances b
        USING
          (ethereum_address)
        GROUP BY
          1

    """

In [6]:
# Execute QUERY on BigQuery; the resulting frame is the base dataset that the
# following cells extend with additional columns.
eth_dataset = load_data_from_bigquery(QUERY)

In [7]:
# Preview the first 10 rows returned by the query.
eth_dataset.head(10)

Unnamed: 0,ethereum_address,ether_balance,unique_tokens,unique_transfers
0,0x3f5ce5fbfe3e9af3971dd833d26ba9b5c936f0be,207125.988752,951,422637
1,0x564286362092d8e7936f0549571a803b203aaced,20414.897265,501,109404
2,0x0681d8db095565fe8a346fa0277bffde9c0edbbf,18194.648099,561,105544
3,0x000000000000000000000000000000000000dead,12526.214168,1065,7130
4,0x1062a747393198f70f71ec65a582423dba7e5ab3,305.445934,402,443465
5,0x8d12a197cb00d4747a1fe03395095ce2a5cc6819,17957.839269,9904,117342
6,0x2a0c0dbecc7e4d658f48e01e3fa353f44050c208,19334.010498,1721,54180
7,0xb8001c3ec9aa1985f6c747e25c28324e4a361ec1,1493.333208,56,26962
8,0xeee28d484628d41a82d01e21d12e2e78d69920da,327.132661,360,294350
9,0xfbb1b73c4f0bda4f67dca266ce6ef42f520fbb98,13683.606121,936,330486


In [41]:
from etherscan import Etherscan
# Build the Etherscan client from the ETHERSCAN_API_KEY environment variable only.
# SECURITY: this cell previously also embedded a literal API key that replaced
# the environment-based client. Treat that key as leaked and revoke/rotate it;
# it may still be present in version-control history.
api_key = os.environ.get("ETHERSCAN_API_KEY")
if not api_key:
    # Fail here with a clear message instead of letting later API calls fail.
    raise RuntimeError(
        "ETHERSCAN_API_KEY is not set; export it before running this notebook."
    )
eth = Etherscan(api_key)

In [43]:
# Start every address at 0 mined blocks; add_mined_blocks() later overwrites the
# value for addresses whose lookup returns a non-empty result.
eth_dataset["mined_blocks"] = 0

In [45]:
def add_mined_blocks(df, api=None, column="mined_blocks", log_every=500):
    """Fill ``df[column]`` with the number of mined blocks found per address.

    For every value in ``df["ethereum_address"]`` the function calls
    ``get_mined_blocks_by_address(address=...)`` and stores the length of the
    returned collection. Rows whose lookup raises, or returns an empty result,
    keep their existing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``ethereum_address`` column. It is modified in place.
    api : optional
        Object exposing ``get_mined_blocks_by_address(address=...)``.
        Defaults to the notebook-level ``eth`` client.
    column : str
        Name of the column to fill; created with 0 if it does not exist yet.
    log_every : int
        Print a progress line every ``log_every`` rows (0 disables it).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient reassignment.
    """
    lookup = eth if api is None else api
    if column not in df.columns:
        df[column] = 0
    # Resolve the target column by name rather than by a hard-coded position,
    # so the function does not depend on the frame's column order.
    col_pos = df.columns.get_loc(column)
    skipped = 0
    # enumerate() yields true row positions, which is what .iat expects; index
    # labels (as returned by iterrows) only match positions for a 0..n-1 index.
    for pos, eth_address in enumerate(df["ethereum_address"]):
        if log_every and pos % log_every == 0:
            print(f"We are at the {pos}th row!")
        try:
            mined = len(lookup.get_mined_blocks_by_address(address=eth_address))
        except Exception:
            # Best effort: keep going on per-address failures, but let
            # KeyboardInterrupt/SystemExit through so the loop can be stopped.
            skipped += 1
            continue
        if mined:
            df.iat[pos, col_pos] = mined
    if skipped:
        print(f"{skipped} lookups raised and were left unchanged.")
    return df

In [46]:
# Fill the mined_blocks column with one lookup per row; progress is printed
# every 500 rows (see add_mined_blocks).
eth_dataset = add_mined_blocks(eth_dataset)

We are at the 0th row!
We are at the 500th row!
We are at the 1000th row!
We are at the 1500th row!
We are at the 2000th row!
We are at the 2500th row!
We are at the 3000th row!
We are at the 3500th row!
We are at the 4000th row!
We are at the 4500th row!
We are at the 5000th row!
We are at the 5500th row!
We are at the 6000th row!


In [48]:
# index=False keeps the row index out of the file. The pd.read_csv call that
# reloads this file does not set index_col, so a written index would come back
# as an extra "Unnamed: 0" column and leak into the merged feature table.
eth_dataset.to_csv("../data/processed/eth_dataset.csv", index=False)

In [51]:
# Reload the processed dataset from disk.
# NOTE(review): this replaces the in-memory eth_dataset with whatever the file
# currently contains — confirm the file was produced by the cells above.
eth_dataset = pd.read_csv("../data/processed/eth_dataset.csv")

In [52]:
# Preview the reloaded frame (first 5 rows).
eth_dataset.head()

Unnamed: 0,ethereum_address,ether_balance,unique_tokens,unique_transfers,mined_blocks
0,0x8d12a197cb00d4747a1fe03395095ce2a5cc6819,17960.597685,9900,117342,0
1,0x2a0c0dbecc7e4d658f48e01e3fa353f44050c208,19345.149872,1718,54180,0
2,0x0d0707963952f2fba59dd06f2b425ace40b492fe,443.735122,587,245377,0
3,0xab5c66752a9e8167967685f1450532fb96d5d24f,331.840176,402,443233,0
4,0x1ce7ae555139c5ef5a57cc8d814a867ee6ee33d8,747.701162,978,18360,0


In [53]:
# SECURITY: unpickling can execute arbitrary code, so only load this file if it
# comes from a trusted source.
# The context manager closes the file handle deterministically; the previous
# bare open() call never closed it explicitly.
with open("../data/external/df.p", "rb") as pickle_file:
    other_features = pickle.load(pickle_file)

In [55]:
# Give the key column the same name as in eth_dataset so both frames can be
# merged on "ethereum_address".
other_features = other_features.rename(columns={"address": "ethereum_address"})

In [56]:
# Remove the eth_balance column from the external features (presumably because
# eth_dataset already carries ether_balance — confirm).
# errors="ignore" plus reassignment makes the cell safe to re-run: the in-place
# drop raised KeyError once the column had already been removed.
other_features = other_features.drop(columns=["eth_balance"], errors="ignore")

In [57]:
# Preview the external feature table after the rename and drop above.
other_features.head()

Unnamed: 0,ethereum_address,outgoing_txns,incoming_txns,total_eth_sent,avg_eth_sent,total_usd_sent,avg_usd_sent,total_eth_recd,avg_eth_recd,total_usd_recd,...,monthly_usd_sent,monthly_eth_recd,monthly_usd_recd,contracts_created,contract_txns_sent,incoming_avg_time_btwn_txns,incoming_std_time_btwn_txns,outgoing_avg_time_btwn_txns,outgoing_std_time_btwn_txns,num_tokens_used
0,0xd26a4d3ce34eef62a5eacc1f07b6e4ed11d0d516,256,151,19.7749257,0.077245804,6419.212,25.075046,77.681603434,0.514447705,18931.96,...,917.0303,11.097371919,2704.566,0,0,100093.9,254570.169509,59336.572549,213155.856128,30
1,0x5730d1ea8624b745dde30e57d7f6e8f83fcec98a,256,56,3651.334520803,14.263025472,1607194.0,6278.101788,2706.690819018,48.333764625,1378033.0,...,94540.83,159.217107001,81060.79,0,0,639384.018182,857951.59299,156710.439216,341322.129482,24
2,0x1efc1e054f14aa9ab69e71c8f181d4b8f4005735,256,104,14800.887683358,57.815967513,7144330.0,27907.538523,16843.009974694,161.952018987,7774533.0,...,420254.7,990.765292629,457325.4,0,0,208644.184466,621665.577772,102012.976471,573323.712422,27
3,0xfd3a935174aeb79b8d5d3935de1188e37427561f,768,892,23397.465171236,30.465449442,146331.0,190.535165,27432.50932407,30.75393422,186364.6,...,3954.892,741.419170921,5036.881,3,116,69598.839506,880138.733789,33976.006519,129262.257907,11
4,0xf27b5cf6d40531556f6e6eeb445a8c88c8f9815f,1024,9860,91026.755918763,88.893316327,53629610.0,52372.662163,92880.306328356,9.419909364,55398820.0,...,4125354.0,7144.638948335,4261448.0,0,0,3014.540521,39706.386044,28969.567937,199625.182527,15


In [58]:
# Inner join on the address: only addresses present in both frames are kept.
eth_dataset_all_features = eth_dataset.merge(
    other_features,
    on="ethereum_address",
    how="inner",
)

In [59]:
# Preview the first 10 rows of the merged feature table.
eth_dataset_all_features.head(10)

Unnamed: 0,ethereum_address,ether_balance,unique_tokens,unique_transfers,mined_blocks,outgoing_txns,incoming_txns,total_eth_sent,avg_eth_sent,total_usd_sent,...,monthly_usd_sent,monthly_eth_recd,monthly_usd_recd,contracts_created,contract_txns_sent,incoming_avg_time_btwn_txns,incoming_std_time_btwn_txns,outgoing_avg_time_btwn_txns,outgoing_std_time_btwn_txns,num_tokens_used
0,0x0d0707963952f2fba59dd06f2b425ace40b492fe,443.735122,587,245377,0,479582,492072,2437018.042751326,5.081546102,1066724000.0,...,133340500.0,314641.354177898,137594300.0,0,0,35.793619,841.9819,36.75822,91.15655,281
1,0x6cc5f688a315f3dc28a7781717a9a798a59fda7b,1031.186386,865,472190,0,392467,312401,5418637.39363095,13.806606399,2131017000.0,...,213101700.0,556867.74055041,225075700.0,0,0,77.365013,6066.634,53.00636,469.834,341
2,0x564286362092d8e7936f0549571a803b203aaced,23892.712593,502,109404,0,615240,678,5823039.74567252,9.464663783,3502765000.0,...,318433200.0,537952.716734639,322896100.0,0,0,39191.246677,60944.57,43.3271,1286.006,261
3,0x0016eccecffc25b94050187017eb59fa05c029aa,126.407467,54,6180,0,2998,481,4479.533394411,1.494173914,1205682.0,...,172240.3,745.774408214,251683.2,0,0,32761.102083,116946.6,5212.212,19082.18,40
4,0xbe708d227f6dfa0b8f2698bf543b949dfe4e28fb,269.029806,202,1462,0,10164,243,20771.819829851,2.043665863,4969486.0,...,621185.8,38.314152287,13701.81,0,0,77338.57438,178129.8,1845.206,6596.558,166
5,0x9b77ab003d44b9b9cb47fa6a00276a23c05b49a5,2089.859796,54,3,0,5,108,60.85,12.17,10071.31,...,335.7105,68.014756625,24265.93,0,0,687237.691589,2316083.0,7132346.0,8472997.0,32
6,0x0681d8db095565fe8a346fa0277bffde9c0edbbf,11252.425439,561,105544,0,647655,727,6159822.80837053,9.510963103,3714534000.0,...,337684900.0,570567.985719681,342511700.0,0,0,36676.988981,53315.68,41.15239,1296.313,310
7,0x1062a747393198f70f71ec65a582423dba7e5ab3,326.124385,404,443465,0,3909,1974,41965.969,10.73573011,12622830.0,...,664359.6,2218.526085996,661455.2,0,0,22581.909782,382395.3,11690.06,159279.8,215
8,0xeee28d484628d41a82d01e21d12e2e78d69920da,347.253282,357,294350,0,53893,12809,1767690.47107525,32.800001319,495967800.0,...,26103570.0,97844.079789312,27327010.0,0,0,3342.928873,98345.98,848.5321,22128.86,225
9,0xfbb1b73c4f0bda4f67dca266ce6ef42f520fbb98,16018.851241,939,330486,0,5488707,235407,25691539.593409784,4.680799976,9921043000.0,...,248026100.0,75253.575290014,507883.5,13,1326134,434.158199,26845.84,18.7297,406.7285,503


In [60]:
# (rows, columns) of the merged table, i.e. how many addresses survived the inner join.
eth_dataset_all_features.shape

(6057, 29)

In [61]:
# Persist the merged feature table.
# NOTE(review): index is not disabled, so the row index is written as the first
# (unnamed) CSV column — confirm downstream readers expect that.
eth_dataset_all_features.to_csv("../data/processed/eth_dataset_all_features.csv")

In [62]:
# Load the address labels; the cells below use its "Address" and "Entity" columns.
labels = pd.read_csv("../data/processed/eth_addresses_labels.csv")

In [63]:
# Number of labeled addresses per entity type.
labels["Entity"].value_counts()

DeFi           1046
Exchange        249
ICO Wallets     161
Mining          108
Dex              81
Name: Entity, dtype: int64

In [64]:
# Keep only the two entity classes of interest: exchanges and miners.
is_target_entity = labels["Entity"].isin(["Exchange", "Mining"])
labels = labels.loc[is_target_entity]

In [65]:
# Class balance after filtering to the "Exchange" and "Mining" labels.
labels["Entity"].value_counts()

Exchange    249
Mining      108
Name: Entity, dtype: int64

In [66]:
# Rename the key column so that it matches the feature table's "ethereum_address".
labels = labels.rename(columns={'Address': 'ethereum_address'})

In [67]:
# (rows, columns) of the filtered label table.
labels.shape

(357, 2)

In [68]:
# Left join: every feature row is kept, and addresses without a label get a
# missing value in "Entity".
# NOTE(review): the join is an exact string match on the address — confirm that
# the label file uses the same letter case as the feature table.
labeled_dataset = eth_dataset_all_features.merge(
    labels,
    on="ethereum_address",
    how="left",
)

In [71]:
# Sanity check: do the addresses labeled "Mining" actually have a non-zero
# mined_blocks count? Displays every row whose Entity is "Mining".
labeled_dataset[labeled_dataset["Entity"] == "Mining"]

Unnamed: 0,ethereum_address,ether_balance,unique_tokens,unique_transfers,mined_blocks,outgoing_txns,incoming_txns,total_eth_sent,avg_eth_sent,total_usd_sent,...,monthly_eth_recd,monthly_usd_recd,contracts_created,contract_txns_sent,incoming_avg_time_btwn_txns,incoming_std_time_btwn_txns,outgoing_avg_time_btwn_txns,outgoing_std_time_btwn_txns,num_tokens_used,Entity
115,0x52bc44d5378309ee2abf1539bf71de1b7d7be3b5,2767.258964,168,35,1167990,9785688,368,3291571.27333116,0.336365851,685087900.0,...,12.923259944,482.5421,0,0,277029.1,523923.0,10.423864,383.733773,82,Mining
187,0x829bd824b016326a401d083b33d092293333a830,7768.269522,121,40,1125443,5984353,4050,1875938.077130069,0.313473834,940635000.0,...,367.38322691,129350.9,43,639507,10176.63,127707.2,7.319252,700.582907,81,Mining
455,0xea674fdde714fd979de3edf0f56aa9716b898ec8,1063.450229,160,80,2385967,15678630,359,4634177.920104296,0.295572886,1335740000.0,...,2797.628544548,1903793.0,0,0,222955.6,715767.1,5.31971,39.078584,80,Mining
644,0x2a65aca4d5fc5b5c859090a6c34d164135398226,2344.313238,97,10,940219,3713483,28,4397286.82471517,1.184140825,229450100.0,...,154.664224767,21580.24,0,0,3225466.0,3814541.0,26.752539,456.952387,53,Mining
2691,0xeea5b82b61424df8020f5fedd81767f2d0d25bfb,934.258001,16,2,44999,2839,4,3382.781790629,1.191539905,751800.1,...,150.0,43165.5,0,0,1609701.0,2785560.0,2899.110994,15381.42107,2,Mining
2807,0x63a9975ba31b0b9626b34300f7f627147df1f526,332.567136,17,1,55651,272369,56,311171.25462329,1.142462081,1086351.0,...,411.53648586,389.3337,0,0,156703.3,670807.6,92.767076,1533.796694,10,Mining
3166,0x04668ec2f57cc15c381b461b9fedab5d451c8f7f,4251.593124,18,10,259598,1821,8,1317.784856895,0.723659998,276241.5,...,37.332666667,7861.293,0,0,510037.7,407912.6,3028.657143,15230.350982,1,Mining
4198,0x99c85bb64564d9ef9a99621301f22c9993cb89e3,1063.295273,25,2,61805,20646,7,8636.52463117,0.418314668,2654729.0,...,3.410722778,1078.861,0,0,3233460.0,4309138.0,947.892516,9015.707835,9,Mining
5436,0x00192fb10df37c9fb26829eb2cc623cd1bf599e8,1053.013086,39,4,54078,213100,4,15110.188501737,0.070906563,6214369.0,...,2.178793321,1977.572,0,0,3858662.0,2920099.0,121.435089,1271.616659,19,Mining
5746,0x4bb96091ee9d802ed039c4d1a5f6216f90f81b01,116.998051,42,3,282304,282066,29,1343834.80451828,4.764256608,91038810.0,...,0.773502665,28.0813,0,0,3261389.0,4797708.0,351.099654,649.278458,26,Mining


In [72]:
# (rows, columns) after the left join; the row count should equal that of
# eth_dataset_all_features unless an address appears more than once in labels.
labeled_dataset.shape

(6057, 30)

In [73]:
# Persist the labeled dataset.
# NOTE(review): index is not disabled, so the row index is written as the first
# (unnamed) CSV column — confirm downstream readers expect that.
labeled_dataset.to_csv("../data/processed/labeled_dataset.csv")