In [2]:
import os
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] ="/home/tdelatte/new-projects/ethereum-analytics/key/ethereum-analytics-309308-6c01508bc0b8.json"

from google.cloud import bigquery
client = bigquery.Client()

import pandas as pd
import pandas_gbq

In [3]:
query = f"""
        WITH
          -- current ethereum balance for 1000 non-zero balance addresses
          ethereum_balance AS (
          SELECT
            address AS ethereum_address,
            (eth_balance / POWER(10, 18)) AS eth_balance
          FROM
            `bigquery-public-data.crypto_ethereum.balances`
          WHERE
            (eth_balance / POWER(10, 18)) > 10
          LIMIT
            1000),
          top_tokens AS (
          SELECT
            token_address,
            COUNT(1) AS transfer_count
          FROM
            `bigquery-public-data.ethereum_blockchain.token_transfers` AS token_transfers
          GROUP BY
            token_address
          ORDER BY
            transfer_count DESC
          LIMIT
            1000 ),
          token_balances AS (
          WITH
            double_entry_book AS (
            SELECT
              token_address,
              to_address AS ethereum_address,
              CAST(value AS float64) AS value,
              block_timestamp,
              transaction_hash
            FROM
              `bigquery-public-data.ethereum_blockchain.token_transfers`
            UNION ALL
            SELECT
              token_address,
              from_address AS ethereum_address,
              -CAST(value AS float64) AS value,
              block_timestamp,
              transaction_hash
            FROM
              `bigquery-public-data.ethereum_blockchain.token_transfers` )
          SELECT
            a.ethereum_address,
            b.token_address,
            SUM(value) AS balance,
            COUNT(DISTINCT transaction_hash) as unique_transfers
          FROM
            ethereum_balance a
          JOIN
            double_entry_book b
          ON
            a.ethereum_address = b.ethereum_address
          JOIN
            top_tokens c
          ON
            c.token_address = b.token_address
          WHERE
            a.ethereum_address != '0x0000000000000000000000000000000000000000'
          GROUP BY
            1,
            2
          HAVING
            balance > 0 )
        SELECT
          ethereum_address,
          MAX(eth_balance) AS ether_balance,
          COUNT(DISTINCT token_address) AS unique_tokens,
          MAX(unique_transfers) as unique_transfers
        FROM
          ethereum_balance a
        JOIN
          token_balances b
        USING
          (ethereum_address)
        GROUP BY
          1
    """

In [15]:
QUERY_2 = f"""


        WITH
          -- current ethereum balance for 1000 non-zero balance addresses
          ethereum_balance AS (
          SELECT
            address AS ethereum_address,
            (eth_balance / POWER(10, 18)) AS eth_balance
          FROM
            `bigquery-public-data.crypto_ethereum.balances`
          WHERE
            (eth_balance / POWER(10, 18)) > 1000
          LIMIT
            10000),
          top_tokens AS (
          SELECT
            token_address,
            COUNT(1) AS transfer_count
          FROM
            `bigquery-public-data.ethereum_blockchain.token_transfers` AS token_transfers
          GROUP BY
            token_address
          ORDER BY
            transfer_count DESC
          LIMIT
            10000 ),
          token_balances AS (
          WITH
            double_entry_book AS (
            SELECT
              token_address,
              to_address AS ethereum_address,
              CAST(value AS float64) AS value,
              block_timestamp,
              transaction_hash
            FROM
              `bigquery-public-data.ethereum_blockchain.token_transfers`
            UNION ALL
            SELECT
              token_address,
              from_address AS ethereum_address,
              -CAST(value AS float64) AS value,
              block_timestamp,
              transaction_hash
            FROM
              `bigquery-public-data.ethereum_blockchain.token_transfers` )
          SELECT
            a.ethereum_address,
            b.token_address,
            SUM(value) AS balance,
            COUNT(DISTINCT transaction_hash) as unique_transfers
          FROM
            ethereum_balance a
          JOIN
            double_entry_book b
          ON
            a.ethereum_address = b.ethereum_address
          JOIN
            top_tokens c
          ON
            c.token_address = b.token_address
          WHERE
            a.ethereum_address != '0x0000000000000000000000000000000000000000'
          GROUP BY
            1,
            2
          HAVING
            balance > 0 )
        SELECT
          ethereum_address,
          MAX(eth_balance) AS ether_balance,
          COUNT(DISTINCT token_address) AS unique_tokens,
          MAX(unique_transfers) as unique_transfers
        FROM
          ethereum_balance a
        JOIN
          token_balances b
        USING
          (ethereum_address)
        GROUP BY
          1

    """

In [16]:
def load_data_from_bigquery(query):
    
    query_job = client.query(query) # API request
    df = query_job.to_dataframe()
    
    return df

In [17]:
train_x_raw = load_data_from_bigquery(QUERY_2)

In [30]:
train_x_raw.shape

(4330, 4)

In [32]:
train_x_raw.head()

Unnamed: 0,ethereum_address,ether_balance,unique_tokens,unique_transfers
0,0x8d12a197cb00d4747a1fe03395095ce2a5cc6819,17959.840523,3935,117342
1,0x2a0c0dbecc7e4d658f48e01e3fa353f44050c208,19367.237137,1511,54180
2,0xfbb1b73c4f0bda4f67dca266ce6ef42f520fbb98,15640.918064,759,330486
3,0xa12431d0b9db640034b0cdfceef9cce161e62be4,1027.618377,734,10548
4,0x6262998ced04146fa42253a5c0af90ca02dfd2a3,112453.183254,78,11869


In [34]:
train_x_raw.to_csv("../data/raw/train_x_raw.csv")

In [26]:
QUERY_tx_count = ''' SELECT contracts.address, COUNT(1) AS tx_count
            FROM `bigquery-public-data.crypto_ethereum.contracts` AS contracts
            JOIN `bigquery-public-data.crypto_ethereum.transactions` AS transactions ON (transactions.to_address = contracts.address)
            GROUP BY contracts.address
            ORDER BY tx_count DESC
            LIMIT 10 '''

In [27]:
dfe = load_data_from_bigquery(QUERY_tx_count)

In [None]:
other_query = 
'''WITH tot AS
    (WITH subset AS(SELECT *
                FROM `bigquery-public-data.ethereum_blockchain.transactions`
                ORDER BY block_timestamp DESC
                LIMIT 20000
                   ),

    f AS(SELECT DISTINCT from_address, count(*) as num_outgoing_txns
        FROM subset
        GROUP BY from_address),
    t AS(SELECT DISTINCT to_address, count(*) as num_incoming_txns
        FROM subset
        GROUP BY to_address)
    SELECT COALESCE(from_address,to_address) as addr,
            COALESCE(num_outgoing_txns,0) as outgoing_txns,
            COALESCE(num_incoming_txns,0) as incoming_txns
    FROM f FULL JOIN t
    ON f.from_address = t.to_address)

SELECT *, (outgoing_txns + incoming_txns) as total_txns
FROM tot
ORDER BY total_txns desc
LIMIT 50'''

In [12]:
# GCP - BigQuery
# load the service account credentials 
data_location = "/home/tdelatte/new-projects/ethereum-analytics/key/ethereum-analytics-309308-6c01508bc0b8.json"
service_account_info = pd.read_json(data_location, typ="series") 
credentials = service_account.Credentials.from_service_account_info(service_account_info)

NameError: name 'service_account' is not defined

In [None]:
train_x_raw = feature_extraction_training_instances(num_instances=10000)
train_x_raw.head()

In [None]:
def feature_extraction_training_instances(num_instances):

    # Pandas GBQ, wrapper
    # module provides a wrapper for Google’s BigQuery analytics web service

    address_features = pandas_gbq.read_gbq(query, credentials=credentials)
    return address_features