In [33]:
!python --version

Python 3.12.12


In [34]:
from google.colab import auth
auth.authenticate_user()
print('Authenticated')

Authenticated


In [35]:
from google.cloud import bigquery
from google.colab import drive
import torch
from torch_geometric.data import Data
import os
import pandas as pd

In [36]:
project_id = 'cs467-project-479503'
bq_clinet = bigquery.Client(project = project_id)

In [37]:
# wallet_to_wallet_edge_query = """
# SELECT
#     from_address,
#     to_address,
#     COUNT(*) as num_transactions,
#     SUM(value) as total_value_wei,
#     AVG(value) as avg_value_wei,
#     MIN(value) as min_value_wei,
#     MAX(value) as max_value_wei,
#     AVG(gas) as avg_gas_used,
#     AVG(gas_price) as avg_gas_price,
#     MIN(block_timestamp) as first_interaction,
#     MAX(block_timestamp) as last_interaction,
#     COUNT(DISTINCT DATE(block_timestamp)) as active_days
# FROM `bigquery-public-data.crypto_ethereum.transactions`
# WHERE block_timestamp BETWEEN '2024-01-01' AND '2024-01-31'
#     AND to_address IS NOT NULL
#     AND value > 0
# GROUP BY from_address, to_address
# HAVING num_transactions >= 2
# """

# edges_df = bq_client.query(wallet_to_wallet_edge_query).to_dataframe()
# drive.mount('/content/drive')

# output_filepath = '/content/drive/MyDrive/cs467-project/wallet_to_wallet_edges.csv'

# edges_df.to_csv(output_filepath, index=False)

In [38]:
# node_features_query = """
# WITH wallet_stats AS (
#   SELECT
#     from_address as wallet,
#     COUNT(*) as outgoing_tx_count,
#     SUM(value) as total_sent_wei,
#     AVG(value) as avg_sent_wei,
#     STDDEV(value) as stddev_sent_wei,
#     AVG(gas_price) as avg_gas_price,
#     COUNT(DISTINCT to_address) as unique_recipients,
#     COUNT(DISTINCT DATE(block_timestamp)) as active_days,
#     MIN(block_timestamp) as first_tx,
#     MAX(block_timestamp) as last_tx
#   FROM `bigquery-public-data.crypto_ethereum.transactions`
#   WHERE block_timestamp BETWEEN '2024-01-01' AND '2024-01-31'
#   GROUP BY from_address
# ),
# incoming_stats AS (
#   SELECT
#     to_address as wallet,
#     COUNT(*) as incoming_tx_count,
#     SUM(value) as total_received_wei,
#     AVG(value) as avg_received_wei,
#     COUNT(DISTINCT from_address) as unique_senders
#   FROM `bigquery-public-data.crypto_ethereum.transactions`
#   WHERE block_timestamp BETWEEN '2024-01-01' AND '2024-01-31'
#     AND to_address IS NOT NULL
#   GROUP BY to_address
# )
# SELECT
#   COALESCE(w.wallet, i.wallet) as wallet,
#   COALESCE(w.outgoing_tx_count, 0) as outgoing_tx_count,
#   COALESCE(w.total_sent_wei, 0) as total_sent_wei,
#   COALESCE(w.avg_sent_wei, 0) as avg_sent_wei,
#   COALESCE(w.stddev_sent_wei, 0) as stddev_sent_wei,
#   COALESCE(w.avg_gas_price, 0) as avg_gas_price,
#   COALESCE(w.unique_recipients, 0) as unique_recipients,
#   COALESCE(i.incoming_tx_count, 0) as incoming_tx_count,
#   COALESCE(i.total_received_wei, 0) as total_received_wei,
#   COALESCE(i.avg_received_wei, 0) as avg_received_wei,
#   COALESCE(i.unique_senders, 0) as unique_senders,
#   COALESCE(w.active_days, 0) as active_days,
#   w.first_tx,
#   w.last_tx
# FROM wallet_stats w
# FULL OUTER JOIN incoming_stats i ON w.wallet = i.wallet
# """

# edges_df = bq_client.query(node_features_query).to_dataframe()
# drive.mount('/content/drive')

# output_filepath = '/content/drive/MyDrive/cs467-project/node_features_query.csv'

# edges_df.to_csv(output_filepath, index=False)

In [45]:
wallet_to_wallet_edge_path = '/content/drive/MyDrive/cs467-project/wallet_to_wallet_edges.csv'
node_features_path = '/content/drive/MyDrive/cs467-project/node_features_query.csv'

edges_df = pd.read_csv(wallet_to_wallet_edge_path, nrows=100000)
nodes_df = pd.read_csv(node_features_path, nrows=100000)

edges_df['num_transactions'] = pd.to_numeric(edges_df['num_transactions'], errors='coerce')
edges_df['total_value_wei'] = pd.to_numeric(edges_df['total_value_wei'], errors='coerce')
edges_df['avg_value_wei'] = pd.to_numeric(edges_df['avg_value_wei'], errors='coerce')

# Fill any NaN values with 0
edges_df[['num_transactions', 'total_value_wei', 'avg_value_wei']] = \
    edges_df[['num_transactions', 'total_value_wei', 'avg_value_wei']].fillna(0)

  edges_df = pd.read_csv(wallet_to_wallet_edge_path, nrows=100000)


In [46]:
unique_wallets = pd.concat([
    edges_df['from_address'],
    edges_df['to_address']
]).unique()

wallet_to_idx = {wallet: idx for idx, wallet in enumerate(unique_wallets)}

# Build edge index
edge_index = torch.tensor([
    [wallet_to_idx[addr] for addr in edges_df['from_address']],
    [wallet_to_idx[addr] for addr in edges_df['to_address']]
], dtype=torch.long)

# Build edge attributes
edge_attr = torch.tensor(
    edges_df[['num_transactions', 'total_value_wei', 'avg_value_wei']].values,
    dtype=torch.float64
)

nodes_df.set_index('wallet', inplace=True)
nodes_dict = nodes_df.to_dict('index')

# Build node features matrix (much faster)
feature_cols = ['outgoing_tx_count', 'incoming_tx_count', 'total_sent_wei',
                'total_received_wei', 'unique_recipients', 'unique_senders', 'active_days']

node_features_list = []
for wallet in unique_wallets:
    if wallet in nodes_dict:
        features = nodes_dict[wallet]
        node_features_list.append([features[col] for col in feature_cols])
    else:
        node_features_list.append([0] * len(feature_cols))

node_features = torch.tensor(node_features_list, dtype=torch.float)

# Normalize features
node_features = (node_features - node_features.mean(dim=0)) / (node_features.std(dim=0) + 1e-8)

# Create PyG Data object
data = Data(
    x=node_features,
    edge_index=edge_index,
    edge_attr=edge_attr
)