# Part I: Set-up

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive/Staking

/content/drive/MyDrive/Staking


In [3]:
import json
import pickle

import numpy as np
import pandas as pd
from tqdm import tqdm

# Part II: Process the Validator Data


In [6]:
validators_json = (json
                   .load(open('data/validators.json', 'r'))
                   .get('data'))

In [8]:
# status: eligible, activated, exit
num_epochs = 156490 #beacon chain epoch start with number 0: https://ethscan.org/epochs
mat = np.zeros(shape=(num_epochs, 4), dtype=int) # for each epoch, identify the validators in 4 different states: 'eligible', 'activated', 'withdrawable', 'exited'

for validator in tqdm(validators_json):
    validator = validator.get('validator')
    slashed = validator.get('slashed')

    activation_eligibility_epoch = int(validator.get(
        'activation_eligibility_epoch'))
    activation_epoch = int(validator.get('activation_epoch'))
    withdrawable_epoch = int(validator.get('withdrawable_epoch'))
    exit_epoch = int(validator.get('exit_epoch'))

    payload = np.zeros(shape=(num_epochs, 4), dtype=int) #for each validator, add +1 to the validator state it belongs for each epoch

    if activation_eligibility_epoch < num_epochs:
        payload[activation_eligibility_epoch:, 0] = 1 # the validator has been eligible since the activation_eligibility_epoch
    if activation_epoch < num_epochs: 
        payload[activation_epoch:, :] = 0 #the validator's eligible status end starting after activated since the activation_epoch
        payload[activation_epoch:, 1] = 1 #the validator has been activiated since the activation_epoch
    if withdrawable_epoch < num_epochs: 
        payload[withdrawable_epoch:, :] = 0 #the validator's activated status end after withdrawed since the withdrawable epoch
        payload[withdrawable_epoch:, 2] = 1 #the validator has been withdrawable since the withdrawable_epoch
    if exit_epoch < num_epochs: 
        payload[exit_epoch:, :] = 0 #the validator's withdrawable status end after exited since the exit_epoch
        payload[exit_epoch:, 3] = 1 #the validator has been exited since the exit_epoch
    mat += payload


100%|██████████| 533448/533448 [26:18<00:00, 337.99it/s]


In [9]:
status_df = pd.DataFrame(
    mat, columns=['eligible', 'activated', 'withdrawable', 'exited'])

In [10]:
status_df.head()

Unnamed: 0,eligible,activated,withdrawable,exited
0,0,21063,0,0
1,0,21063,0,0
2,0,21063,0,0
3,0,21063,0,0
4,0,21063,0,0


In [11]:
status_df.tail()

Unnamed: 0,eligible,activated,withdrawable,exited
156485,304,452218,0,917
156486,298,452224,0,917
156487,292,452230,0,917
156488,286,452236,0,917
156489,280,452242,0,917


In [12]:
status_df.to_csv('data/status.csv')

# Part III: Process the Beacon Chain Block Data

Note: need high-RAM runtime

In [4]:
block_df = pd.read_json('output/beacon_blocks.json', lines=True)

In [5]:
block_df.head()

Unnamed: 0,item_type,block_slot,block_epoch,block_timestamp,proposer_index,skipped,block_root,parent_root,state_root,randao_reveal,graffiti,eth1_block_hash,eth1_deposit_root,eth1_deposit_count,signature,attestations,deposits,proposer_slashings,attester_slashings,voluntary_exits
0,beacon_block,4680002,146250,2022-09-12T12:00:47Z,373276.0,False,,0xe9442499af2b521c7b53ff476d01436c967ffa956278...,0x490ba5390836bd6e956c62e19e3421185d6bff689a81...,0x902597f35320a2ae53e99a8fac3fd1d414e8070d1825...,0x4c69676874686f7573652f76332e312e302d61613032...,0x9a14d9a48d6ccdc215ddbf1103e91b20a3bb0513614e...,0x309bd2edaa17edf01b0f6423ef04ff932770509e4ac8...,434383.0,,"[{'item_type': 'attestation', 'aggregation_bit...",[],[],[],[]
1,beacon_block,4680004,146250,2022-09-12T12:01:11Z,211305.0,False,,0xa9d740adfb11d9b61e2923fbb4446d5982ba57aafb86...,0xd2c5de3d788cc500895422325f492b8833a3e42bcf90...,0xaf06cc5a0caa43565bcb3eef4287a9124100d13a7a70...,0x4b696c6e000000000000000000000000000000000000...,0x9a14d9a48d6ccdc215ddbf1103e91b20a3bb0513614e...,0x309bd2edaa17edf01b0f6423ef04ff932770509e4ac8...,434383.0,,"[{'item_type': 'attestation', 'aggregation_bit...",[],[],[],[]
2,beacon_block,4680000,146250,2022-09-12T12:00:23Z,41984.0,False,,0x4489d44916f6e2208f6432f83c327f2955135d476f5a...,0x5a936c12d2fe860ed9fdfb26c9c9dd9205af624921c1...,0x82d7f61f57d390bd4dc2daa42975614a09aa0c0eabe6...,0x68756f6269706f6f6c00000000000000000000000000...,0x9a14d9a48d6ccdc215ddbf1103e91b20a3bb0513614e...,0x309bd2edaa17edf01b0f6423ef04ff932770509e4ac8...,434383.0,,"[{'item_type': 'attestation', 'aggregation_bit...",[],[],[],[]
3,beacon_block,4680001,146250,2022-09-12T12:00:35Z,179498.0,False,,0xd899e67eec06d604933d83815f89a7b7daa92098b8f0...,0x246c6e1a94d1654f6d4ddfd84477c1f4c6dec7c3682a...,0x9330ca45a6ccf2cc62067a5b79eac4c8ef338723f34a...,0x00000000000000000000000000000000000000000000...,0x9a14d9a48d6ccdc215ddbf1103e91b20a3bb0513614e...,0x309bd2edaa17edf01b0f6423ef04ff932770509e4ac8...,434383.0,,"[{'item_type': 'attestation', 'aggregation_bit...",[],[],[],[]
4,beacon_block,4680003,146250,2022-09-12T12:00:59Z,226424.0,False,,0xef0660af1f8de5a5d0b5f025228a30f12f52a367af87...,0xd0f4b324b67493eec5655c8865f0926cf89509e1990f...,0xae31a0ffbc5b42e62bdce10e719664aae06898cb5707...,0x00000000000000000000000000000000000000000000...,0x9a14d9a48d6ccdc215ddbf1103e91b20a3bb0513614e...,0x309bd2edaa17edf01b0f6423ef04ff932770509e4ac8...,434383.0,,"[{'item_type': 'attestation', 'aggregation_bit...",[],[],[],[]


In [6]:
block_df_clean = block_df[[
    'block_slot', 'block_epoch', 'block_timestamp', 'proposer_index', 'eth1_deposit_count', 'attestations'
]]

In [7]:
block_df_clean = block_df_clean.dropna()
block_df_clean['block_timestamp'] = block_df_clean['block_timestamp'].astype(
    'datetime64[ns]')
block_df_clean['proposer_index'] = block_df_clean['proposer_index'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df_clean['block_timestamp'] = block_df_clean['block_timestamp'].astype(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block_df_clean['proposer_index'] = block_df_clean['proposer_index'].astype(int)


In [17]:
pd.set_option('display.max_colwidth', None)
print(block_df_clean ["attestations"].head(1))

0    [{'item_type': 'attestation', 'aggregation_bits': '1111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111', 'slot': 4680001, 'index': 10, 'beacon_block_root': '0xe9442499af2b521c7b53ff476d01436c967ffa956278dbe7baed5cff34d4a555', 'source_epoch': 146249, 'source_root': '0x13a52c9d5322d0492d9842bad66748d9773779d6697fd55f9eb1b261b24908dc', 'target_epoch': 146250, 'target_root': '0xd899e67eec06d604933d83815f89a7b7daa92098b8f0f8a2808070baf186f5b9', 'signature': '0x87cae57a4887d7187393ab6946a5c99e288fc90346011f71e04451b909855a712a7fa8c8905087717eb8a254a52130621758ff9e623e77315b3f8c6219d9fddd1b16233c8ea39d8625e780e7478cc690eff03cf096c4280f292e917836b40139'}, {'item_type': 'attestation', 'aggregation_bits': '111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111

In [8]:
def get_attestation_num(lst):
    return len(lst)

block_df_clean['attestations_cnt'] = block_df_clean['attestations'].apply(get_attestation_num)

In [9]:
def get_attester_idx(lst):
    payload = []
    for item in lst:
        payload.append(item['index'])
    return payload

block_df_clean['attester'] = block_df_clean['attestations'].apply(get_attester_idx)

In [16]:
block_df_clean.head()

Unnamed: 0,block_slot,block_epoch,block_timestamp,proposer_index,eth1_deposit_count,attestations,attestations_cnt,attester
0,4680002,146250,2022-09-12 12:00:47,373276,434383.0,"[{'item_type': 'attestation', 'aggregation_bit...",78,"[10, 11, 21, 49, 6, 29, 3, 63, 5, 36, 42, 9, 3..."
1,4680004,146250,2022-09-12 12:01:11,211305,434383.0,"[{'item_type': 'attestation', 'aggregation_bit...",94,"[19, 28, 44, 61, 31, 0, 21, 30, 16, 58, 5, 26,..."
2,4680000,146250,2022-09-12 12:00:23,41984,434383.0,"[{'item_type': 'attestation', 'aggregation_bit...",95,"[35, 19, 42, 49, 13, 43, 44, 7, 37, 34, 26, 50..."
3,4680001,146250,2022-09-12 12:00:35,179498,434383.0,"[{'item_type': 'attestation', 'aggregation_bit...",128,"[49, 59, 36, 34, 55, 60, 62, 9, 4, 54, 39, 3, ..."
4,4680003,146250,2022-09-12 12:00:59,226424,434383.0,"[{'item_type': 'attestation', 'aggregation_bit...",128,"[63, 42, 41, 54, 0, 46, 52, 57, 38, 26, 36, 33..."


In [18]:
block_df_clean.to_pickle('data/block_df.pkl')