# Data Analysis

In [7]:
import json
import pickle
import warnings

import numpy as np
import os
import pandas as pd
from tqdm import tqdm

warnings.filterwarnings('ignore')

In [8]:
# switch the working directory to project root
if os.getcwd().split('/')[-1] == 'code':
    os.chdir('..')
print(os.getcwd())

/Users/crinstaniev/Dev/ethereum/merge-data-challenge


## Validators and Status Data

In [9]:
validators_json = (json
                   .load(open('data/validators.json', 'r'))
                   .get('data'))


In [10]:
# status: eligible, activated, exit
num_epochs = 156490
mat = np.zeros(shape=(num_epochs, 4), dtype=int)

for validator in tqdm(validators_json):
    validator = validator.get('validator')
    slashed = validator.get('slashed')

    activation_eligibility_epoch = int(validator.get(
        'activation_eligibility_epoch'))
    activation_epoch = int(validator.get('activation_epoch'))
    withdrawable_epoch = int(validator.get('withdrawable_epoch'))
    exit_epoch = int(validator.get('exit_epoch'))

    payload = np.zeros(shape=(num_epochs, 4), dtype=int)

    if activation_eligibility_epoch < num_epochs:
        payload[activation_eligibility_epoch:, 0] = 1
    if activation_epoch < num_epochs:
        payload[activation_epoch:, :] = 0
        payload[activation_epoch:, 1] = 1
    if withdrawable_epoch < num_epochs:
        payload[withdrawable_epoch:, :] = 0
        payload[withdrawable_epoch:, 2] = 1
    if exit_epoch < num_epochs:
        payload[exit_epoch:, :] = 0
        payload[exit_epoch:, 3] = 1
    mat += payload


100%|██████████| 457515/457515 [03:45<00:00, 2025.70it/s]


In [11]:
status_df = pd.DataFrame(
    mat, columns=['eligible', 'activated', 'withdrawable', 'exited'])
status_df.to_csv('data/status.csv')

In [15]:
block_df = pd.read_json('data/beacon_blocks.json', lines=True)

In [17]:
block_df_clean = block_df[[
    'block_slot', 'block_epoch', 'block_timestamp', 'proposer_index', 'eth1_deposit_count', 'attestations'
]]


In [18]:
block_df_clean = block_df_clean.dropna()
block_df_clean['block_timestamp'] = block_df_clean['block_timestamp'].astype(
    'datetime64[ns]')
block_df_clean['proposer_index'] = block_df_clean['proposer_index'].astype(int)


In [19]:
def get_attestation_num(lst):
    return len(lst)

block_df_clean['attestations_cnt'] = block_df_clean['attestations'].apply(get_attestation_num)

In [20]:
def get_attester_idx(lst):
    payload = []
    for item in lst:
        payload.append(item['index'])
    return payload

block_df_clean['attester'] = block_df_clean['attestations'].apply(get_attester_idx)

In [None]:
block_df_clean.to_pickle('data/block_df.pkl')