## Extract ENS Data from BigQuery and Ethereum node

In [1]:
import json
import pandas as pd
import numpy as np
from ens import ENS
from web3 import Web3, HTTPProvider
from tqdm.notebook import tqdm
import ipfshttpclient
import content_hash
import multicodec
import multihash
from IPython.display import display, HTML

from config import ETH_URL
from ens_data import get_ens_data
from src.utils_bigquery import get_df


EXTRACT_NAMES = False
AGGREGATE_ENS_DATA = False

tqdm.pandas()

w3 = Web3(HTTPProvider(ETH_URL))
ns = ENS.fromWeb3(w3)
ipfs_client = ipfshttpclient.connect()

In [2]:
def display_df(df: pd.DataFrame, show_index: bool = False) -> None:
    display(HTML(df.to_html(index=show_index, notebook=True, show_dimensions=False)))

In [3]:
# 'ENS': '0x57f1887a8bf19b14fc0df6fd9b2acc9af147ea85'

### Get ENS Names from Blockchain ETL datasets (BigQuery)

In [4]:
query_1 = f'''
    SELECT DISTINCT LOWER(name) as name
    FROM (
        SELECT name
        FROM `blockchain-etl.ethereum_ens.ShortNameAuctionController_event_NameRegistered`

        UNION ALL

        SELECT name
        FROM `blockchain-etl.ethereum_ens.ETHRegistrarController_event_NameRegistered`

        UNION ALL

        SELECT name
        FROM `blockchain-etl.ethereum_ens.ETHRegistrarController2_event_NameRegistered`

        UNION ALL

        SELECT name
        FROM `blockchain-etl.ethereum_ens.ETHRegistrarController3_event_NameRegistered`
    )
'''
if EXTRACT_NAMES:
    ens_names_df = get_df(query=query_1)
    ens_names_df.to_csv('data/ens_names.csv')
else:
    ens_names_df = pd.read_csv('data/ens_names.csv')
print(f'Total names: {len(ens_names_df):>,}')

Total names: 2,565,489


In [5]:
ens_names_list = ens_names_df.name.to_list()
ens_names_list[:10]

['thrawn',
 'danielhoang',
 'edwin2',
 'girardperregaux',
 'ardaghgroup',
 'transped',
 'degenreactor',
 'jokerkong',
 'sco',
 'ethism']

### ENS Data Extraction from Ethereum node
Use `extract_ens_data_from_ethereum_node.py` for getting ENS Data for names from Ethereum node.

### Aggregate ENS Data

In [6]:
if AGGREGATE_ENS_DATA:
    ens_data_df = None
    for i in tqdm(range(253)):
        try:
            with open(f'data/temp/ens_data_{i}.csv') as my_csv:
                ens_data_item_df = pd.read_csv(my_csv, index_col=0)
            if ens_data_df is None:
                ens_data_df = ens_data_item_df
            else:
                ens_data_df = ens_data_df.append(ens_data_item_df)
        except FileNotFoundError:
            print(f'Error {i} not found')
    ens_data_df.name = ens_data_df.name.map(lambda x: x.lower())
    ens_data_df = ens_data_df.drop_duplicates()
    ens_data_df = ens_data_df.sort_values('name').reset_index()
    ens_data_df = ens_data_df[['name', 'owner', 'resolver_address', 'associated_address', 'contenthash']]
    ens_data_df.to_csv('data/ens_data_row.csv')
else:
    ens_data_df = pd.read_csv('data/ens_data_row.csv', index_col=0)
display_df(ens_data_df.head(20))

name,owner,resolver_address,associated_address,contenthash
0.eth,0x03B7bA1730170e6120529902479A026f7CDB60C6,,,
8.eth,0x00000000c9c32755D496d3F13DcDf06A436C9C7a,0x4976fb03C32e5B8cfe2b6cCB31c09Ba78EBaBa41,0x00000000c9c32755D496d3F13DcDf06A436C9C7a,
6.eth,0x00000000c9c32755D496d3F13DcDf06A436C9C7a,0x4976fb03C32e5B8cfe2b6cCB31c09Ba78EBaBa41,0x00000000c9c32755D496d3F13DcDf06A436C9C7a,
8.eth,0x00000000c9c32755D496d3F13DcDf06A436C9C7a,0x4976fb03C32e5B8cfe2b6cCB31c09Ba78EBaBa41,0x00000000c9c32755D496d3F13DcDf06A436C9C7a,
0.eth,0x94567Fa65d284b51CE7832B3CD2682a2277dc964,0x4976fb03C32e5B8cfe2b6cCB31c09Ba78EBaBa41,0x94567Fa65d284b51CE7832B3CD2682a2277dc964,
8.eth,0x7c6d5cf17D05fDD70fB051366a474F7b422d86a6,0x4976fb03C32e5B8cfe2b6cCB31c09Ba78EBaBa41,0x7c6d5cf17D05fDD70fB051366a474F7b422d86a6,
.eth,0x30a0063FfBb27C11FFD22Dd5321E8B07b259083d,,,
0.eth,0xf8d06F8338abA317afc9b55dF03D41Bf71f927Dd,0x4976fb03C32e5B8cfe2b6cCB31c09Ba78EBaBa41,0xf8d06F8338abA317afc9b55dF03D41Bf71f927Dd,
6969.eth,0x00000000c9c32755D496d3F13DcDf06A436C9C7a,0x4976fb03C32e5B8cfe2b6cCB31c09Ba78EBaBa41,0x00000000c9c32755D496d3F13DcDf06A436C9C7a,
8.eth,0x4a083475Fd0BfB1eBB05d03D29794bc64696F3AF,0x4976fb03C32e5B8cfe2b6cCB31c09Ba78EBaBa41,0x4a083475Fd0BfB1eBB05d03D29794bc64696F3AF,


### Get items with content hashes

In [7]:
def decode_contenthash(contenthash: str, extract_ipns: bool = False) -> str:
    try:
        if contenthash[:2] == 'e3':
            content_cid = content_hash.decode(contenthash)
            return content_cid if len(content_cid) == 46 else ''
        elif extract_ipns and contenthash[:2] == 'e5' and len(contenthash) == 84:
            ipns_address = content_hash.decode(contenthash)
            return ipfs_client.resolve('/ipns/' + ipns_address, reversed=True)
        # elif contenthash[:2] == 'e5':
        #     _buffer = multihash.from_hex_string(contenthash)
        #     return multicodec.remove_prefix(_buffer)
        else:
            return ''
    except (ValueError, ipfshttpclient.exceptions.ErrorResponse):
        return ''


if AGGREGATE_ENS_DATA:
    ens_data_with_contenthash_df = ens_data_df[
        (~ens_data_df.contenthash.isna()) & (~ens_data_df.contenthash.isin(["b''"]))].copy().sort_index()
    ens_data_with_contenthash_df.loc[:, 'content_cid'] = ens_data_with_contenthash_df.contenthash.map(
        decode_contenthash)
    ens_data_with_contenthash_df.to_csv('data/ens_data_with_content_hash.csv')

    ens_data_with_cid_df = ens_data_with_contenthash_df[
        ens_data_with_contenthash_df.content_cid != ''].copy().sort_index()
    ens_data_with_cid_df.loc[:, 'name_cid'] = ens_data_with_cid_df.name.progress_map(
        lambda x: ipfs_client.add_str(x[:-4]))
    ens_data_with_cid_df.loc[:, 'owner_cid'] = \
        ens_data_with_cid_df.loc[
            ens_data_with_cid_df.owner == ens_data_with_cid_df.owner,
            'owner'].progress_map(lambda x: ipfs_client.add_str(x))
    ens_data_with_cid_df.loc[:, 'associated_address_cid'] = \
        ens_data_with_cid_df.loc[
            ens_data_with_cid_df.owner == ens_data_with_cid_df.associated_address,
            'owner_cid'].append(
                ens_data_with_cid_df.loc[
                (ens_data_with_cid_df.associated_address == ens_data_with_cid_df.associated_address) & (ens_data_with_cid_df.owner != ens_data_with_cid_df.associated_address),
                    'associated_address'].progress_map(lambda x: ipfs_client.add_str(x)))

    ens_data_with_cid_df.to_csv('data/ens_data_with_content_cid.csv')
else:
    ens_data_with_contenthash_df = pd.read_csv('data/ens_data_with_content_hash.csv', index_col=0)
    ens_data_with_cid_df = pd.read_csv('data/ens_data_with_content_cid.csv', index_col=0)

In [8]:
for column in ens_data_df.columns:
    print(
        f'''{column}: {len(ens_data_df[(~ens_data_df[column].isna()) & (~ens_data_df[column].isin(["b''", '0x0000000000000000000000000000000000000000']))]):>,} ({len(ens_data_df[(~ens_data_df[column].isna()) & (~ens_data_df[column].isin(["b''", '0x0000000000000000000000000000000000000000']))]) / len(ens_data_df) * 100:.2f}%)''')
print(
    f'''content cid: {len(ens_data_with_cid_df):>,} ({len(ens_data_with_cid_df) / len(ens_data_df) * 100:.2f}%)''')
print(
    f'''\nassociated_address!=owner: {len(ens_data_df[(ens_data_df.associated_address != ens_data_df.owner) & (~ens_data_df.associated_address.isna())]):>,}''')

name: 2,517,359 (100.00%)
owner: 2,502,120 (99.39%)
resolver_address: 1,888,448 (75.02%)
associated_address: 1,866,318 (74.14%)
contenthash: 11,398 (0.45%)
content cid: 5,651 (0.22%)

associated_address!=owner: 46,602


### Content Codecs

In [9]:
def get_contenthash_codec(chash: str) -> str:
    try:
        return multicodec.get_codec(
            multihash.from_hex_string(chash.lstrip('0x')))
    except ValueError:
        return chash[:2]

ens_data_with_contenthash_df.loc[:, 'content_codec'] = ens_data_with_contenthash_df.contenthash.map(get_contenthash_codec)
display_df(
    ens_data_with_contenthash_df.groupby('content_codec')['name'].agg(name_cnt=np.count_nonzero).reset_index().sort_values('name_cnt', ascending=False)
)

content_codec,name_cnt
ipfs-ns,5665
ipns-ns,2621
00,2500
swarm-ns,510
90,41
onion3,23
onion,16
f8,2
cb,1
multicodec,1


### Resolver Contracts

In [10]:
display_df(
    ens_data_df.groupby('resolver_address').name.agg(name_cnt=np.count_nonzero).reset_index().sort_values('name_cnt', ascending=False)
)

resolver_address,name_cnt
0x4976fb03C32e5B8cfe2b6cCB31c09Ba78EBaBa41,1881273
0x226159d592E2b063810a10Ebf6dcbADA94Ed68b8,2571
0xD3ddcCDD3b25A8a7423B5bEe360a42146eb4Baf3,1495
0x1da022710dF5002339274AaDEe8D58218e9D6AB5,1345
0x9C4e9CCE4780062942a7fe34FA2Fa7316c872956,489
0x5FfC014343cd971B7eb70732021E26C35B744cc4,483
0xB37671329ABE589109b0bDD1312cc6ACcF106259,317
0xeE2BB8598725445B532BDb14F522A99E04e84B38,58
0x5d68796477640aA3BdDD9f8b9Ad0DE06f5fd2Df9,15
0x57f1887a8BF19b14fC0dF6Fd9B2acc9Af147eA85,12
