In [1]:
import base64
import os
import shutil
import sys

import pandas as pd
from pyvis.network import Network

In [2]:
def load_df(start_block, end_block, d_path):
    """Load the edge, vertex and address tables for a block range.

    The tables are read from three sub-directories of ``d_path`` named
    ``edges-<start>-<end>``, ``vertices-<start>-<end>`` and
    ``addresses-<start>-<end>``. Every ``*.csv`` file found in a
    sub-directory is read and the pieces are concatenated.

    Vertex ids that are referenced by an edge (``src_id`` / ``dst_id``) but
    are absent from the vertex table are appended to it as placeholder rows
    with ``note == 'unknown_tx'``.

    Args:
        start_block: First block of the range (used only to build dir names).
        end_block: Last block of the range (used only to build dir names).
        d_path: Directory containing the three sub-directories.

    Returns:
        Tuple ``(e_df, v_df, a_df)`` of pandas DataFrames.

    Raises:
        FileNotFoundError: If a sub-directory is missing or holds no CSV file.
    """

    def _read_csv_dir(csv_dir):
        """Read every ``*.csv`` file in ``csv_dir`` into a single DataFrame."""
        # Sorted so the row order does not depend on the file system's
        # directory listing order.
        csv_names = sorted(
            name for name in os.listdir(csv_dir) if name.endswith('.csv')
        )
        if not csv_names:
            raise FileNotFoundError('no .csv files found in {}'.format(csv_dir))
        frames = [pd.read_csv(os.path.join(csv_dir, name)) for name in csv_names]
        # Concatenate once; the previous loop threw away the concat result,
        # so only the first file was ever kept.
        return pd.concat(frames, ignore_index=True)

    block_range = '{}-{}'.format(start_block, end_block)
    e_df = _read_csv_dir(os.path.join(d_path, 'edges-' + block_range))
    v_df = _read_csv_dir(os.path.join(d_path, 'vertices-' + block_range))
    a_df = _read_csv_dir(os.path.join(d_path, 'addresses-' + block_range))

    # Every vertex id mentioned by an edge, as a single-column frame.
    unknown_v_df = pd.concat([
        e_df['src_id'].to_frame().rename(columns={'src_id': 'id'}),
        e_df['dst_id'].to_frame().rename(columns={'dst_id': 'id'})
    ])
    unknown_v_df = unknown_v_df.drop_duplicates().reset_index(drop=True)
    # .copy() so the column assignments below write to an independent frame
    # rather than to a slice of the filtered one.
    unknown_v_df = unknown_v_df[~unknown_v_df.id.isin(v_df.id)].copy()

    # Placeholder values for vertices we only know by id.
    placeholder_values = {
        'note': 'unknown_tx',
        'tx_hash': None,
        'block_height': -1,
        'block_hash': None,
        'fee': 0,
        'n_input': 0,
        'amount_input': 0,
        'n_output': 0,
        'amount_output': 0,
        'temporal_index': -1,
    }
    for column, value in placeholder_values.items():
        unknown_v_df[column] = value

    v_df = pd.concat([v_df, unknown_v_df]).drop_duplicates().reset_index(drop=True)

    return e_df, v_df, a_df

In [3]:
def generate_pyvis_graph(v_df, e_df, a_df, cluster_id_selected):
    """Build a directed pyvis ``Network`` from vertex, edge and address tables.

    Vertices are added according to their ``note`` column: ``'tx'`` rows
    become dots, ``'UTXO'``, ``'coinbase'`` and ``'unknown_tx'`` rows become
    squares; rows with any other ``note`` are not added. Amounts are divided
    by 100000000 for display in BTC.

    Edges whose ``address`` does not start with ``'UTXO'`` or ``'coin'`` are
    annotated with the cluster id found for that address in ``a_df`` (first
    matching row) and coloured red when it equals ``cluster_id_selected``.
    Adding edges is best-effort: an edge that cannot be added (for example
    its address is not in ``a_df``) is skipped.

    Args:
        v_df: Vertex table; reads ``id``, ``note`` and the per-note columns
            used in the tooltips.
        e_df: Edge table; reads ``src_id``, ``dst_id``, ``address``, ``value``.
        a_df: Address table; reads ``address`` and ``cluster_id``.
        cluster_id_selected: Cluster id whose edges are highlighted.

    Returns:
        The populated ``Network`` instance.
    """
    satoshi_per_btc = 100000000

    net = Network(
                height='100%',
                width='80%',
                directed=True
                )
    net.show_buttons(filter_=["physics"])

    for _, node in v_df.iterrows():
        if node.note == 'tx':
            title = 'tx_hash: {}\n'.format(node.tx_hash) + \
                'block_height: {}\n'.format(node.block_height) + \
                'n_input: {}\n'.format(node.n_input) + \
                'n_output: {}\n'.format(node.n_output) + \
                'amount_input: {} BTC\n'.format(node.amount_input/satoshi_per_btc) + \
                'amount_output: {} BTC\n'.format(node.amount_output/satoshi_per_btc) + \
                'fee: {} BTC'.format(node.fee/satoshi_per_btc)
            net.add_node(n_id=str(node.id), shape='dot', title=title)
        elif node.note == 'UTXO':
            title = '{}\n'.format(node.note) + \
                'amount: {} BTC'.format(node.amount_input/satoshi_per_btc)
            net.add_node(n_id=str(node.id), shape='square', title=title)
        elif node.note == 'coinbase':
            title = '{}\n'.format(node.note) + \
                'amount: {} BTC'.format(node.amount_output/satoshi_per_btc)
            net.add_node(n_id=str(node.id), shape='square', title=title)
        elif node.note == 'unknown_tx':
            title = 'unknown_tx\n' + 'tx_index: {}'.format(node.id)
            net.add_node(n_id=str(node.id), shape='square', title=title)

    # Address -> cluster id lookup built once, instead of scanning a_df for
    # every edge. keep='first' mirrors the previous ``.iloc[0]`` choice when
    # an address appears more than once.
    first_rows = a_df.drop_duplicates(subset='address', keep='first')
    cluster_by_address = dict(zip(first_rows['address'], first_rows['cluster_id']))

    for _, edge in e_df.iterrows():
        try:
            value_btc = edge.value/satoshi_per_btc
            title = 'address: {}\n'.format(edge.address) + \
                    'value: {} BTC\n'.format(value_btc)
            extra_options = {}
            if edge.address[:4] != 'UTXO' and edge.address[:4] != 'coin':
                # KeyError for an address without a cluster -> edge skipped.
                cluster_id = cluster_by_address[edge.address]
                title += 'cluster_id : {}'.format(cluster_id)
                if cluster_id == cluster_id_selected:
                    extra_options['color'] = 'red'
            net.add_edge(source=str(edge.src_id), to=str(edge.dst_id),
                         title=title, value=value_btc, **extra_options)
        except Exception:
            # Deliberate best-effort: skip edges that cannot be added, but let
            # KeyboardInterrupt / SystemExit through (a bare ``except`` would
            # have swallowed those too).
            continue
    return net

In [4]:
def render_svg(svg_file):
    """Return an HTML snippet that embeds an SVG file as a centred image.

    The file at ``svg_file`` is read as text, UTF-8 encoded, base64 encoded
    and placed in a ``data:image/svg+xml`` URI inside an ``<img>`` tag, which
    is wrapped in a centre-aligned ``<p>`` element.
    """
    with open(svg_file, "r") as handle:
        svg_text = handle.read()

    encoded = base64.b64encode(svg_text.encode("utf-8")).decode("utf-8")
    img_tag = r'<img src="data:image/svg+xml;base64,%s"/>' % encoded
    return '<p style="text-align:center;">' + img_tag + '</p>'

In [5]:
def main(start_block, end_block, address):
    """Export the transaction sub-graph around the cluster of ``address``.

    Loads the tables for ``blocks-<start>-<end>`` under the current
    directory, looks up the cluster id of ``address``, selects the edges that
    use any address of that cluster plus the edges touching the same
    vertices, and writes ``graph.html``, ``edges.csv``, ``vertices.csv`` and
    ``addresses.csv`` into ``<cluster_id>-<start>-<end>`` inside the blocks
    directory. An existing output directory is replaced.

    Args:
        start_block: First block of the range.
        end_block: Last block of the range.
        address: Address whose cluster is exported.

    Returns:
        None. Prints a message and returns early when ``address`` is not in
        the address table.
    """
    DIR = './'
    # Above this many nodes + edges a warning is printed (the graph is still
    # generated).
    large_graph_threshold = 1400

    d_path = os.path.join(DIR, 'blocks-{}-{}'.format(start_block, end_block))

    e_df, v_df, a_df = load_df(start_block, end_block, d_path)

    tmp = a_df[a_df.address == address]
    if tmp.shape[0] == 0:
        print('address {} not found'.format(address))
        return
    cluster_id = tmp.cluster_id.iloc[0]
    # select all address in cluster id
    addrs_in_cluster_df = pd.DataFrame({'address' : a_df[a_df.cluster_id == cluster_id].address})

    print('address {} is in cluster {}'.format(address, cluster_id))
    print('generating cluster graphs')

    # Edges paid through a cluster address -> their vertices -> every edge
    # touching those vertices -> the vertices of that widened edge set.
    new_e_df = e_df[e_df.address.isin(addrs_in_cluster_df.address)]
    new_v_df = v_df[v_df.id.isin(new_e_df.src_id) | v_df.id.isin(new_e_df.dst_id)]
    new_e_df = e_df[e_df.src_id.isin(new_v_df.id) | e_df.dst_id.isin(new_v_df.id)]
    new_v_df = v_df[v_df.id.isin(new_e_df.src_id) | v_df.id.isin(new_e_df.dst_id)]

    n_v = new_v_df.shape[0]
    n_e = new_e_df.shape[0]
    print(n_v, n_e)

    if n_v + n_e > large_graph_threshold:
        print('WARNING: Too large graph to plot: {} nodes, {} edges'.format(n_v, n_e))
    new_pyvis_graph = generate_pyvis_graph(new_v_df, new_e_df, a_df, cluster_id)

    new_pyvis_graph.height = '800px'
    new_pyvis_graph.width = '2400px'
    cluster_dir = os.path.join(d_path, '{}-{}-{}'.format(cluster_id, start_block, end_block))
    # Remove any previous export without going through a shell: no quoting
    # problems with unusual paths and no dependency on an ``rm`` binary.
    if os.path.isdir(cluster_dir) and not os.path.islink(cluster_dir):
        shutil.rmtree(cluster_dir)
    elif os.path.lexists(cluster_dir):
        os.remove(cluster_dir)
    os.mkdir(cluster_dir)
    new_pyvis_graph.write_html(os.path.join(cluster_dir, 'graph.html'))
    new_e_df.to_csv(os.path.join(cluster_dir, 'edges.csv'), index=False)
    new_v_df.to_csv(os.path.join(cluster_dir, 'vertices.csv'), index=False)
    addrs_in_cluster_df.to_csv(os.path.join(cluster_dir, 'addresses.csv'), index=False)
    return

https://www.blockchain.com/explorer/transactions/btc/a1075db55d416d3ca199f55b6084e2115b9345e16c5cf302fc80e9d5fbf5d48d

In [6]:
# Block range to load and the address whose cluster should be exported.
start_block, end_block = 0, 115000
address = '1XPTgDRhN8RFnzniWCddobD9iKZatrvH4'
main(start_block=start_block, end_block=end_block, address=address)


        Enter a bitcoin address to be clustered using the following heuristics:

        - Satoshi heuristic

        - Coinbase transaction mining address clustering heuristic

        - Common-input-ownership heuristic

        - Single input and single output heuristic

        - Consolidation transaction heuristic

        - Payment transaction with amount payed and change address heuristic

        - Change address detection heuristic

        	- same address in input and output heuristic

        	- address reuse heuristic

        	- Unnecessary input heuristic

        	- new address in output heuristic

        	- round number heuristic

        - Mixed transaction recognition heuristic

        	- taint analysis and coinjoin sudoku
        


    
address 1XPTgDRhN8RFnzniWCddobD9iKZatrvH4 is in cluster 8589974080
generating cluster graphs
31526 21509
