## Detecting Cryptocurrency Fraud with Graph Embeddings 

Financial institutions are starting to integrate their digital payment systems with with cryptocurrency markets. A challenge that these institutions face when operating in crypto environments consists in avoiding carrying out transactions with accounts that are involved with fraudulent activities (drug dealing, terrorism, ransomware, etc.).


In light of this challenge, a number of credit rating agencies have started to provide fraudulent scores of Bitcoin addresses to help financial institutions decide whether they should engage in a transaction with a given address or not.


The objective of this project consists in extracting covariates associated with Bitcoin addresses via graph embeddings from the bitcoin blockchain and then use these covariates to predict fraudulent addresses using a sample of labelled addresses.

In [1]:
# Imports
from pathlib import Path
import os,sys,inspect
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
import pandas as pd
import numpy as np
import zipfile
import seaborn as sns 
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import plotly.offline as py 
%matplotlib inline
import random
import string
import re
import math
import json
import pickle
import operator
import glob

# Pre-processing
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict as cvp
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.ensemble import RandomForestClassifier
# import keras
# import tensorflow as tf
# from tensorflow import keras

# Classification model
import tensorflow as tf
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn.svm import SVC
from sklearn.base import BaseEstimator
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

# Accuracy check
from sklearn.metrics import roc_auc_score, roc_curve, auc, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, recall_score, precision_score,f1_score, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from plotly.offline import iplot, plot, init_notebook_mode
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Graph embedding
import networkx as nx
from networkx.algorithms.community.modularity_max import greedy_modularity_communities
from sklearn.metrics.pairwise import pairwise_distances
from gensim.models import Word2Vec
from networkx.algorithms.community.label_propagation import label_propagation_communities
import community # python-louvain
import stellargraph as sg
from stellargraph.data import BiasedRandomWalk, UnsupervisedSampler
from stellargraph.mapper import Node2VecLinkGenerator, Node2VecNodeGenerator
from stellargraph.layer import Node2Vec as n2v_stellar
from stellargraph.layer import link_classification
from node2vec import Node2Vec as n2v

# Visualization 
# import igraph as ig
import plotly.graph_objs as go
import plotly.express as px
from plotly.offline import iplot, plot, init_notebook_mode
from bokeh.io import output_notebook, show, save
from bokeh.models import *
from bokeh.plotting import figure, show, from_networkx
from bokeh.palettes import viridis, Viridis8
from bokeh.transform import linear_cmap
from networkx.algorithms import community
from sklearn.manifold import TSNE
from IPython.display import display, HTML
from community import community_louvain
output_notebook()

# Ignore matplotlib warnings
import warnings
warnings.filterwarnings("ignore")

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

# Shared instances
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)


pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.

2022-05-17 20:46:42.639276: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-05-17 20:46:42.639382: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Metal device set to: Apple M1 Pro


## Step 1. Data pre-processing

In [2]:
# Read dataset
trans_3w = pd.read_csv(r'trans_3w.csv')

In [3]:
trans_3w.head()

Unnamed: 0,txn_hash,input_address,output_address,ammount,fees,block_index,block_time,input_flag,output_flag
0,bd36f2ca16e2a2c73c807b7d1569657b30de8453450cd2...,13Uf71d8y94xEk2LX7GCtaBJmPiahhA7TR,16FPyvvz5Ug3cx97qH67KfgC6PY1S9fskQ,24200000.0,320000.0,453318,2017-02-16 12:05:04,0,1
1,8c852e187a0541cd8ea8c93a6c728843b5f8b9c579b6fc...,166zajP74bcRVo7BmdeDME3mRX3Mi9e3xn,1ASaHGPN8qRuqZkpnR7d2tcndU9uHL6aGj,2503.648,3.314845,453318,2017-02-16 12:05:04,0,1
2,8c852e187a0541cd8ea8c93a6c728843b5f8b9c579b6fc...,1LU3DtRE3XK32WxFqrnaT9k99nRgwHtLHd,1ASaHGPN8qRuqZkpnR7d2tcndU9uHL6aGj,635940.2,841.988605,453318,2017-02-16 12:05:04,0,1
3,8c852e187a0541cd8ea8c93a6c728843b5f8b9c579b6fc...,1HVQNFf7vDpJVZk7tEzbFxnmALSezA2qPD,1ASaHGPN8qRuqZkpnR7d2tcndU9uHL6aGj,590236.9,781.47725,453318,2017-02-16 12:05:04,0,1
4,8c852e187a0541cd8ea8c93a6c728843b5f8b9c579b6fc...,1LU3DtRE3XK32WxFqrnaT9k99nRgwHtLHd,1ASaHGPN8qRuqZkpnR7d2tcndU9uHL6aGj,101303.2,134.126076,453318,2017-02-16 12:05:04,0,1


In [4]:
trans_3w.dtypes

txn_hash           object
input_address      object
output_address     object
ammount           float64
fees              float64
block_index         int64
block_time         object
input_flag          int64
output_flag         int64
dtype: object

In [5]:
trans_3w.shape

(14033260, 9)

## Step 2. Exporatory data analysis

In [85]:
# Load input/output label fules
input_labels = pickle.load(open('data/input_labels.txt','rb'))
print("Number of input labels: ", len(input_labels))

output_labels = pickle.load(open('data/output_labels.txt','rb'))
print("Number of output labels: ", len(output_labels))

# List of fraudulent adresses
fraud_addresses = list(input_labels) + list(output_labels)

Number of input labels:  182
Number of output labels:  4429


In [86]:
trans_3w['input_address'].isin(fraud_addresses).value_counts()

False    13199622
True       833638
Name: input_address, dtype: int64

In [87]:
trans_3w['output_address'].isin(fraud_addresses).value_counts()

False    13470254
True       563006
Name: output_address, dtype: int64

In [90]:
trans_3w['block_time'].unique()

array(['2017-02-16 12:05:04', '2017-02-16 12:07:12',
       '2017-02-16 12:17:46', ..., '2017-03-09 10:30:30',
       '2017-03-09 10:31:28', '2017-03-09 10:34:29'], dtype=object)

In [91]:
# Number of unique transactions - M2,9
trans_3w['txn_hash'].nunique() # shows considerable number of group transactions

2235580

In [92]:
# Number of unique input addresses - M2,9 
trans_3w['input_address'].nunique()

2905358

In [93]:
# Number of transactions per account
trans_3w['input_address'].value_counts() # shows large discrepancy in activity between accounts

3PUuiYu5cFMsagkffArrKZzQFtWdHttU3x          87190
3CD1QW6fjgTwKq3Pj97nty28WZAVkziNom          69304
97333c48ad0f0d6d33efc6cbf153966b2139aec5    58000
19JRz9seZ7DG5LVa9HkXC1bkMzxvnuhsAy          57574
b78f207cb338c16dce1ff785a940d6f878a70f3a    51887
                                            ...  
1569TQzfNeh7ZM6jAHj6fPKAF4opkhPdFG              1
15nrwkdA7zNGpXJzs5QvAKuJ9VKMabNPkq              1
1BahGBGWUFupzoVLWZZCkbGirreMvqqcY5              1
1Q78Q7k8BDK8LBs9hwcnFiPW3DuZgs4YST              1
13Uf71d8y94xEk2LX7GCtaBJmPiahhA7TR              1
Name: input_address, Length: 2905358, dtype: int64

In [94]:
# Number of unique output addresses
trans_3w['output_address'].nunique()

3267728

In [95]:
# Number of transactions per account
trans_3w['output_address'].value_counts() # smaller range than input_address but still shows large discrepancy in activity

97333c48ad0f0d6d33efc6cbf153966b2139aec5    58250
b78f207cb338c16dce1ff785a940d6f878a70f3a    52000
3PUuiYu5cFMsagkffArrKZzQFtWdHttU3x          47728
f3a17c35fd995f7a3aee089703ab81bcb4ef0b28    43700
0499c77cab212f94c796955355557c40b1f472ee    35350
                                            ...  
1DFwhbqotMc3SADPQBq2GbmDsVqgSVm1zo              1
1K7kVkxuXFL1zhETH3oKBk97j6uGT9S9NN              1
14YFLKf6pYyJndKUmCAZFXChv4hGT11Pcv              1
16Lnq73Z5xgFZJsnnguhbo7CRwAnGxpkP6              1
1AGtcEBTPtqWNw6BxEy3h9QBEr3ajEWTSW              1
Name: output_address, Length: 3267728, dtype: int64

## Step 3. Descriptive statistics

In [96]:
# Descriptive stats
trans_3w.describe() # very high levels of standard deviation

Unnamed: 0,ammount,fees,block_index,input_flag,output_flag,month,day
count,14033120.0,14033120.0,14033260.0,14033260.0,14033260.0,14033260.0,14033260.0
mean,36481950.0,10994.49,454663.7,0.001551528,0.03936755,2.334144,15.71614
std,639409700.0,42832.67,988.4161,0.03935888,0.1944678,0.4716904,8.474728
min,-3035458000.0,0.0,453318.0,0.0,0.0,2.0,1.0
25%,7080.253,37.90049,453624.0,0.0,0.0,2.0,7.0
50%,194499.7,798.9566,454676.0,0.0,0.0,2.0,17.0
75%,2986816.0,11993.97,455507.0,0.0,0.0,3.0,23.0
max,275399200000.0,39258640.0,456437.0,1.0,1.0,3.0,28.0


In [97]:
# Number of unique timestamps
trans_3w['block_time'].nunique() # some blocks were minted at the same time

1592

In [98]:
# Features with default 0
add_feat = ['input_flag', 'output_flag']
trans_3w = trans_3w.assign(**dict.fromkeys(add_feat, 0))

# Feature with default 1
trans_3w = trans_3w.assign(**dict.fromkeys(['group_trans'], 1))

trans_3w.head()

Unnamed: 0,txn_hash,input_address,output_address,ammount,fees,block_index,block_time,input_flag,output_flag,month,day,group_trans
0,bd36f2ca16e2a2c73c807b7d1569657b30de8453450cd2...,13Uf71d8y94xEk2LX7GCtaBJmPiahhA7TR,16FPyvvz5Ug3cx97qH67KfgC6PY1S9fskQ,24200000.0,320000.0,453318,2017-02-16 12:05:04,0,0,2,16,1
1,8c852e187a0541cd8ea8c93a6c728843b5f8b9c579b6fc...,166zajP74bcRVo7BmdeDME3mRX3Mi9e3xn,1ASaHGPN8qRuqZkpnR7d2tcndU9uHL6aGj,2503.648,3.314845,453318,2017-02-16 12:05:04,0,0,2,16,1
2,8c852e187a0541cd8ea8c93a6c728843b5f8b9c579b6fc...,1LU3DtRE3XK32WxFqrnaT9k99nRgwHtLHd,1ASaHGPN8qRuqZkpnR7d2tcndU9uHL6aGj,635940.2,841.988605,453318,2017-02-16 12:05:04,0,0,2,16,1
3,8c852e187a0541cd8ea8c93a6c728843b5f8b9c579b6fc...,1HVQNFf7vDpJVZk7tEzbFxnmALSezA2qPD,1ASaHGPN8qRuqZkpnR7d2tcndU9uHL6aGj,590236.9,781.47725,453318,2017-02-16 12:05:04,0,0,2,16,1
4,8c852e187a0541cd8ea8c93a6c728843b5f8b9c579b6fc...,1LU3DtRE3XK32WxFqrnaT9k99nRgwHtLHd,1ASaHGPN8qRuqZkpnR7d2tcndU9uHL6aGj,101303.2,134.126076,453318,2017-02-16 12:05:04,0,0,2,16,1


In [99]:
# Filter through transaction ids to find those with multiple sellers and buyers
txn_counts = trans_3w['txn_hash'].value_counts().loc[lambda x : x>1]

# converting to df and assigning new names to the columns
txn_counts = pd.DataFrame(txn_counts)
txn_counts = txn_counts.reset_index()
txn_counts.columns = ['txn_hash', 'txn_counts']
txn_counts.shape # size is too large to reasonably iterate through

(1947068, 2)

In [100]:
# Transaction id counts inspection
txn_counts.head() # The first is a distinct outlier

Unnamed: 0,txn_hash,txn_counts
0,4a7b8c0b2eb30207c48a857d79ed40bb532d93a3e62e17...,80427
1,5e549c35fc9e48b6e8f543e7682d34eeae17fc59d97c10...,38064
2,8f4e7cb0ad80daf4d6694cf22a89cd882dab8e1cb3f831...,31545
3,01370e462f9eb440cdfdd0cfa092783686ed0320c22038...,29452
4,18f19b8b8c2156f224e0c5544605deb775728ebc1deb31...,21684


In [101]:
txn_uniq = trans_3w['txn_hash'].value_counts().loc[lambda x : x==1]

# converting to df and assigning new names to the columns
txn_uniq = pd.DataFrame(txn_uniq)
txn_uniq = txn_uniq.reset_index()
txn_uniq.columns = ['txn_hash', 'txn_counts']

# Get list of transactions that appear once
one_txn = txn_uniq['txn_hash'].tolist()
len(one_txn)

288512

In [102]:
# Create flag for transactions with multiple buyers and sellers

# for i in progressbar(range(288512)):
#    for i in one_txn:
#        trans_3w.loc[trans_3w.txn_hash == i, 'group_trans'] = 0
#    trans_3w['group_trans'].value_counts() # Estimated 3/4 days

In [103]:
# Create flag for fraudulent/high-risk input address
for i in input_fraud:
    trans_3w.loc[trans_3w.input_address == i, 'input_flag'] = 1

trans_3w['input_flag'].value_counts()

0    14033260
Name: input_flag, dtype: int64

## Step 4. Visualize network properties

### Create Graph

In [8]:
# Create directed and weighted graph most suitable for our dataset
Direct_Weight_G = nx.from_pandas_edgelist(trans_3w, 'output_address', 'input_address', edge_attr=['txn_hash', 'ammount', 'fees', 
                                'block_index', 'block_time'], create_using=nx.DiGraph())

In [9]:
print(f"The full network has {Direct_Weight_G.number_of_nodes()} nodes.")
print(f"The full network has {Direct_Weight_G.number_of_edges()} edges.")

The full network has 4667173 nodes.
The full network has 10545282 edges.


In [106]:
# Save graph 
nx.write_gml(Direct_Weight_G, "Direct_Weight_G.gml")

In [11]:
# Create directed graph as alternative
Direct_G = nx.from_pandas_edgelist(trans_3w, 'output_address', 'input_address', create_using=nx.DiGraph())

In [12]:
print(f"The full network has {Direct_G.number_of_nodes()} nodes.")
print(f"The full network has {Direct_G.number_of_edges()} edges.")

The full network has 4667173 nodes.
The full network has 10545282 edges.


In [107]:
# Save graph 
nx.write_gml(Direct_G, "Direct_G.gml")

In [14]:
# Create undirected graph as many graph properties are suitable only for undirected graph
Undirect_G = nx.from_pandas_edgelist(trans_3w, 'output_address', 'input_address', create_using=nx.Graph())

In [15]:
print(f"The full network has {Undirect_G.number_of_nodes()} nodes.")
print(f"The full network has {Undirect_G.number_of_edges()} edges.") # slightly less than previous graphs

The full network has 4667173 nodes.
The full network has 10533149 edges.


In [108]:
# Save graph 
nx.write_gml(Undirect_G, "Undirect_G.gml")

### Graph Representation

In [17]:
# Obtain full network NetworkX Graph
G_full = nx.from_pandas_edgelist(trans_3w, 'output_address', 'input_address', edge_attr=['txn_hash', 'ammount', 'fees', 
                                'block_index', 'block_time'], create_using=nx.DiGraph())

In [18]:
nx.is_directed(G_full)

True

In [19]:
print(f"The full network has {G_full.number_of_nodes()} nodes.")
print(f"The full network has {G_full.number_of_edges()} edges.")

The full network has 4667173 nodes.
The full network has 10545282 edges.


In [None]:
# Node count check
in_ls = trans_3w['input_address'].tolist()
out_ls = trans_3w['output_address'].tolist()

node_ls = in_ls + out_ls

len(set(node_ls)) # match to number of nodes confirmed

In [20]:
# Create a mask with only the first records
mask = ~trans_3w[['input_address', 'output_address']].duplicated()
# Get a list of nodes with attributes
nodes = trans_3w[mask][['input_address', 'output_address', 'input_flag', 'output_flag']]

# Add the attributes one at a time.
in_dict = nodes.set_index('input_address')['input_flag'].to_dict()
nx.set_node_attributes(G_full, in_dict, 'input_flag')

out_dict = nodes.set_index('output_address')['output_flag'].to_dict()
nx.set_node_attributes(G_full, out_dict, 'output_flag')

In [21]:
# input_address fraud subset
input_fraud = trans_3w[trans_3w['input_flag'] == 1]

In [22]:
# Obtain input fraud nodes NetworkX Graph
G_infraud = nx.from_pandas_edgelist(input_fraud, 'output_address', 'input_address', create_using=nx.DiGraph())

nx.is_directed(G_infraud)

True

In [23]:
print(f"The fraudulent input address network has {G_infraud.number_of_nodes()} nodes.")
print(f"The fraudulent input address network has {G_infraud.number_of_edges()} edges.")

The fraudulent input address network has 10813 nodes.
The fraudulent input address network has 15677 edges.


In [24]:
# output_address fraud subset
output_fraud = trans_3w[trans_3w['output_flag'] == 1]

In [25]:
# Obtain output fraud nodes NetworkX Graph
G_outfraud = nx.from_pandas_edgelist(output_fraud, 'output_address', 'input_address', create_using=nx.DiGraph())

nx.is_directed(G_outfraud)

True

In [26]:
print(f"The fraudulent input address network has {G_outfraud.number_of_nodes()} nodes.")
print(f"The fraudulent input address network has {G_outfraud.number_of_edges()} edges.")

The fraudulent input address network has 156599 nodes.
The fraudulent input address network has 198957 edges.


### Node Attributes

In [28]:
node_properties = pd.DataFrame()

#### Degree

In [29]:
degree = [val for (node, val) in G_full.degree()]
in_degree = [val for (node, val) in G_full.in_degree()]
out_degree = [val for (node, val) in G_full.out_degree()]
address = [node for (node, val) in G_full.degree()]

In [30]:
node_properties['address'] = address
node_properties['degree'] = degree
node_properties['in_degree'] = in_degree
node_properties['out_degree'] = out_degree

#### Centrality

In [31]:
# Eigenvector Centrality
eigen_centrality = nx.eigenvector_centrality(G_full)
eigen_centrality = [eigen_centrality[node] for node in eigen_centrality]

In [84]:
eigen_centrality

[2.1780990491207018e-12,
 5.112702680407487e-14,
 3.428049297650965e-09,
 7.480086258751488e-07,
 7.480086258751488e-07,
 7.480086258751488e-07,
 4.547871186308181e-05,
 1.957655840833139e-10,
 1.1750992670058282e-08,
 7.306169334991564e-07,
 4.6838484065480656e-09,
 6.426167775876551e-11,
 6.031034442183628e-16,
 5.232807096766566e-20,
 6.268332969805998e-12,
 1.827940147746923e-10,
 2.353032062602172e-10,
 5.289597053760543e-08,
 8.793273072585153e-10,
 8.793273072585153e-10,
 8.793273072585153e-10,
 9.718839053810907e-10,
 8.793273072585153e-10,
 3.652833088417494e-07,
 8.793273072585153e-10,
 2.6949844443573745e-13,
 4.606388517791875e-10,
 4.6461808094630276e-10,
 2.6313710313363227e-08,
 1.5698421290299702e-18,
 4.6451876427569516e-10,
 3.366340449883801e-10,
 1.5972528649190153e-12,
 3.798030259912684e-14,
 1.0278728225791473e-20,
 3.452008087349531e-16,
 1.236792645917551e-16,
 1.3449454243092674e-15,
 8.288878912701738e-10,
 1.1837235510631029e-11,
 7.739873009722384e-16,
 5.3

PageRank’s main difference from EigenCentrality is that it accounts for link direction. Each node in a network is assigned a score based on its number of incoming links (its ‘indegree’). These links are also weighted depending on the relative score of its originating node.

In [83]:
# PageRank
# standard level of alpha is between 0.8/0.9

#pagerank = nx.pagerank(G_full, alpha = 0.85) #AttributeError: module 'scipy.sparse' has no attribute 'coo_array'
#pagerank = nx.pagerank_scipy(G_full, alpha = 0.85) #AttributeError: module 'scipy.sparse' has no attribute 'coo_array'
#pagerank = nx.pagerank_numpy(G_full, alpha = 0.85) #Created Numpy Array 
#pagerank = [pagerank[node] for node in pagerank]

In [33]:
# Add results to node_properties DataFrame
node_properties['eigen_centrality'] = eigen_centrality
#node_properties['pagerank'] = pagerank

It would be insightful to also add Closeness and Betweenness centrality measures. However, these calculations are based on the calculation of the number of shortest paths that pass through each nodes. This is too computationally intensive with the given size of the network (created computer crashes).

#### Summary of Node Properties

In [34]:
# Add flag
to_flag = set(input_fraud['input_address'].tolist() + output_fraud['output_address'].tolist())
node_properties = node_properties.assign(**dict.fromkeys(['fraud_flag'], 0))

for i in to_flag:
    node_properties.loc[node_properties.address == i, 'fraud_flag'] = 1
    
node_properties.head()

Unnamed: 0,address,degree,in_degree,out_degree,eigen_centrality,fraud_flag
0,16FPyvvz5Ug3cx97qH67KfgC6PY1S9fskQ,3,2,1,2.178099e-12,1
1,13Uf71d8y94xEk2LX7GCtaBJmPiahhA7TR,1,1,0,5.112703e-14,0
2,1ASaHGPN8qRuqZkpnR7d2tcndU9uHL6aGj,5,2,3,3.428049e-09,1
3,166zajP74bcRVo7BmdeDME3mRX3Mi9e3xn,2,2,0,7.480086e-07,0
4,1LU3DtRE3XK32WxFqrnaT9k99nRgwHtLHd,2,2,0,7.480086e-07,0


In [35]:
# Describe properties overall

node_properties[['degree', 'in_degree', 'out_degree', 'eigen_centrality']].describe()

Unnamed: 0,degree,in_degree,out_degree,eigen_centrality
count,4667173.0,4667173.0,4667173.0,4667173.0
mean,4.518916,2.259458,2.259458,3.304724e-05
std,39.36702,28.68514,19.98728,0.0004617038
min,1.0,0.0,0.0,9.344298e-22
25%,1.0,0.0,0.0,9.344298e-22
50%,2.0,1.0,1.0,1.962303e-20
75%,3.0,2.0,1.0,1.818528e-11
max,27919.0,25336.0,12116.0,0.1928926


In [36]:
# Describe Properties of Fraudulent Nodes

node_properties_fraud = node_properties[node_properties['fraud_flag'] == 1]

node_properties_fraud[['degree', 'in_degree', 'out_degree', 'eigen_centrality']].describe()

Unnamed: 0,degree,in_degree,out_degree,eigen_centrality
count,4604.0,4604.0,4604.0,4604.0
mean,103.652476,58.354909,45.297567,0.0007508394
std,965.689619,709.624568,385.82155,0.005882296
min,1.0,0.0,0.0,9.344298e-22
25%,3.0,1.0,1.0,2.1585329999999998e-19
50%,4.0,2.0,1.0,5.809492e-10
75%,13.0,5.0,6.0,1.423723e-06
max,27919.0,25336.0,11178.0,0.1928926


In [37]:
# Describe Properties of Non-Fraudulent Nodes

node_properties_licit = node_properties[node_properties['fraud_flag'] == 0]

node_properties_licit[['degree', 'in_degree', 'out_degree', 'eigen_centrality']].describe()

Unnamed: 0,degree,in_degree,out_degree,eigen_centrality
count,4662569.0,4662569.0,4662569.0,4662569.0
mean,4.421028,2.204067,2.216961,3.233847e-05
std,24.9186,17.98349,15.84609,0.0004227438
min,1.0,0.0,0.0,9.344298e-22
25%,1.0,0.0,0.0,9.344298e-22
50%,2.0,1.0,1.0,1.962303e-20
75%,3.0,2.0,1.0,1.779162e-11
max,15645.0,15642.0,12116.0,0.06334031


In [38]:
node_properties.to_csv("node_properties.csv", index = False)

DataFrame uploaded to Google Drive Folder

### Graph Attributes

#### Density

In [39]:
# Full Network
nx.density(G_full)

4.841171772770945e-07

In [40]:
# Fraudulent input addresses
nx.density(G_infraud)

0.0001340944237556231

In [41]:
# Fraudulent output addresses
nx.density(G_outfraud)

8.113047677155055e-06

All density figures are close to zero, indicating that a very low portion of potential connections/transactions occur.

####  Network Diameter

In [42]:
# Obtain Longest Shortest Path - Network Diameter

#for i in progressbar(range(100)):
#    nx.diameter(G_full.to_undirected()) -> Too Computationally intensive

## Step 5. Other graph properties

### NetworkX library

In [57]:
# Build the graphs
graph_nx = nx.from_pandas_edgelist(trans_3w, source = 'input_address', target = 'output_address', 
                                   create_using = nx.Graph())

In [58]:
# Create a mask with only the first records
mask = ~trans_3w[['input_address', 'output_address']].duplicated()

# Get a list of nodes with attributed
nodes = trans_3w[mask][['input_address', 'output_address', 'input_flag', 'output_flag']]

In [59]:
# Add the attributes one at a time
in_dict = nodes.set_index('input_address')['input_flag'].to_dict()
nx.set_node_attributes(graph_nx, in_dict, 'input_flag')

out_dict = nodes.set_index('output_address')['output_flag'].to_dict()
nx.set_node_attributes(graph_nx, out_dict, 'output_flag')

In [60]:
# Subset of input address fraud 
input_fraud = trans_3w[trans_3w['input_flag'] == 1]
input_fraud.head()

Unnamed: 0,txn_hash,input_address,output_address,ammount,fees,block_index,block_time,input_flag,output_flag,month,day
74,4501f4acfe0e57de30aa0fb05310095dbac71aab009129...,1PhmMsdwamJA6soKw5mNMXxzGomHEHWY5P,1BjmDut3JhzbQ1ik9gzm9uzTQNZ9veNFUD,150000.0,23.226948,453318,2017-02-16 12:05:04,1,1,2,16
75,4501f4acfe0e57de30aa0fb05310095dbac71aab009129...,1PhmMsdwamJA6soKw5mNMXxzGomHEHWY5P,1PhmMsdwamJA6soKw5mNMXxzGomHEHWY5P,710890500.0,110078.773052,453318,2017-02-16 12:05:04,1,1,2,16
130,470b620ace0d8aaac071c152ba1e7f6f8604dac2857800...,17eK8iMpNwMCmUm7DUE2GqWtm1VupPGTCW,1HxtzEb65kyyRZpt4WBMu4nR5wNMNjz8e5,6259.293,1.670261,453318,2017-02-16 12:05:04,1,1,2,16
131,470b620ace0d8aaac071c152ba1e7f6f8604dac2857800...,1GKmCumeG8QxvneT7A2Pe4jKKEdyCQTo6w,1HxtzEb65kyyRZpt4WBMu4nR5wNMNjz8e5,845004.6,225.485231,453318,2017-02-16 12:05:04,1,1,2,16
132,470b620ace0d8aaac071c152ba1e7f6f8604dac2857800...,12VpNLvUEBZvHL8WzjRcvoZKJ4KPdbJHko,1HxtzEb65kyyRZpt4WBMu4nR5wNMNjz8e5,13329.63,3.556944,453318,2017-02-16 12:05:04,1,1,2,16


In [61]:
input_fraud.shape

(21773, 11)

In [62]:
trans_3w.shape

(14033260, 11)

In [63]:
# Subset of output address fraud 
output_fraud = trans_3w[trans_3w['output_flag'] == 1]
output_fraud.head()

Unnamed: 0,txn_hash,input_address,output_address,ammount,fees,block_index,block_time,input_flag,output_flag,month,day
0,bd36f2ca16e2a2c73c807b7d1569657b30de8453450cd2...,13Uf71d8y94xEk2LX7GCtaBJmPiahhA7TR,16FPyvvz5Ug3cx97qH67KfgC6PY1S9fskQ,24200000.0,320000.0,453318,2017-02-16 12:05:04,0,1,2,16
1,8c852e187a0541cd8ea8c93a6c728843b5f8b9c579b6fc...,166zajP74bcRVo7BmdeDME3mRX3Mi9e3xn,1ASaHGPN8qRuqZkpnR7d2tcndU9uHL6aGj,2503.648,3.314845,453318,2017-02-16 12:05:04,0,1,2,16
2,8c852e187a0541cd8ea8c93a6c728843b5f8b9c579b6fc...,1LU3DtRE3XK32WxFqrnaT9k99nRgwHtLHd,1ASaHGPN8qRuqZkpnR7d2tcndU9uHL6aGj,635940.2,841.988605,453318,2017-02-16 12:05:04,0,1,2,16
3,8c852e187a0541cd8ea8c93a6c728843b5f8b9c579b6fc...,1HVQNFf7vDpJVZk7tEzbFxnmALSezA2qPD,1ASaHGPN8qRuqZkpnR7d2tcndU9uHL6aGj,590236.9,781.47725,453318,2017-02-16 12:05:04,0,1,2,16
4,8c852e187a0541cd8ea8c93a6c728843b5f8b9c579b6fc...,1LU3DtRE3XK32WxFqrnaT9k99nRgwHtLHd,1ASaHGPN8qRuqZkpnR7d2tcndU9uHL6aGj,101303.2,134.126076,453318,2017-02-16 12:05:04,0,1,2,16


In [64]:
output_fraud.shape

(552455, 11)

In [65]:
# Graph density
number = nx.density(graph_nx)
print(f"Overall graph density throughout all time stamp is {number:.8f}")

Overall graph density throughout all time stamp is 0.00000097


In [66]:
# Average clustering
number = nx.average_clustering(graph_nx)
print(f"Overall average clustering throughout all time stamp is {number:.8f}")

Overall average clustering throughout all time stamp is 0.02611629


In [67]:
# Identify cliques
len(list(nx.find_cliques(graph_nx)))

10152418

A clique is in some sense a stronger version of a community. A set of nodes forms a clique (equivalently, a complete subgraph) if all possible connections between nodes exist. A two-node clique is simply two connected nodes. A three node clique is also known as a triangle.

In [68]:
# PageRank
page_rank = nx.pagerank(graph_nx, alpha=0.9)

for txn_hash in sorted(page_rank, key=page_rank.get, reverse=True)[:500]:
    print(txn_hash, page_rank[txn_hash])
    
# The PageRank algorithm was designed for directed graphs but this algorithm does not check if the input graph is directed and will execute on undirected graphs by converting 
# each edge in the directed graph to two edges.

17A16QmavnUfCW11DAApiJxp7ARnxN5pGX 0.002991614410648985
38DPYmsa7RXwLrhpTP6udrfSiGn5dzdGAC 0.0020355631317577577
1FvtWwP3ePJDeR6K7ZePGujUuusUTds8Vy 0.0018311103758796394
3CD1QW6fjgTwKq3Pj97nty28WZAVkziNom 0.0013882569182835074
1P9RQEr2XeE3PEb44ZE35sfZRRW1JHU8qx 0.001364126957831029
3PUuiYu5cFMsagkffArrKZzQFtWdHttU3x 0.001295741062428294
1MoYyajsjhVJE5BfDviFF8UbuefziGeDJy 0.0012183303815281488
13vHWR3iLsHeYwT42RnuKYNBoVPrKKZgRv 0.0011246087134146703
1AScRhqdXMrJyxNmjEapMZi1PLFsqmLquG 0.0010491147424790805
1A9J4XT7cbdv2p2YMAvmm36qauVAs1iPga 0.0010110300356545316
1Btc2UrZTqcbjLrr8trx66K9TozQmKeDGN 0.0009284219082938395
1Jyk78QLGTi4e1DbYqgBizQujAPskxtsU3 0.0007619522326790111
1GCMZuCzCYVL9bh65qxQmwjAiZagyYEEQf 0.0007372291701553611
12cgpFdJViXbwHbhrA3TuW1EGnL25Zqc3P 0.0007288923086083982
1NmL8SJHxeBvmA9Gc7dDgTmPNda5bXz5yH 0.0006851650045656431
1FAv42GaDuQixSzEzSbx6aP1Kf4WVWpQUY 0.0006519046467465644
3A4U175prUGEn3B1gUDkz32u8fnF9Nx3Ly 0.000617598291738843
1GX28yLjVWux7ws4UQ9FB4MnLH4UKTPK2z 

Definition: PageRank is a variant of EigenCentrality, also assigning nodes a score based on their connections, and their connections’ connections. The difference is that PageRank also takes link direction and weight into account – so links can only pass influence in one direction, and pass different amounts of influence.

What it tells us: This measure uncovers nodes whose influence extends beyond their direct connections into the wider network.

When to use it: Because it takes into account direction and connection weight, PageRank can be helpful for understanding citations and authority.

A bit more detail: PageRank is famously one of the ranking algorithms behind the original Google search engine (the ‘Page’ part of its name comes from creator and Google founder, Larry Page).

In [69]:
# Community detection label propagation
communities = label_propagation_communities(graph_nx) # only for unidrected graphs
len([community for community in communities])

961609

In [70]:
# Community detection with python-louvain
partition = community_louvain.best_partition(graph_nx)
len(set(partition.values()))

209066

In [111]:
# Compute the giant component of the overall graph
Gcc = sorted(nx.connected_components(graph_nx), key=len, reverse=True)
G0 = graph_nx.subgraph(Gcc[0])

print(nx.info(G0))

Graph with 3852506 nodes and 9842639 edges


A giant component is a connected component of a network that contains a significant proportion of the entire nodes in the network.

In [112]:
# Save the giant component 
nx.write_gml(G0, "Giant Component.gml")

In [113]:
# The second strongest component
Gcc = sorted(nx.connected_components(graph_nx), key=len, reverse=True)
G1 = graph_nx.subgraph(Gcc[1])

print(nx.info(G1))

Graph with 5526 nodes and 5561 edges


In [72]:
# # Compute the average shortest path of the giant component
# sp = nx.average_shortest_path_length(G0)
# sp

In [73]:
G0_nodes = list(G0.nodes)

In [74]:
# Count number of fraudulent input addresses from giant component
count = sum(f in G0_nodes for f in input_fraud['input_address'])
print(count)

21773


In [75]:
# Unique fraudulent input addresses from giant component
G0_nodes_set = set(G0_nodes)
input_fraud_set = set(input_fraud['input_address'])

count = len(G0_nodes_set.intersection(input_fraud_set))
print(count) # all fraudulent input addresses 

182


In [105]:
# Number of illicit input nodes within the giant component - same as the overall graph
print(type(input_fraud_set))
print("\nLength of the set: ", len(input_fraud_set))

<class 'set'>

Length of the set:  182


In [77]:
# Count number of fraudulent output addresses from giant component
count = sum(f in G0_nodes for f in output_fraud['output_address'])
print(count)

552170


In [78]:
G0_nodes_set = set(G0_nodes)
output_fraud_set = set(output_fraud['output_address'])

count = len(G0_nodes_set.intersection(output_fraud_set))
print(count)  # all fraudulent output addresses 

4263


In [104]:
# Number of illicit output nodes within the giant component - 34 nodes less compared to the overall graph
print(type(output_fraud_set))
print("\nLength of the set: ", len(output_fraud_set))

<class 'set'>

Length of the set:  4429


In [80]:
# Compute weakly connected component of the overall graph
Gcc = sorted(nx.connected_components(graph_nx), key=len, reverse=False)
G2 = graph_nx.subgraph(Gcc[0])

print(nx.info(G2))

Graph with 1 nodes and 1 edges
