In [1]:
import pandas as pd
import numpy as np
import igraph as ig
from tqdm import tqdm
import networkx as nx
import matplotlib.pyplot as plt

In [2]:
#The graph is made of 203,769 nodes and 234,355 edges. Two percent (4,545) of the nodes are 
# labelled class1 (illicit).
#Twenty-one percent (42,019) are labelled class2 (licit). 
#The remaining transactions are not labelled with regard to licit versus illicit.
classes_path = "../elliptic_bitcoin_dataset/elliptic_txs_classes.csv"
edges_path = "../elliptic_bitcoin_dataset/elliptic_txs_edgelist.csv"
features_path = "../elliptic_bitcoin_dataset/elliptic_txs_features.csv"

classes = pd.read_csv(classes_path)
edges = pd.read_csv(edges_path)
feat_cols = ['txId', 'time_step'] + [f'trans_feat_{i}' for i in range(93)] + [f'agg_feat_{i}' for i in range(72)]
feats = pd.read_csv(features_path, header=None, names=feat_cols)
classes.columns = ['txId', 'label']
df = classes.set_index('txId').join(feats.set_index('txId'))

In [3]:
label_colors = {'1':'red', '2':'green', 'unknown':'gray'}

tx_graph = ig.Graph(directed=True)

edges_dict = {tx_id: i for i, tx_id in enumerate(classes['txId'])}

tx_graph.add_vertices(len(classes))
tx_graph.vs['id'] = list(classes['txId'])
tx_graph.vs['type'] = list(classes['label'])
tx_graph.vs['time_step'] = list(feats['time_step'])
tx_graph.vs['color'] = [label_colors[label] for label in classes['label']]

edges_list = [(edges_dict[edges['txId1'][i]], edges_dict[edges['txId2'][i]]) for i in tqdm(range(len(edges)))]
tx_graph.add_edges(edges_list)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 234355/234355 [00:00<00:00, 330285.29it/s]


In [4]:
betweenness = tx_graph.betweenness(directed=True)
df.insert(1, 'betweenness', betweenness, True)

centralities = tx_graph.closeness()
df.insert(1, 'closeness', centralities, True)

in_degree = tx_graph.degree(mode="in")
df.insert(1, 'in-degree', in_degree)

out_degree = tx_graph.degree(mode='out')
df.insert(1, 'out-degree', out_degree)

clustering_coeff = tx_graph.transitivity_local_undirected(mode='zero')
df.insert(1, 'clustering_coeff', clustering_coeff)

pagerank_scores = tx_graph.pagerank(directed=True)
df.insert(1, 'pagerank', pagerank_scores, True)

In [5]:
df.head()

Unnamed: 0_level_0,label,pagerank,clustering_coeff,out-degree,in-degree,closeness,betweenness,time_step,trans_feat_0,trans_feat_1,...,agg_feat_62,agg_feat_63,agg_feat_64,agg_feat_65,agg_feat_66,agg_feat_67,agg_feat_68,agg_feat_69,agg_feat_70,agg_feat_71
txId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
230425980,unknown,4e-06,0.0,1,1,0.092392,14.0,1,-0.171469,-0.184668,...,-0.562153,-0.600999,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
5530458,unknown,5e-06,0.0,1,1,0.08458,8.0,1,-0.171484,-0.184668,...,0.947382,0.673103,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
232022460,unknown,5e-06,0.0,2,1,0.134507,30.0,1,-0.172107,-0.184668,...,0.670883,0.439728,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792
232438397,2,0.000297,0.000621,1,160,0.155365,2355.0,1,0.163054,1.96379,...,-0.577099,-0.613614,0.241128,0.241406,1.072793,0.08553,-0.131155,0.677799,-0.120613,-0.119792
230460314,unknown,2e-06,0.266667,8,2,0.100978,16.752381,1,1.011523,-0.081127,...,-0.511871,-0.400422,0.517257,0.579382,0.018279,0.277775,0.326394,1.29375,0.178136,0.179117


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [13]:
#Transform the dataframe in a suitable input for KMeans. We drop the columns containing
#the correct labels
df.drop(df[df["label"] == "unknown"].index, inplace=True)

y = df['label']
X = df.drop(columns=['label'])
#X = X.loc[:, 'time_step':]
X.head()

Unnamed: 0_level_0,pagerank,clustering_coeff,out-degree,in-degree,closeness,betweenness,time_step,trans_feat_0,trans_feat_1,trans_feat_2,...,agg_feat_62,agg_feat_63,agg_feat_64,agg_feat_65,agg_feat_66,agg_feat_67,agg_feat_68,agg_feat_69,agg_feat_70,agg_feat_71
txId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
232438397,0.000297,0.000621,1,160,0.155365,2355.0,1,0.163054,1.96379,-0.646376,...,-0.577099,-0.613614,0.241128,0.241406,1.072793,0.08553,-0.131155,0.677799,-0.120613,-0.119792
232029206,7.6e-05,0.00113,1,59,0.134193,1223.0,1,-0.005027,0.578941,-0.091383,...,-0.577099,-0.613614,0.241128,0.241406,0.60412,0.008632,-0.131155,0.333211,-0.120613,-0.119792
232344069,2e-06,0.0,2,0,0.099405,0.0,1,-0.147852,-0.184668,-1.201369,...,-0.577099,-0.613614,0.241128,0.241406,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
27553029,2e-06,0.0,1,1,0.110372,7.0,1,-0.151357,-0.184668,-1.201369,...,-0.539735,-0.582077,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
3881097,2e-06,0.0,1,1,0.139805,4.0,1,-0.172306,-0.184668,-1.201369,...,-0.577099,-0.600999,0.241128,0.241406,0.018279,-0.068266,-0.084674,-0.05445,-1.760926,-1.760984


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30)

In [15]:
clf = RandomForestClassifier(n_estimators = 100)  
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [16]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(clf, X, y, cv=5, scoring="accuracy")
print("Cross-validated accuracy:", scores.mean())

Cross-validated accuracy: 0.9819598237397157


In [17]:
from sklearn import metrics  
from sklearn.metrics import confusion_matrix

print("ACCURACY OF THE MODEL:", metrics.accuracy_score(y_test, y_pred))
confusion_matrix(y_test, y_pred)

ACCURACY OF THE MODEL: 0.9875447387258411


array([[ 1188,   168],
       [    6, 12608]])

In [18]:
import pandas as pd

importances = clf.feature_importances_
features = X.columns  

feature_importances = pd.DataFrame({
    'Feature': features,
    'Importance': importances
})

feature_importances = feature_importances.sort_values(by='Importance', ascending=False)
top_features = feature_importances.head(50)

print(top_features)


           Feature  Importance
61   trans_feat_54    0.056066
59   trans_feat_52    0.043451
20   trans_feat_13    0.043441
138    agg_feat_38    0.040983
24   trans_feat_17    0.039676
55   trans_feat_48    0.032941
47   trans_feat_40    0.031784
53   trans_feat_46    0.031022
11    trans_feat_4    0.028858
49   trans_feat_42    0.026015
96   trans_feat_89    0.024891
82   trans_feat_75    0.023776
144    agg_feat_44    0.022739
150    agg_feat_50    0.014965
52   trans_feat_45    0.013877
160    agg_feat_60    0.013272
37   trans_feat_30    0.012388
35   trans_feat_28    0.012028
70   trans_feat_63    0.011311
66   trans_feat_59    0.011260
46   trans_feat_39    0.010858
169    agg_feat_69    0.010835
58   trans_feat_51    0.010689
60   trans_feat_53    0.010656
65   trans_feat_58    0.010457
95   trans_feat_88    0.010209
64   trans_feat_57    0.010166
90   trans_feat_83    0.009178
167    agg_feat_67    0.009129
162    agg_feat_62    0.008854
91   trans_feat_84    0.007914
71   tra

# Graph visualization

In [None]:
#import matplotlib.pyplot as plt
#fig, ax = plt.subplots(figsize=(20, 20)) 
#node_sizes = [c * 50 for c in centralities]
#layout = tx_graph.layout_fruchterman_reingold()
#ig.plot(tx_graph, vertex_size=node_sizes, vertex_frame_width=0.5, edge_width=0.1, bbox=(8000, 8000), margin=50, target=ax)
#plt.savefig("large_graph.png", dpi=300)
#plt.savefig("large_graph.pdf", dpi=600)