In [None]:
# Mount Google Drive at /content/drive; the CSV and embedding files read by
# later cells live under /content/drive/MyDrive. Requires the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
from __future__ import print_function, division
import numpy as np 
import pandas as pd 
import os
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve


In [2]:
# Column names for the header-less features CSV: txId, time_step, then the
# tx_feat_2..tx_feat_94 block followed by the agg_feat_1..agg_feat_72 block.
tx_features = ["tx_feat_%d" % i for i in range(2, 95)]
agg_features = ["agg_feat_%d" % i for i in range(1, 73)]
fea_col = ["txId", "time_step"]
fea_col.extend(tx_features)
fea_col.extend(agg_features)

# The features file has no header row, so the names above are applied on read.
features = pd.read_csv(
    "/content/drive/MyDrive/ORI_EVOLVE/elliptic_bitcoin_dataset/elliptic_txs_features.csv",
    names=fea_col,
    header=None,
)
# The classes file is read with its own header; later cells join it on 'txId'.
classes = pd.read_csv(
    "/content/drive/MyDrive/ORI_EVOLVE/elliptic_bitcoin_dataset/elliptic_txs_classes.csv"
)

In [11]:
def split_data_af(data):
  """Temporal train/test split using the tx_feat_* and agg_feat_* columns.

  Rows with time_step < 35 go to the training set and rows with
  time_step >= 35 go to the test set. The feature columns come from the
  module-level lists `tx_features` and `agg_features`; the target is the
  'class' column.

  Args:
    data: DataFrame containing 'time_step', 'class' and the feature columns.

  Returns:
    Tuple (X_train, X_test, y_train, y_test).
  """
  feature_cols = tx_features + agg_features
  # Filter each partition once and project features / target from it.
  early = data[data['time_step'] < 35]
  late = data[data['time_step'] >= 35]
  return early[feature_cols], late[feature_cols], early['class'], late['class']

In [12]:
def split_data(data):
  """Temporal train/test split using tx, aggregated and embedding columns.

  Rows with time_step < 35 go to the training set and rows with
  time_step >= 35 go to the test set. The feature columns come from the
  module-level lists `tx_features`, `agg_features` and `embed_names`
  (all three must be defined before this is called); the target is the
  'class' column.

  Args:
    data: DataFrame containing 'time_step', 'class' and the feature columns.

  Returns:
    Tuple (X_train, X_test, y_train, y_test).
  """
  feature_cols = tx_features + agg_features + embed_names
  # Filter each partition once and project features / target from it.
  early = data[data['time_step'] < 35]
  late = data[data['time_step'] >= 35]
  return early[feature_cols], late[feature_cols], early['class'], late['class']

In [13]:
def split_data_emb(data):
  """Temporal train/test split using only the embedding columns.

  Rows with time_step < 35 go to the training set and rows with
  time_step >= 35 go to the test set. The feature columns come from the
  module-level list `embed_names` (it must be defined before this is
  called); the target is the 'class' column.

  Args:
    data: DataFrame containing 'time_step', 'class' and the embedding columns.

  Returns:
    Tuple (X_train, X_test, y_train, y_test).
  """
  # Filter each partition once and project features / target from it.
  early = data[data['time_step'] < 35]
  late = data[data['time_step'] >= 35]
  return early[embed_names], late[embed_names], early['class'], late['class']

In [14]:
def ratio_data(data, licit_weight=0.3, illicit_weight=0.7, random_state=0):
  """Down-sample class-0 rows to a fixed ratio against class-1 rows.

  Class-0 rows (`data_lic`) are randomly sampled so that their count is about
  `licit_weight / illicit_weight` times the number of class-1 rows
  (`data_ill`). All class-1 rows are kept. Rows whose 'class' is neither 0
  nor 1 are dropped.

  Args:
    data: DataFrame with a 'class' column.
    licit_weight: Relative weight of class-0 rows in the result (default 0.3).
    illicit_weight: Relative weight of class-1 rows in the result (default 0.7).
    random_state: Seed passed to DataFrame.sample for reproducibility.

  Returns:
    DataFrame with the sampled class-0 rows followed by all class-1 rows.
    If there are no class-0 rows, only the class-1 rows are returned. If
    there are too few class-0 rows to reach the target ratio, all of them
    are kept (in shuffled order) instead of raising.
  """
  data_lic = data[data['class']==0]
  data_ill = data[data['class']==1]
  n_lic = data_lic.shape[0]
  if n_lic == 0:
    # Nothing to down-sample; computing the fraction would divide by zero.
    return data_ill
  ratio = data_ill.shape[0]*(licit_weight/illicit_weight)/n_lic
  # DataFrame.sample rejects frac > 1 unless replace=True; cap at "keep all"
  # rather than up-sampling with duplicates.
  ratio = min(ratio, 1.0)
  data_lic = data_lic.sample(frac=ratio,random_state=random_state)
  return pd.concat([data_lic,data_ill])

## RF(AF)

In [15]:
# RF(AF): random forest on the tx_feat_* + agg_feat_* columns only.
# Join labels onto features, rebalance the classes, then split by time step.
data = ratio_data(pd.merge(features, classes, on='txId'))
X_train, X_test, y_train, y_test = split_data_af(data)

clf = RandomForestClassifier(n_estimators=50, max_features=50, random_state=0, n_jobs=-1)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)

# average=None yields one score per class; index 1 is the second label
# (class 1 when the labels are 0/1).
prec, rec, f1, num = precision_recall_fscore_support(y_test, preds, average=None)
micro_f1 = f1_score(y_test, preds, average='micro')

print("AF RandomForest Results")
print("Precision:%.3f \nRecall:%.3f \nF1 Score:%.3f"%(prec[1],rec[1],f1[1]))
print("Micro-Average F1 Score:",micro_f1)

AF RandomForest Results
Precision:0.906 
Recall:0.688 
F1 Score:0.782
Micro-Average F1 Score: 0.7721032399780342


## RF(AF+SDNE)

In [16]:
# RF(AF+SDNE): random forest on tx_feat_* + agg_feat_* + SDNE embedding columns.
emb = pd.read_csv("/content/drive/MyDrive/SDNE/sdne_feature_e500_b9.txt")
tx_features = ["tx_feat_%d" % i for i in range(2, 95)]
agg_features = ["agg_feat_%d" % i for i in range(1, 73)]
embed_names = ["emb_feat_%d" % i for i in range(1, 65)]

# Join features -> embeddings -> labels on txId, then rename every column
# positionally (this raises if the merged column count does not match).
data = pd.merge(pd.merge(features, emb, on='txId'), classes, on='txId')
data.columns = ["txId", "time_step"] + tx_features + agg_features + embed_names + ['class']
data = ratio_data(data)
X_train, X_test, y_train, y_test = split_data(data)

clf = RandomForestClassifier(n_estimators=50, max_features=50, random_state=0, n_jobs=-1)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)

# average=None yields one score per class; index 1 is the second label
# (class 1 when the labels are 0/1).
prec, rec, f1, num = precision_recall_fscore_support(y_test, preds, average=None)
micro_f1 = f1_score(y_test, preds, average='micro')

print("AF+SDNE RandomForest Results")
print("Precision:%.3f \nRecall:%.3f \nF1 Score:%.3f"%(prec[1],rec[1],f1[1]))
print("Micro-Average F1 Score:",micro_f1)

AF+SDNE RandomForest Results
Precision:0.886 
Recall:0.697 
F1 Score:0.780
Micro-Average F1 Score: 0.7666117517847335


## RF(SDNE)

In [17]:
# RF(SDNE): random forest trained on the SDNE embedding columns only.
emb = pd.read_csv("/content/drive/MyDrive/SDNE/sdne_feature_e500_b9.txt")
tx_features = ["tx_feat_"+str(i) for i in range(2,95)]
agg_features = ["agg_feat_"+str(i) for i in range(1,73)]
embed_names = ["emb_feat_"+str(i) for i in range(1,65)]
# Join features -> embeddings -> labels on txId, then rename every column
# positionally (this raises if the merged column count does not match).
data = pd.merge(features,emb,on='txId')
data = pd.merge(data,classes,on='txId')
data.columns = ["txId","time_step"] + tx_features + agg_features + embed_names + ['class']

data = ratio_data(data)
# split_data_emb keeps only the embed_names columns as model inputs.
X_train, X_test, y_train, y_test = split_data_emb(data)
# n_jobs=-1 matches the other experiment cells; it only parallelises the fit.
clf = RandomForestClassifier(n_estimators=50, max_features=50,max_depth=100,random_state=0,n_jobs=-1).fit(X_train,y_train)
preds = clf.predict(X_test)
# average=None yields one score per class; index 1 is the second label
# (class 1 when the labels are 0/1).
prec,rec,f1,num = precision_recall_fscore_support(y_test,preds, average=None)
# Label fixed: this run uses embeddings only, not AF+SDNE.
print("SDNE RandomForest Results")
print("Precision:%.3f \nRecall:%.3f \nF1 Score:%.3f"%(prec[1],rec[1],f1[1]))
micro_f1 = f1_score(y_test,preds,average='micro')
print("Micro-Average F1 Score:",micro_f1)

AF+SDNE RandomForest Results
Precision:0.652 
Recall:0.605 
F1 Score:0.628
Micro-Average F1 Score: 0.5733113673805601


## RF(AF+DSDNE)

In [18]:
# RF(AF+DSDNE): random forest on tx_feat_* + agg_feat_* + DSDNE embedding columns.
emb = pd.read_csv("/content/drive/MyDrive/SDNE/dsdne_feature_e500_b9.txt")
tx_features = ["tx_feat_"+str(i) for i in range(2,95)]
agg_features = ["agg_feat_"+str(i) for i in range(1,73)]
embed_names = ["emb_feat_"+str(i) for i in range(1,65)]
# Join features -> embeddings -> labels on txId, then rename every column
# positionally (this raises if the merged column count does not match).
data = pd.merge(features,emb,on='txId')
data = pd.merge(data,classes,on='txId')
data.columns = ["txId","time_step"] + tx_features + agg_features + embed_names + ['class']
data = ratio_data(data)
X_train, X_test, y_train, y_test = split_data(data)

clf = RandomForestClassifier(n_estimators=50, max_features=50,random_state=0,n_jobs=-1).fit(X_train,y_train)
preds = clf.predict(X_test)
# average=None yields one score per class; index 1 is the second label
# (class 1 when the labels are 0/1).
prec,rec,f1,num = precision_recall_fscore_support(y_test,preds, average=None)
# Label fixed: the embeddings loaded above are DSDNE, not SDNE.
print("AF+DSDNE RandomForest Results")
print("Precision:%.3f \nRecall:%.3f \nF1 Score:%.3f"%(prec[1],rec[1],f1[1]))
micro_f1 = f1_score(y_test,preds,average='micro')
print("Micro-Average F1 Score:",micro_f1)


AF+SDNE RandomForest Results
Precision:0.883 
Recall:0.695 
F1 Score:0.778
Micro-Average F1 Score: 0.7638660076880834


## RF(DSDNE)

In [None]:
# RF(DSDNE): random forest trained on the DSDNE embedding columns only.
emb = pd.read_csv("/content/drive/MyDrive/SDNE/dsdne_feature_e500_b9.txt")
tx_features = ["tx_feat_"+str(i) for i in range(2,95)]
agg_features = ["agg_feat_"+str(i) for i in range(1,73)]
embed_names = ["emb_feat_"+str(i) for i in range(1,65)]
# Join features -> embeddings -> labels on txId, then rename every column
# positionally (this raises if the merged column count does not match).
data = pd.merge(features,emb,on='txId')
data = pd.merge(data,classes,on='txId')
data.columns = ["txId","time_step"] + tx_features + agg_features + embed_names + ['class']
# Keep only rows labelled 0 or 1 before rebalancing.
data = data[data['class'].isin([0, 1])]
data = ratio_data(data)
# split_data_emb keeps only the embed_names columns as model inputs.
X_train, X_test, y_train, y_test = split_data_emb(data)
clf = RandomForestClassifier(n_estimators=50, max_features=50,max_depth=100,random_state=0,n_jobs=-1).fit(X_train,y_train)
preds = clf.predict(X_test)
# average=None yields one score per class; index 1 is the second label
# (class 1 when the labels are 0/1).
prec,rec,f1,num = precision_recall_fscore_support(y_test,preds, average=None)
# Label fixed: this run uses DSDNE embeddings only, not AF+SDNE.
print("DSDNE RandomForest Results")
print("Precision:%.3f \nRecall:%.3f \nF1 Score:%.3f"%(prec[1],rec[1],f1[1]))
micro_f1 = f1_score(y_test,preds,average='micro')
print("Micro-Average F1 Score:",micro_f1)


AF+SDNE RandomForest Results
Precision:0.622 
Recall:0.890 
F1 Score:0.732
Micro-Average F1 Score: 0.6128500823723229
