In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [17]:
# Select which variant of the data to load. Every later cell branches on this
# value: 'natural' and 'original' share one code path, anything else is
# handled as the synthetic dataset.
dataset = 'synthetic' #natural, original or synthetic

# Paths are written with forward slashes so they resolve on Windows, Linux and
# macOS alike. The previous backslash form relied on invalid escape sequences
# ("\e", "\s", "\l") being passed through verbatim, which emits a warning on
# recent Python versions and only worked as a path separator on Windows.
if dataset == 'natural':
    classes = pd.read_csv("augmented_natural_dataset/elliptic_txs_classes.csv")
    # The features file is read without a header row; columns are named later.
    features = pd.read_csv("augmented_natural_dataset/elliptic_txs_features.csv", header=None)
elif dataset == 'original':
    classes = pd.read_csv("elliptic_bitcoin_dataset/elliptic_txs_classes.csv")
    features = pd.read_csv("elliptic_bitcoin_dataset/elliptic_txs_features.csv", header=None)
else:
    samples = pd.read_csv("augmented_synthetic_dataset/synthetic_illicit_tx.csv")
    data = pd.read_csv("augmented_synthetic_dataset/labelled_tx.csv")
    # Give the labelled frame the same column names as the synthetic samples
    # so the two frames line up column-for-column when concatenated.
    data.columns = samples.columns
    frames = [data, samples]
    result = pd.concat(frames)

In [None]:
# Preview the first rows of whatever was loaded. For the synthetic dataset the
# per-class row counts of the combined frame are shown as well.
if dataset not in ('natural', 'original'):
    display(result.head(5))
    display(result.groupby('class').size())
else:
    display(features.head(5), classes.head(5))

In [19]:
# Build the feature-column name lists. For the natural/original datasets the
# header-less features frame is also given column names, joined with its class
# labels on txId, and the "unknown" label is rewritten to '0'.
if dataset in ('natural', 'original'):
    tx_features = [f"local_feat_{idx}" for idx in range(2, 95)]
    agg_features = [f"agg_feat_{idx}" for idx in range(1, 73)]
    features.columns = ["txId", "time_step", *tx_features, *agg_features]
    # Left join: every feature row is kept, with its class attached where the
    # classes frame has a matching txId.
    features = features.merge(classes, left_on="txId", right_on="txId", how='left')
    features['class'] = features['class'].apply(
        lambda label: '0' if label == "unknown" else label
    )
else:
    local_features = [f"Local_feature_{idx}" for idx in range(1, 94)]
    agg_features = [f"Aggregate_feature_{idx}" for idx in range(1, 73)]

In [20]:
# Remove the transaction id and time-step columns; only the feature columns
# and the class label remain afterwards.
if dataset in ('natural', 'original'):
    features = features.drop(['txId', 'time_step'], axis=1)

In [21]:
# Show the number of rows per class label.
# The expression lives inside an `if` block, so the notebook does not
# auto-display it; display() is called explicitly so the counts are shown.
if (dataset == 'natural' or dataset == 'original'):
    display(features.groupby('class').size())

In [22]:
# Natural dataset only: rewrite the "suspicious" value to "1", then show the
# resulting per-class row counts.
# NOTE(review): presumably "1" is the illicit class, so suspicious rows are
# being folded into it - confirm.
if (dataset == 'natural'):
    features = features.replace("suspicious", "1")
    # Inside an `if` block the notebook does not auto-display a bare
    # expression, so the counts are passed to display() explicitly.
    display(features.groupby('class').size())

In [24]:
# Keep only rows whose class is '1' or '2'. Rows labelled otherwise (the
# unknown transactions) are removed from the dataframe.
if dataset in ('natural', 'original'):
    data = features[features['class'].isin(['1', '2'])]

In [25]:
# Show the per-class row counts of the filtered frame.
# display() is required here: a bare expression nested inside an `if` block is
# evaluated but never rendered by the notebook.
if (dataset == 'natural' or dataset == 'original'):
    display(data.groupby('class').size())

In [26]:
# Pick the source frame, its feature columns and the label value that maps to
# 0. The natural/original data carries string labels ('2'), while the
# synthetic branch compares against the integer 2.
if dataset in ('natural', 'original'):
    labelled, feature_cols, zero_label = data, tx_features + agg_features, '2'
else:
    labelled, feature_cols, zero_label = result, local_features + agg_features, 2

X = labelled[feature_cols]
y = labelled['class']
# Binary target: 0 for rows whose class equals zero_label, 1 for everything else.
y = y.apply(lambda label: 0 if label == zero_label else 1)

# 70/30 split, shuffled with a fixed seed and stratified on the binary target
# so both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=30,
    shuffle=True,
    stratify=y,
)

In [27]:
# Convert the four pandas objects produced by the split into NumPy arrays,
# keeping the same order: train features, test features, train labels,
# test labels.
X_train_array, X_test_array, y_train_array, y_test_array = (
    part.to_numpy() for part in (X_train, X_test, y_train, y_test)
)

In [28]:
import numpy as np
# Reshape the feature arrays to (samples, 1, features) by inserting a
# length-1 middle axis.
# The feature count is read from the LAST axis rather than axis 1, so
# re-running this cell on already-reshaped arrays leaves them unchanged
# instead of raising a reshape error.
X_train_array = np.reshape(X_train_array, (X_train_array.shape[0], 1, X_train_array.shape[-1]))
X_test_array = np.reshape(X_test_array, (X_test_array.shape[0], 1, X_test_array.shape[-1]))

In [29]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Flatten

In [30]:
from keras import backend as K

def recall_m(y_true, y_pred):
    """Return recall for the given labels and predictions.

    Recall is the count of true positives divided by the count of actual
    positives. Both counts are obtained by clipping to [0, 1], rounding and
    summing, so predictions are effectively thresholded by rounding.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted scores.

    Returns:
        true positives / (actual positives + K.epsilon()); the epsilon keeps
        the division defined when there are no positives.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    hit_count = K.sum(hit_mask)
    actual_mask = K.round(K.clip(y_true, 0, 1))
    actual_count = K.sum(actual_mask)
    return hit_count / (actual_count + K.epsilon())

def precision_m(y_true, y_pred):
    """Return precision for the given labels and predictions.

    Precision is the count of true positives divided by the count of
    predicted positives. Both counts are obtained by clipping to [0, 1],
    rounding and summing, so predictions are effectively thresholded by
    rounding.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted scores.

    Returns:
        true positives / (predicted positives + K.epsilon()); the epsilon
        keeps the division defined when nothing is predicted positive.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    hit_count = K.sum(hit_mask)
    flagged_mask = K.round(K.clip(y_pred, 0, 1))
    flagged_count = K.sum(flagged_mask)
    return hit_count / (flagged_count + K.epsilon())

def f1_m(y_true, y_pred):
    """Return the F1 score: the harmonic mean of precision and recall.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted scores.

    Returns:
        2 * (precision * recall) / (precision + recall + K.epsilon()), with
        precision and recall taken from precision_m and recall_m. The epsilon
        keeps the division defined when both are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

In [None]:
# Print the shape of each array, one per line, in the order: train features,
# test features, train labels, test labels.
for split_array in (X_train_array, X_test_array, y_train_array, y_test_array):
    print(split_array.shape)

In [None]:
# Build, train and summarise the classifier: one LSTM layer, dropout, and a
# single sigmoid unit whose output is flattened before the loss is applied.
model = Sequential()
# The input shape is taken from the training array (every axis after the
# sample axis) instead of being hard-coded, so the model follows the data if
# the number of feature columns ever changes.
model.add(LSTM(166, input_shape=X_train_array.shape[1:], return_sequences=True))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
# return_sequences=True keeps the step axis on the LSTM output; Flatten
# collapses everything after the sample axis.
model.add(Flatten())
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[f1_m,precision_m, recall_m])
# NOTE(review): the test split is passed as validation data, so the validation
# figures reported during training are computed on the same data used for the
# final evaluation - confirm this is intended.
model.fit(X_train_array, y_train_array, validation_data=(X_test_array, y_test_array), epochs=1000, batch_size=32)
# summary() prints the table itself and returns None; wrapping it in print()
# emitted a stray "None" line after the table.
model.summary()

In [None]:
# Final evaluation of the model
import statistics

# Evaluate on the test arrays ten times, storing each metric as a percentage,
# then print the mean precision, recall and F1 (in that order).
# NOTE(review): every iteration calls evaluate() with identical arguments on
# the same model, so the ten results are presumably identical - confirm
# whether retraining per run was intended.
f1_score_list, precision_list, recall_list = [], [], []
for r in range(10):
    loss, *fractions = model.evaluate(X_test_array, y_test_array, verbose=0)
    # evaluate() yields the loss followed by the compiled metrics in the order
    # they were unpacked originally (f1, precision, recall); scale each to percent.
    f1_score, precision, recall = (value * 100.0 for value in fractions)
    f1_score_list.append(f1_score)
    precision_list.append(precision)
    recall_list.append(recall)

for score_list in (precision_list, recall_list, f1_score_list):
    print(statistics.mean(score_list))