In [29]:
import ast
import csv

import numpy as np
import pandas
import sklearn
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [2]:
def convert_dataset(dataset):
    """Turn the serialized ``txData`` column into a numeric feature matrix.

    Every entry of ``dataset['txData']`` is a string containing a Python
    literal dict. It is parsed with ``ast.literal_eval`` (literals only, so no
    code from the CSV is executed). Numeric fields are strings parsed with
    ``int(value, 0)``, which picks the base from the prefix (``0x...`` is hex,
    plain digits are decimal).

    Columns of the returned matrix, in order:
        0. ``from``     reduced modulo 2**30
        1. ``to``       reduced modulo 2**30 (0 when ``to`` is ``None``)
        2. ``gas``
        3. ``gasPrice``
        4. first 10 characters of ``input`` reduced modulo 2**30
           (0 when ``input`` is exactly ``'0x'``)
        5. ``nonce``

    Args:
        dataset: Mapping-like object (e.g. a pandas DataFrame) whose
            ``'txData'`` entry is an iterable of serialized dict strings.

    Returns:
        ``np.ndarray`` of shape ``(n_rows, 6)``. An empty ``txData`` column
        yields an empty ``(0, 6)`` int64 array rather than a 1-D array, so
        the result is always a 2-D feature matrix.

    Raises:
        KeyError: if a parsed record lacks one of the fields listed above.
        ValueError / SyntaxError: if a blob is not a valid Python literal or
            a field is not a parseable integer string.
    """
    # Large identifiers are folded into 30 bits so they fit a machine integer.
    bucket_count = 2 ** 30
    feature_count = 6

    def parse_int(text):
        # Base 0 lets the string prefix decide between hex and decimal.
        return int(text, 0)

    examples = []
    for blob in dataset['txData']:
        txData = ast.literal_eval(blob)
        to_address = txData['to']
        call_data = txData['input']
        examples.append([
            parse_int(txData['from']) % bucket_count,
            (parse_int(to_address) if to_address is not None else 0) % bucket_count,
            parse_int(txData['gas']),
            parse_int(txData['gasPrice']),
            # '0x' alone means there is no payload to take a prefix from.
            (parse_int(call_data[:10]) if call_data != '0x' else 0) % bucket_count,
            parse_int(txData['nonce']),
        ])

    if not examples:
        # np.array([]) would be 1-D with shape (0,); keep the column axis.
        return np.empty((0, feature_count), dtype=np.int64)
    return np.array(examples)


In [3]:
# Read both CSV files from the working directory (train first, then test).
train, test = (pandas.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Numeric feature matrix for the test rows, built from their txData column.
testFeatures = convert_dataset(test)

In [4]:
# Print the column index of each frame, train first, one per line.
print(train.columns, test.columns, sep='\n')

Index(['txHash', 'txData', 'txTrace', 'Label0', 'Label1'], dtype='object')
Index(['txHash', 'txData', 'txTrace'], dtype='object')


In [20]:
# Features come from the parsed txData blobs; the target is the Label0 column.
X = convert_dataset(train)
Y = train['Label0']

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=9,
)


In [21]:
# XGBoost classifier with 50 estimators, trained on the training split.
XGBoost_model = xgb.XGBClassifier(n_estimators=50)
XGBoost_model.fit(X_train, y_train)

# Predict the held-out rows and keep the labels as a plain Python list.
y_pred = XGBoost_model.predict(X_test)
predictions = list(y_pred)





In [22]:
# Score whatever `predictions` currently holds against the held-out labels:
# overall accuracy, the per-class precision/recall/F1 table, and the
# confusion matrix (rows are true classes, columns are predicted classes).
print(accuracy_score(y_true=y_test, y_pred=predictions))
print(classification_report(y_true=y_test, y_pred=predictions))
print(confusion_matrix(y_true=y_test, y_pred=predictions))

0.9212613780965243
              precision    recall  f1-score   support

       False       0.96      0.95      0.95    170406
        True       0.75      0.80      0.77     34703

    accuracy                           0.92    205109
   macro avg       0.85      0.87      0.86    205109
weighted avg       0.92      0.92      0.92    205109

[[161207   9199]
 [  6951  27752]]


In [24]:
# Logistic regression baseline using scikit-learn's default settings.
# NOTE(review): X_train is passed exactly as convert_dataset produced it (raw
# integers, no scaling step in between). That may be hurting this linear
# model -- confirm before drawing conclusions from its scores.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Predict the held-out rows and keep the labels as a plain Python list.
y_pred = logreg.predict(X_test)
predictions = list(y_pred)

In [25]:
# Score whatever `predictions` currently holds against the held-out labels:
# overall accuracy, the per-class precision/recall/F1 table, and the
# confusion matrix (rows are true classes, columns are predicted classes).
print(accuracy_score(y_true=y_test, y_pred=predictions))
print(classification_report(y_true=y_test, y_pred=predictions))
print(confusion_matrix(y_true=y_test, y_pred=predictions))

0.8251856330048901
              precision    recall  f1-score   support

       False       0.84      0.98      0.90    170406
        True       0.39      0.06      0.10     34703

    accuracy                           0.83    205109
   macro avg       0.61      0.52      0.50    205109
weighted avg       0.76      0.83      0.77    205109

[[167288   3118]
 [ 32738   1965]]


In [27]:
# k-nearest-neighbours classifier using scikit-learn's default settings.
# NOTE(review): neighbour distances are computed on the raw integer features
# (no scaling step precedes this cell), so the largest-valued columns
# presumably dominate -- confirm whether that is intended.
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

# Predict the held-out rows and keep the labels as a plain Python list.
y_pred = knn.predict(X_test)
predictions = list(y_pred)

In [28]:
# Score whatever `predictions` currently holds against the held-out labels:
# overall accuracy, the per-class precision/recall/F1 table, and the
# confusion matrix (rows are true classes, columns are predicted classes).
print(accuracy_score(y_true=y_test, y_pred=predictions))
print(classification_report(y_true=y_test, y_pred=predictions))
print(confusion_matrix(y_true=y_test, y_pred=predictions))

0.8701032134133558
              precision    recall  f1-score   support

       False       0.91      0.93      0.92    170406
        True       0.63      0.57      0.60     34703

    accuracy                           0.87    205109
   macro avg       0.77      0.75      0.76    205109
weighted avg       0.87      0.87      0.87    205109

[[158666  11740]
 [ 14903  19800]]


In [None]:
# SVM - Classification
# Kernel SVMs are not scale invariant, and the feature matrix mixes 30-bit
# hashed identifiers with raw gas / gasPrice / nonce integers. Standardise
# every column first. Keeping the scaler inside the pipeline means it is fit
# on X_train only and then re-applied unchanged to X_test.
# NOTE(review): SVC fit time grows at least quadratically with the number of
# samples. With a 75% training split of this dataset (the 25% hold-out alone
# is ~205k rows) this cell may not finish in reasonable time -- consider a
# linear SVM or a subsample if it does not.
svm = make_pipeline(StandardScaler(), SVC())
svm.fit(X_train, y_train)

# Predict the held-out rows and keep the labels as a plain Python list.
y_pred = svm.predict(X_test)
predictions = list(y_pred)

In [None]:
# Score whatever `predictions` currently holds against the held-out labels:
# overall accuracy, the per-class precision/recall/F1 table, and the
# confusion matrix (rows are true classes, columns are predicted classes).
print(accuracy_score(y_true=y_test, y_pred=predictions))
print(classification_report(y_true=y_test, y_pred=predictions))
print(confusion_matrix(y_true=y_test, y_pred=predictions))

In [None]:
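# TODO(review): the submission export below is disabled. `binaryPredictions`
# and `regressionPredictions` are not defined in any cell shown in this
# notebook, so it cannot run as written. Define them (and close the file
# handle, e.g. with a `with open(...)` block) before re-enabling.
# submission = csv.writer(open('submission.csv', 'w', encoding='UTF8'))
# for x, y in zip(binaryPredictions, regressionPredictions):
#   submission.writerow([x, y])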