In [15]:
from numpy import mean
from numpy import std
import numpy as np
import pandas
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from matplotlib import pyplot
import ast
import csv

In [7]:
def convert_dataset(dataset):
    """Turn the serialized transactions in ``dataset['txData']`` into a numeric array.

    Each entry of ``dataset['txData']`` is a string holding a Python literal
    (parsed with ``ast.literal_eval``) that must provide the keys ``from``,
    ``to``, ``gas``, ``gasPrice``, ``input`` and ``nonce``. Numeric strings are
    parsed with ``int(value, 0)``, so the base is taken from the literal's
    prefix (e.g. ``0x`` for hexadecimal).

    Returns:
        ``numpy`` array with one row per transaction and six columns, in order:
        ``from`` mod 2**30, ``to`` mod 2**30 (0 when ``to`` is None), ``gas``,
        ``gasPrice``, the first 10 characters of ``input`` mod 2**30 (0 when
        ``input`` is exactly ``'0x'``), and ``nonce``.
    """
    modulus = 2 ** 30  # large identifiers are folded into a 30-bit range

    def _row(serialized):
        tx = ast.literal_eval(serialized)
        sender = int(tx['from'], 0)
        if tx['to'] is not None:
            recipient = int(tx['to'], 0)
        else:
            recipient = 0
        gas = int(tx['gas'], 0)
        gas_price = int(tx['gasPrice'], 0)
        if tx['input'] != '0x':
            # '0x' plus 8 hex digits — presumably the call's 4-byte selector.
            selector = int(tx['input'][:10], 0)
        else:
            selector = 0
        nonce = int(tx['nonce'], 0)
        return [
            sender % modulus,
            recipient % modulus,
            gas,
            gas_price,
            selector % modulus,
            nonce,
        ]

    return np.array([_row(serialized) for serialized in dataset['txData']])

In [8]:
def get_models(random_state=None):
    """Build the candidate classifiers to compare, keyed by a short label.

    Args:
        random_state: Optional seed forwarded to the estimators built here that
            accept one and use randomness (the decision tree, and the SVM's
            probability estimation). The default ``None`` leaves them unseeded.

    Returns:
        dict mapping a short label to an unfitted scikit-learn estimator, in
        the order: ``lr``, ``knn``, ``cart``, ``svm``, ``bayes``.
    """
    return {
        'lr': LogisticRegression(),
        'knn': KNeighborsClassifier(),
        'cart': DecisionTreeClassifier(random_state=random_state),
        # probability=True is required: this notebook scores every model with
        # 'neg_log_loss', which needs predict_proba, and SVC only provides
        # predict_proba when probability estimation is enabled. Without it the
        # evaluation raises (error_score='raise') as soon as it reaches 'svm'.
        'svm': SVC(probability=True, random_state=random_state),
        'bayes': GaussianNB(),
    }

In [21]:
def evaluate_model_roc_auc(model, X, y):
    """Cross-validate ``model`` on ``(X, y)`` and return the per-fold ROC AUC scores.

    Uses repeated stratified k-fold (10 splits, 3 repeats, seeded with 1), so
    30 scores are returned. Any fitting/scoring error is re-raised rather than
    replaced with a placeholder score.
    """
    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(
        model,
        X,
        y,
        scoring='roc_auc',
        cv=folds,
        n_jobs=-1,  # use all available cores
        error_score='raise',
    )

In [26]:
def evaluate_model_log_loss(model, X, y):
    """Cross-validate ``model`` on ``(X, y)`` and return the per-fold negative log loss.

    Uses repeated stratified k-fold (10 splits, 3 repeats, seeded with 1), so
    30 scores are returned. Scores are negated log loss (higher is better).
    Any fitting/scoring error is re-raised rather than replaced with a
    placeholder score.
    """
    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(
        model,
        X,
        y,
        scoring='neg_log_loss',
        cv=folds,
        n_jobs=-1,  # use all available cores
        error_score='raise',
    )

In [None]:
# Read both CSV files from the current working directory into DataFrames.
# NOTE(review): `test` is loaded here but not used in the cells shown.
train = pandas.read_csv(filepath_or_buffer='train.csv')
test = pandas.read_csv(filepath_or_buffer='test.csv')

In [16]:
X, y = (
    convert_dataset(train),  # numeric feature matrix parsed from the txData column
    train['Label0'],         # target column used as the classification label
)

In [17]:
# Instantiate the candidate classifiers (dict of short label -> unfitted estimator).
models = get_models()

In [None]:
# Cross-validate every candidate model on both metrics and print
# "mean (std)" for ROC AUC followed by "mean (std)" for negative log loss.
# results1 collects the ROC AUC score arrays, results2 the negative log-loss
# score arrays, and names the matching model labels, all in the same order.
results1, results2, names = [], [], []
for name, model in models.items():
    # Run both evaluations before recording anything: the evaluators re-raise
    # on failure, and appending only after both succeed keeps results1,
    # results2 and names the same length if a model aborts the loop.
    score1 = evaluate_model_roc_auc(model, X, y)
    score2 = evaluate_model_log_loss(model, X, y)
    results1.append(score1)
    results2.append(score2)
    names.append(name)
    print('>%s %.3f (%.3f) %.3f (%.3f)' % (name, mean(score1), std(score1), mean(score2), std(score2)))

>lr 0.732 (0.043) -0.408 (0.020)
>knn 0.881 (0.002) -1.035 (0.018)
>cart 0.839 (0.002) -3.141 (0.023)
