In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the BitcoinHeist dataset from a path relative to the notebook's
# working directory; column dtypes are left to pandas' inference.
heist_csv = "data/BitcoinHeistData.csv"
bitcoin_heist = pd.read_csv(heist_csv)

In [3]:
# Preview the first five rows to confirm the file parsed into the expected columns.
bitcoin_heist.head(n=5)

Unnamed: 0,address,year,day,length,weight,count,looped,neighbors,income,label
0,111K8kZAEnJg245r2cM6y9zgJGHZtJPy6,2017,11,18,0.008333,1,0,2,100050000.0,princetonCerber
1,1123pJv8jzeFQaCV4w644pzQJzVWay2zcA,2016,132,44,0.000244,1,0,1,100000000.0,princetonLocky
2,112536im7hy6wtKbpH1qYDWtTyMRAcA2p7,2016,246,0,1.0,1,0,2,200000000.0,princetonCerber
3,1126eDRw2wqSkWosjTCre8cjjQW8sSeWH7,2016,322,72,0.003906,1,0,2,71200000.0,princetonCerber
4,1129TSjKtx65E35GiUo4AYVeyo48twbrGX,2016,238,144,0.072848,456,0,1,200000000.0,princetonLocky


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
bitcoin_heist.describe()

Unnamed: 0,year,day,length,weight,count,looped,neighbors,income
count,2916697.0,2916697.0,2916697.0,2916697.0,2916697.0,2916697.0,2916697.0,2916697.0
mean,2014.475,181.4572,45.00859,0.5455192,721.6446,238.5067,2.206516,4464889000.0
std,2.257398,104.0118,58.98236,3.674255,1689.676,966.3217,17.91877,162686000000.0
min,2011.0,1.0,0.0,3.606469e-94,1.0,0.0,1.0,30000000.0
25%,2013.0,92.0,2.0,0.02148438,1.0,0.0,1.0,74285590.0
50%,2014.0,181.0,8.0,0.25,1.0,0.0,2.0,199998500.0
75%,2016.0,271.0,108.0,0.8819482,56.0,0.0,2.0,994000000.0
max,2018.0,365.0,144.0,1943.749,14497.0,14496.0,12920.0,49964400000000.0


In [5]:
# Summary of the object-dtype columns (address, label): count, number of
# distinct values, most frequent value and its frequency.
bitcoin_heist.describe(include=["O"])

Unnamed: 0,address,label
count,2916697,2916697
unique,2631095,29
top,1LXrSb67EaH1LGc6d6kWHq8rgv4ZBQAcpU,white
freq,420,2875284


In [6]:
# Column dtypes as inferred by read_csv (no explicit dtype mapping was given).
bitcoin_heist.dtypes

address       object
year           int64
day            int64
length         int64
weight       float64
count          int64
looped         int64
neighbors      int64
income       float64
label         object
dtype: object

In [7]:
# Display the frame itself; the rendered output is truncated to the first
# and last rows.
# NOTE(review): this largely repeats the head() preview earlier in the
# notebook — consider .tail() or .sample(5) if only the end/random rows matter.
bitcoin_heist

Unnamed: 0,address,year,day,length,weight,count,looped,neighbors,income,label
0,111K8kZAEnJg245r2cM6y9zgJGHZtJPy6,2017,11,18,0.008333,1,0,2,1.000500e+08,princetonCerber
1,1123pJv8jzeFQaCV4w644pzQJzVWay2zcA,2016,132,44,0.000244,1,0,1,1.000000e+08,princetonLocky
2,112536im7hy6wtKbpH1qYDWtTyMRAcA2p7,2016,246,0,1.000000,1,0,2,2.000000e+08,princetonCerber
3,1126eDRw2wqSkWosjTCre8cjjQW8sSeWH7,2016,322,72,0.003906,1,0,2,7.120000e+07,princetonCerber
4,1129TSjKtx65E35GiUo4AYVeyo48twbrGX,2016,238,144,0.072848,456,0,1,2.000000e+08,princetonLocky
...,...,...,...,...,...,...,...,...,...,...
2916692,12D3trgho1vJ4mGtWBRPyHdMJK96TRYSry,2018,330,0,0.111111,1,0,1,1.255809e+09,white
2916693,1P7PputTcVkhXBmXBvSD9MJ3UYPsiou1u2,2018,330,0,1.000000,1,0,1,4.409699e+07,white
2916694,1KYiKJEfdJtap9QX2v9BXJMpz2SfU4pgZw,2018,330,2,12.000000,6,6,35,2.398267e+09,white
2916695,15iPUJsRNZQZHmZZVwmQ63srsmughCXV4a,2018,330,0,0.500000,1,0,1,1.780427e+08,white


In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn import metrics

In [9]:
# Binary target column: 0 where `label` equals 'white', 1 for every other
# label value.
# Uses a vectorised comparison instead of a Python-level loop over every row
# (~2.9M here); the explicit int64 cast keeps the same dtype the
# list-of-ints version produced.
bitcoin_heist["labels"] = bitcoin_heist["label"].ne("white").astype("int64")

In [10]:
# Class balance of the binary target (rows per label value, most frequent first).
label_counts = bitcoin_heist["labels"].value_counts()
label_counts

0    2875284
1      41413
Name: labels, dtype: int64

In [11]:
# Model inputs: the numeric columns only; `address` and the raw `label`
# text are left out.
feature_columns = ['year', 'day', 'length', 'weight', 'count', 'looped', 'neighbors', 'income']

# .loc slices by index label and INCLUDES the end label, so with the default
# integer index this keeps rows 0..200000, i.e. 200,001 rows.
# NOTE(review): the recorded previews show ransomware-labelled rows at the
# start of the file and 'white' rows at the end, and the test split ends up
# with roughly 21% positives versus about 1.4% in the full data — so this
# head slice is not a representative sample. Consider a (stratified) random
# sample instead; confirm before changing, as all metrics below depend on it.
row_window = slice(0, 200000)
X = bitcoin_heist.loc[row_window, feature_columns]
y = bitcoin_heist.loc[row_window, 'labels']

In [12]:
# Keep 80% of the rows for training and hold out the rest for evaluation;
# the fixed random_state makes the shuffle reproducible across runs.
split_parts = train_test_split(X, y, train_size=0.8, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [13]:
# k-nearest-neighbours classifier that votes over the 3 closest training points.
# NOTE(review): the features are passed in unscaled, so large-magnitude
# columns such as `income` presumably dominate the distance metric —
# consider adding a scaler in front of the classifier.
model = KNeighborsClassifier(n_neighbors=3)

In [14]:
# Train on the training split, then predict the held-out rows.
# fit() returns the estimator itself, so the two steps can be chained.
y_pred = model.fit(X_train, y_train).predict(X_test)

In [15]:
# Confusion matrix on the held-out split: rows are true classes, columns
# are predicted classes.
cm_knn = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm_knn)

[[31167   551]
 [ 1632  6651]]


In [16]:
# Per-class precision / recall / F1 table, followed by the single
# support-weighted F1 score for the held-out split.
report_knn = classification_report(y_true=y_test, y_pred=y_pred)
f1_knn = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')

# Print in the same order as before: full report first, then the scalar score.
for summary in (report_knn, f1_knn):
    print(summary)

              precision    recall  f1-score   support

           0       0.95      0.98      0.97     31718
           1       0.92      0.80      0.86      8283

    accuracy                           0.95     40001
   macro avg       0.94      0.89      0.91     40001
weighted avg       0.94      0.95      0.94     40001

0.9439786835250186
