In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read both input tables up front: the raw training rows and the list of
# accounts that will be scored at the end of the notebook.
train = pd.read_csv('train.csv')
test = pd.read_csv('test_accounts.csv')
(test.shape, train.shape)

((6300, 1), (5826604, 4))

In [3]:
# Derive the per-row fee as gas * gas_price.
# Both operands are parsed from the CSV as int64, and an int64 product wraps
# around silently (no exception) once it exceeds ~9.2e18. Casting to float64
# before multiplying avoids corrupt fees on extreme rows; the per-account mean
# taken later is float anyway, so downstream dtypes are unaffected.
# NOTE(review): gas_fee is now float64 in this frame instead of int64 -- confirm
# the model in knn1.pkl was trained on features built the same way.
train['value'] = train['value'].astype(float)
train['gas_fee'] = train['gas'].astype(float) * train['gas_price'].astype(float)
train.head()

Unnamed: 0,value,gas,gas_price,account,gas_fee
0,0.0,72585,11500000000,a00996,834727500000000
1,0.0,54426,11349723260,a07890,617720038148760
2,0.0,200000,14024584890,a22857,2804916978000000
3,1.089e+17,21000,11349723260,a07890,238344188460000
4,0.0,149999,32000000000,a21390,4799968000000000


In [4]:
# Collapse the table to one row per account holding the mean of every
# remaining column; `account` becomes the index of the result.
account_groups = train.groupby('account')
train = account_groups.mean()
print(train.shape)
print(train.isna().sum())
train.head()

(31491, 4)
value        0
gas          0
gas_price    0
gas_fee      0
dtype: int64


Unnamed: 0_level_0,value,gas,gas_price,gas_fee
account,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a00001,1.916964e+16,73358.818182,52090910000.0,3308872000000000.0
a00002,3.724191e+16,100129.666667,77666670000.0,4954446000000000.0
a00003,4.039974e+19,34800.0,21000000000.0,730800000000000.0
a00004,1.260368e+17,21210.526316,86642110000.0,1842474000000000.0
a00005,1.101349e+17,27860.465116,18348330000.0,450870700000000.0


In [5]:
# Right join on `account` so every row of `test` is kept; accounts without
# aggregated features come back with NaN in the feature columns.
tf = train.merge(test, how='right', on='account')

In [6]:
# Sanity check: the row count of the joined frame should equal that of `test`.
print(tf.shape)
tf.head(n=5)

(6300, 5)


Unnamed: 0,account,value,gas,gas_price,gas_fee
0,a27890,93618280000000.0,86951.111869,3050435000.0,260232500000000.0
1,a29649,7206806000000.0,131479.870246,28501310000.0,4119353000000000.0
2,a28243,49679570000.0,82650.023424,37992080000.0,3067196000000000.0
3,a07155,7471264000000000.0,428157.039715,93162170000.0,3.686012e+16
4,a15576,1003361000000000.0,115034.567974,7041525000.0,816046400000000.0


In [7]:
# Per-column count of missing values introduced by the right join.
tf.isnull().sum()

account      0
value        2
gas          2
gas_price    2
gas_fee      2
dtype: int64

In [8]:
# Impute each feature's missing entries with that feature's mean over `tf`
# (i.e. over the test accounts), one column at a time.
for feature in ['value', 'gas', 'gas_price', 'gas_fee']:
    tf[feature] = tf[feature].fillna(tf[feature].mean())
tf.isna().sum()

account      0
value        0
gas          0
gas_price    0
gas_fee      0
dtype: int64

In [9]:
# Feature matrix for the model: everything in `tf` except the account id.
x = tf.drop(columns=['account'])
x.head(n=5)

Unnamed: 0,value,gas,gas_price,gas_fee
0,93618280000000.0,86951.111869,3050435000.0,260232500000000.0
1,7206806000000.0,131479.870246,28501310000.0,4119353000000000.0
2,49679570000.0,82650.023424,37992080000.0,3067196000000000.0
3,7471264000000000.0,428157.039715,93162170000.0,3.686012e+16
4,1003361000000000.0,115034.567974,7041525000.0,816046400000000.0


In [10]:
# Confirm no missing values remain in the feature matrix before predicting.
x.isnull().sum()

value        0
gas          0
gas_price    0
gas_fee      0
dtype: int64

In [11]:
# Load the previously saved model object from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load knn1.pkl if it comes from a trusted source.
import pickle

# Use a context manager so the file handle is closed deterministically
# (the bare open() passed straight to pickle.load was never closed).
with open('knn1.pkl', 'rb') as model_file:
    knn1 = pickle.load(model_file)

In [12]:
# Run the loaded model on the feature matrix; the displayed result is a 1-D
# float array with one entry per row of `x`.
# NOTE(review): presumably `x` must have the same columns, in the same order,
# as the data the model was fitted on -- confirm against the training notebook.
predictions = knn1.predict(x)
predictions

array([0., 0., 0., ..., 0., 0., 0.])

In [13]:
# Wrap the raw prediction array in a single-column frame named `flag`.
flag_columns = ['flag']
predictions = pd.DataFrame(data=predictions, columns=flag_columns)
predictions.head(n=5)

Unnamed: 0,flag
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


In [14]:
# Put the account ids and their predicted flags side by side. The two frames
# are aligned on their index, so this relies on `predictions` being in the
# same row order as `test`.
res = pd.concat(objs=[test, predictions], axis='columns')
res.head(n=5)

Unnamed: 0,account,flag
0,a27890,0.0
1,a29649,0.0
2,a28243,0.0
3,a07155,0.0
4,a15576,0.0


In [17]:
# Store the flag as an integer rather than a float.
res = res.astype({'flag': int})
res.head(n=5)

Unnamed: 0,account,flag
0,a27890,0
1,a29649,0
2,a28243,0
3,a07155,0
4,a15576,0


In [18]:
# Distribution of predicted flags across the scored accounts.
flag_counts = res['flag'].value_counts()
flag_counts

0    3916
1    2384
Name: flag, dtype: int64

In [19]:
# Write the account/flag pairs to disk without the positional index column.
res.to_csv(path_or_buf='submission.csv', index=False)