In [282]:
import pandas as pd
import folium
import matplotlib.pyplot as plt
%matplotlib inline

import json
import numpy as np

from collections import Counter
from folium.plugins import MarkerCluster
from sklearn.cluster import DBSCAN

from utils import *
from FedRank import FedRank

In [283]:
from sklearn.metrics import hinge_loss

In [284]:
# develop mode
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [285]:
# Moscow coordinates
MOW = [55.75222, 37.61556]

r = 0.02
r2 = r**2

In [286]:
# load, clean and group train data
train_df = clean_df('./data/train_set.csv')
train_gr = train_df.groupby('customer_id')

test_df = clean_df('./data/test_set.csv')
test_gr = test_df.groupby('customer_id')

  if self.run_code(code, result):
  if self.run_code(code, result):


### MCC
Подгружаем MCC коды, вычисляем их частотность в тренировочных данных. Выбираем MMC, которые встречаются более 10 000 раз и сохраняем их в mcc_list.

In [287]:
# load mcc codes
mcc_codes = pd.read_csv('./data/mcc_codes.csv', index_col='mcc')
mcc_counts = train_df['mcc'].value_counts().to_frame(name = 'count')
mcc_codes = mcc_counts.merge(mcc_codes, left_index=True, right_index=True)
mcc_list = mcc_codes.query('count >10000').index.values

In [303]:
best_dbscan_params = {'eps': 0.036, 'min_samples': 2}
clusters = get_dbscan_clust(train_gr, best_dbscan_params, mcc_list=mcc_list)

In [304]:
train_df = train_df.merge(clusters, left_index=True, right_index=True, how='inner')
train_df.clust_label.fillna(-2, inplace=True)
t_gr = train_df.groupby(['customer_id','clust_label'])

In [305]:
train_df.shape

(1074568, 20)

### Создаём допольнительные фичи
Проходимся по всем кластерам всех пользователей и считаем удельный вклад каждого MCC по количеству операций.

На выходе получаем таблицу, в которой колонки соответсвуют MCC

In [306]:
### Create features with relative contributions of each MCC code for each cluster of each customer
def get_mcc_features(gr):
    df = gr.mcc.aggregate('value_counts', **{'normalize':True}).unstack(level=-1).fillna(0)
    df.columns = ["mcc_%s" % (n1) for n1 in df.columns.values]
    return df


def get_amount_features(gr):
    agg_dict = {
    'amount': [np.mean, np.max, np.min, 'count', 'std', 'sum']
    }
    c = gr.aggregate(agg_dict)
    newidx = []
    for (n1,n2) in c.columns.ravel():
        newidx.append("%s_%s" % (n1,n2))
    c.columns=newidx
    
    rel_features = c.loc[:,['amount_sum','amount_count']].groupby(level=0).transform(lambda x: x/x.sum())
    rel_features.columns = [x+'_rel' for x in rel_features.columns]
    
    return c.merge(rel_features, left_index=True, right_index=True)

def get_clust_pos(gr):
    agg_dict = {
        'pos_address_lat': np.mean,
        'pos_address_lon': np.mean
        }
    return gr.aggregate(agg_dict)

In [307]:
train_features = get_amount_features(t_gr).merge(get_mcc_features(t_gr), 
                                                        left_index=True, right_index=True)

In [308]:
dist_dict = {
    'home_add_lat': 'median',
    'home_add_lon': 'median',
    'work_add_lat': 'median',
    'work_add_lon': 'median'
    
}

train_hw_pos = train_df.groupby(['customer_id','clust_label']).aggregate(dist_dict)
train_hw_pos = train_hw_pos.merge(get_clust_pos(t_gr), left_index=True, right_index=True)
train_hw_pos.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,home_add_lat,home_add_lon,work_add_lat,work_add_lon,pos_address_lat,pos_address_lon
customer_id,clust_label,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0001f322716470bf9bfc1708f06f00fc,0,44.708,37.775,44.735,37.798,44.728065,37.794945
0001f322716470bf9bfc1708f06f00fc,1,44.708,37.775,44.735,37.798,56.251347,43.446254
0001f322716470bf9bfc1708f06f00fc,2,44.708,37.775,44.735,37.798,0.0,0.0
0007297d86e14bd68bd87b1dbdefe302,-1,55.799,37.388,,,45.102668,38.980771
0007297d86e14bd68bd87b1dbdefe302,0,55.799,37.388,,,55.809137,37.462241
0007297d86e14bd68bd87b1dbdefe302,1,55.799,37.388,,,55.801699,37.403
0007297d86e14bd68bd87b1dbdefe302,2,55.799,37.388,,,55.822604,37.524894
0007297d86e14bd68bd87b1dbdefe302,3,55.799,37.388,,,55.675554,37.505458
0007297d86e14bd68bd87b1dbdefe302,4,55.799,37.388,,,55.748555,37.543497
0007297d86e14bd68bd87b1dbdefe302,5,55.799,37.388,,,55.746633,37.875885


In [309]:
target_dist = get_target_distances(train_hw_pos)
target_rank = target_dist.fillna(999).groupby(level=0).transform(lambda x: (x.rank()==1).astype(int))

In [312]:
target_rank.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,dist_home,dist_work
customer_id,clust_label,Unnamed: 2_level_1,Unnamed: 3_level_1
0001f322716470bf9bfc1708f06f00fc,0,1,1
0001f322716470bf9bfc1708f06f00fc,1,0,0
0001f322716470bf9bfc1708f06f00fc,2,0,0
0007297d86e14bd68bd87b1dbdefe302,-1,0,0
0007297d86e14bd68bd87b1dbdefe302,0,0,0
0007297d86e14bd68bd87b1dbdefe302,1,1,0
0007297d86e14bd68bd87b1dbdefe302,2,0,0
0007297d86e14bd68bd87b1dbdefe302,3,0,0
0007297d86e14bd68bd87b1dbdefe302,4,0,0
0007297d86e14bd68bd87b1dbdefe302,5,0,0


In [313]:
def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

In [340]:
y_pr = pd.DataFrame(index= train_features.index, columns=['y'])
target_rank = target_rank.dist_home.to_frame()
coeffs = np.random.rand(train_features.shape[1], 1)

def step(df, coeffs):
    y_pr.y  = np.matmul(df,coeffs)
    y2 = y_pr.fillna(0).groupby(level=0).transform(lambda x: softmax(x)).\
           merge(target_rank, left_index=True, right_index=True)
    y2 = y2.assign(p = 1 + y2.y - y2.dist_home).assign(inv = np.abs(y2.dist_home-1))
    y2 = y2.assign(p2 = y2.p*y2.inv)
    gr4 = y2.p2.groupby(level = 0)
    y4 = gr4.aggregate('sum')
    y5 = gr4.aggregate('count')
    score = (y4*y5/(y5-1)).sum()
    return score

In [368]:
#Init coeffs
coeffs = 0.1*np.random.rand(feat_num, 1)
score_list = []

In [373]:
coeffs

array([[-0.37264959],
       [ 0.98858322],
       [-0.70325856],
       [ 6.68557042],
       [ 0.70021967],
       [-1.00386307],
       [ 0.33490562],
       [ 1.39314754],
       [ 0.08844006],
       [ 0.10063246],
       [ 0.15938622],
       [ 0.35371481],
       [ 0.21888252],
       [ 0.03019763],
       [ 0.05748139],
       [-0.00825942],
       [-0.0580452 ],
       [ 0.32551507],
       [ 0.11550606],
       [ 0.134479  ],
       [-0.7561023 ]])

In [375]:
coeffs = np.load('coeffs.npy')

In [None]:
# params
eps = 1e-3
learning_rate = 10

feat_num = train_features.shape[1]
s0 = step(train_features, coeffs)
print(s0)


for i in range(int(1e5)):
    # choose random feature and make a step
    feature_id = np.random.randint(feat_num)
    temp_coeffs = coeffs
    temp_coeffs[feature_id] += eps
    
    # calculate score and gradien
    s_new = step(train_features, temp_coeffs)
    
    diff = s_new-s0
    coeffs[feature_id] -= learning_rate*diff
    
    s0 = step(train_features, coeffs)
    print(i, feature_id, s0)
    score_list.append({'score':s0, 'feature':feature_id, 'diff':diff})
    
    if (i % 50 ==0):
        np.save('coeffs.npy',coeffs)
        print('Save coeffs to file')

66590.36419738317
0 7 66590.36284041055
Save coeffs to file
1 14 66590.3588690132
2 18 66590.35707582477
3 7 66590.35571899367
4 13 66590.3492330303
5 6 66590.34553659713
6 7 66590.34418307201
7 19 66590.34380640513
8 6 66590.34011865151
9 14 66590.33615340346
10 9 66590.33448673086
11 10 66590.33234863325
12 4 66590.23753420098
13 7 66590.23618069214
14 12 66590.22769090615
15 16 66590.16315069245
16 13 66590.156692988
17 8 66590.15662730443
18 18 66590.15482447723
19 5 72325.68034789767
20 11 72320.26769424333
21 16 72319.92644075475
22 8 72319.9158810499
23 9 72319.9098725728
24 16 72319.57312933679
25 1 72254.8917896996
26 9 72254.88973383245
27 11 72248.21565939774
28 1 72187.10473661764
29 12 72186.98667110637
30 2 72185.96327156486
31 7 72176.23942107306
32 19 72176.23551841122
33 1 72135.04438772597
34 4 72134.3863018011
35 1 72108.5200038552
36 18 72108.49058546979
37 15 72108.25812338428
38 1 72077.92479505137
39 17 72077.89174331294
40 6 72067.47219467728
41 18 72067.4191243

335 11 66541.87794631874
336 6 66541.87718301639
337 14 66541.87630447406
338 9 66541.87631815542
339 14 66541.87544127529
340 0 66541.87540318561
341 9 66541.87541685477
342 10 66541.87526628126
343 7 66541.87482471425
344 11 66541.73711565281
345 5 66520.77835974794
346 1 66520.74665101529
347 9 66520.73653452465
348 13 66520.73552401253
349 6 66520.73464809835
350 19 66520.73352070765
Save coeffs to file
351 5 66525.05122708547
352 4 66525.03299459894
353 10 66525.03263971239
354 0 66525.01671240339
355 12 66525.01523582742
356 20 66525.01505539886
357 20 66525.01487451722
358 17 66525.00986447028
359 12 66525.00839050139
360 7 66525.0073435549
361 13 66525.00174471887
362 1 66524.88266625261
363 0 66524.86524228482
364 12 66524.8636783917
365 4 66524.84486615143
366 7 66524.84380824958
367 11 66524.81295120626
368 0 66524.79512123307
369 15 66524.79074026429
370 5 66535.6280530715
371 14 66535.62789200917
372 20 66535.62721133094
373 4 66535.61258014516
374 15 66535.60707349004
375

666 18 66523.62942768904
667 14 66523.62944992438
668 10 66523.62867286036
669 6 66523.6268690472
670 5 66535.89747910772
671 19 66535.89730357844
672 18 66535.89594366227
673 12 66535.89493246865
674 0 66535.89441797152
675 1 66535.85315522832
676 11 66535.72196477005
677 18 66535.72061256193
678 8 66535.71974233627
679 19 66535.71956338508
680 11 66535.58908098805
681 15 66535.58505719542
682 14 66535.58396243252
683 14 66535.58287030458
684 7 66535.58230498366
685 2 66535.56921399842
686 7 66535.56864929691
687 0 66535.56827427285
688 12 66535.56726391858
689 17 66535.56377130364
690 8 66535.56289635127
691 4 66535.54901361027
692 9 66535.54903618553
693 1 66535.51031062673
694 16 66535.476568624
695 19 66535.47638715676
696 9 66535.47640836176
697 10 66535.47611397199
698 5 66516.65287343513
699 14 66516.65288807274
700 1 66516.23834635993
Save coeffs to file
701 16 66516.20799524453
702 12 66516.20634005062
703 19 66516.20623124253
704 13 66516.20091436566
705 13 66516.19560106492

997 2 66509.14177326084
998 11 66509.1252597131
999 11 66509.10885977944
1000 4 66509.07671709586
Save coeffs to file
1001 5 66510.4993047858
1002 12 66510.49615980749
1003 5 66534.57516284577
1004 8 66534.57481965472
1005 20 66534.57426138596
1006 14 66534.57376372669
1007 10 66534.57351358121
1008 11 66534.52693181463
1009 1 66534.26863346176
1010 14 66534.26816244618
1011 13 66534.26757502896
1012 14 66534.26710483665
1013 12 66534.26502524657
1014 16 66534.22746576421
1015 9 66534.21178941561
1016 13 66534.21120425586
1017 11 66534.16506163149
1018 14 66534.16461150642
1019 19 66534.16306931751
1020 1 66533.9026033899
1021 8 66533.90234710484
1022 13 66533.90170396956
1023 15 66533.8988458392
1024 4 66533.89024831545
1025 17 66533.87392884473
1026 4 66533.86529419413
1027 11 66533.8195438803
1028 20 66533.818906656
1029 0 66533.73214186505
1030 17 66533.71559459907
1031 6 66533.70897856426
1032 1 66533.44557363243
1033 5 66511.23283317179
1034 3 66505.00996127378
1035 0 66505.00195

1315 13 66519.6497970569
1316 18 66519.64836739466
1317 11 66519.55655405749
1318 1 66519.54544059733
1319 18 66519.5440073309
1320 16 66519.51760774502
1321 12 66519.51368266699
1322 16 66519.48727823803
1323 11 66519.39672834151
1324 2 66519.3965126149
1325 17 66519.38408994598
1326 0 66519.38143420065
1327 19 66519.38134175984
1328 8 66519.38136316292
1329 0 66519.37877588368
1330 1 66519.36755477221
1331 3 66516.90769440736
1332 18 66516.90438348791
1333 15 66516.90133956427
1334 12 66516.8993049911
1335 9 66516.89925278378
1336 16 66516.87473420889
1337 12 66516.87268949022
1338 12 66516.87064192476
1339 15 66516.86760018211
1340 9 66516.86754724727
1341 20 66516.8231257586
1342 13 66516.82144366924
1343 3 66513.84128509792
1344 7 66513.84128613
1345 13 66513.83987803
1346 20 66513.83226182978
1347 3 66513.79995688108
1348 6 66513.7990281335
1349 15 66513.79369533411
1350 13 66513.79231734283
Save coeffs to file
1351 1 66513.73775534476
1352 18 66513.73546512499
1353 17 66513.7248

1633 19 66507.14963897431
1634 1 66506.808099874
1635 12 66506.80112674482
1636 16 66506.76706094659
1637 18 66506.76511240946
1638 10 66506.76400183825
1639 11 66506.74864080908
1640 0 66506.6583679558
1641 14 66506.65823200265
1642 20 66506.65527982949
1643 20 66506.65234829896
1644 6 66506.64875214052
1645 18 66506.64680612832
1646 0 66506.55719549517
1647 10 66506.5560702783
1648 13 66506.55443787869
1649 3 66506.53052475341
1650 19 66506.53054344702
Save coeffs to file
1651 2 66506.50103271211
1652 8 66506.50050723791
1653 9 66506.49952866646
1654 5 66504.93200173494
1655 20 66504.92524195227
1656 19 66504.92523136639
1657 5 66529.1180474826
1658 1 66529.11634229512
1659 18 66529.11537085149
1660 7 66529.11508251268
1661 14 66529.11332771178
1662 17 66529.10209030441
1663 20 66529.07064873725
1664 2 66529.07001824724
1665 3 66529.06125061518
1666 12 66529.06016733884
1667 5 66534.5412543941
1668 9 66534.53899798662
1669 3 66530.61556785442
1670 8 66530.61431390369
1671 10 66530.61

### Обучение и тестирование

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Подбор гиперпараметров для DBSCAN

In [None]:
eps_list = np.logspace(-2, -1, 5)
min_samples_list = [5, 10, 15, 20, 25, 30]

res = {}

for min_samples in min_samples_list:
    for eps in eps_list: 
        res_df = get_dbscan_score(customer_gr, {'leaf_size':5, 'eps': eps, 'min_samples': min_samples})
        score = res_df.query('dist < @r2').shape[0]
        print(f'{eps}, {min_samples}, {score}')
        res.update({(eps, min_samples,):score})

In [None]:
eps_list = np.logspace(-3, -1, 10)
min_samples_list = [1, 2, 3, 4, 5]

res = {}

for min_samples in min_samples_list:
    for eps in eps_list: 
        res_df = get_dbscan_score(customer_gr, {'leaf_size':5, 'eps': eps, 'min_samples': min_samples})
        score = res_df.query('dist < @r2').shape[0]
        print(f'{eps}, {min_samples}, {score}')
        res.update({(eps, min_samples,):score})

Наилучшие результаты получены при eps = 0.036, min_samples = 1

In [None]:
train_pr = get_dbscan_score(train_gr, best_dbscan_params)

In [None]:
train_pr.head()

In [None]:
train_pr.clust_amount.hist(bins = 100)
plt.xlim([0, 20])

Теперь оценим реальный скор, который я могу выбить

In [None]:
0.5* train_pr.query('dist <= @r2').shape[0]/train_pr.shape[0]

На борде скор 0.206375

In [None]:
test_pr = get_dbscan_score(test_gr, best_dbscan_params, calc_dist= False)
test_pr.head()

In [None]:
submit = test_pr.loc[:,['home_post_lat', 'home_post_lon', 'best_post_lat',  'best_post_lon']]
submit.head()

In [None]:
submit.columns = ['best_post_lat', 'best_post_lon',  'home_post_lat', 'home_post_lon', 'dist', 'clust_size']

In [None]:
submit.to_csv('1.csv')

In [201]:
import pandas as pd
import numpy as np

columns = ['col{}'.format(i) for i in range(36)]
x = pd.DataFrame(np.random.random((1062, 36)), columns=columns)
y = pd.DataFrame(np.random.random((36, 36)))

print(np.dot(x, y).shape)
# (1062, 36)

(1062, 36)


In [90]:
y_tr = [1, 0, 0]
y_pr = [1, 0, 0]
sz = len(y_pr)
s = hinge_loss(y_tr, y_pr) * sz/(sz-1)
s

1.0

In [74]:
sz = 4
s * sz/(sz-1)

1.0

3