In [1]:
import pandas as pd
import folium
import matplotlib.pyplot as plt
%matplotlib inline

import json
import numpy as np

from collections import Counter
from folium.plugins import MarkerCluster
from sklearn.cluster import DBSCAN

from utils import *
from FedRank import FedRank

In [67]:
from sklearn.metrics import hinge_loss

In [2]:
# develop mode
%load_ext autoreload
%autoreload 2

In [3]:
# Moscow coordinates as [latitude, longitude]
MOW = [55.75222, 37.61556]

# Distance threshold. NOTE(review): units look like coordinate degrees — confirm.
r = 0.02
# Squared threshold; the scoring cells further down compare it against the `dist` column,
# so `dist` is presumably a squared distance — confirm in utils.
r2 = r**2

In [56]:
r2

0.0004

In [4]:
# Load and clean the train and test sets, then group each by customer.
# `clean_df` is presumably provided by the star-import from utils — verify.
train_df = clean_df('./data/train_set.csv')
train_gr = train_df.groupby('customer_id')

test_df = clean_df('./data/test_set.csv')
test_gr = test_df.groupby('customer_id')

  if self.run_code(code, result):
  if self.run_code(code, result):


### MCC
Подгружаем MCC коды, вычисляем их частотность в тренировочных данных. Выбираем MCC, которые встречаются более 10 000 раз, и сохраняем их в mcc_list.

In [5]:
# Load the MCC reference table (indexed by code) and attach train-set frequencies to it
mcc_codes = pd.read_csv('./data/mcc_codes.csv', index_col='mcc')
mcc_counts = train_df['mcc'].value_counts().to_frame(name='count')
mcc_codes = pd.merge(mcc_counts, mcc_codes, left_index=True, right_index=True)
# Keep only the codes that occur more than 10 000 times in the train set
mcc_list = mcc_codes[mcc_codes['count'] > 10000].index.values

In [6]:
mcc_codes.loc[:,['count','edited_description']].head(20)

Unnamed: 0,count,edited_description
5411,391635,"Grocery Stores, Supermarkets"
6011,275228,Financial Institutions – Manual Cash Disbursem...
5814,128771,Fast Food Restaurants
5812,62407,Eating places and Restaurants
5499,44703,Misc. Food Stores – Convenience Stores and Spe...
5541,42296,Service Stations ( with or without ancillary s...
5912,41175,Drug Stores and Pharmacies
4111,24899,Local/Suburban Commuter Passenger Transportati...
5921,18353,"Package Stores – Beer, Wine, and Liquor"
5331,13042,Variety Stores


In [7]:
# DBSCAN settings used for clustering each customer's transactions.
# NOTE(review): the notes in the hyper-parameter section further down report
# min_samples = 1 as the best value, while 2 is used here — confirm which is intended.
best_dbscan_params = {'eps': 0.036, 'min_samples': 2}
# Cluster labels for the train transactions, restricted to the frequent MCC codes in `mcc_list`
# (presumably one `clust_label` per transaction row — verify in utils.get_dbscan_clust).
clusters = get_dbscan_clust(train_gr, best_dbscan_params, mcc_list=mcc_list)

In [8]:
# Attach each transaction's cluster label via an index-aligned left join.
# Any `clust_label` left over from a previous run is dropped first so the cell is idempotent;
# without that, re-running it would produce `clust_label_x` / `clust_label_y` columns.
train_df = (
    train_df
    .drop(columns='clust_label', errors='ignore')
    .merge(clusters, left_index=True, right_index=True, how='left')
)
# Rows that received no label from the join get the sentinel -2, which keeps them apart from
# the -1 label that also appears in the data (presumably DBSCAN noise — confirm).
# The filled column is assigned back explicitly: an in-place fillna on an attribute-accessed
# column is a chained assignment and does not update the frame under pandas copy-on-write.
train_df['clust_label'] = train_df['clust_label'].fillna(-2)
t_gr = train_df.groupby(['customer_id', 'clust_label'])

In [9]:
train_df.head(5)

Unnamed: 0,amount,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon,clust_label
0,2.884034,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851,30.232,5261,,59.844072,30.179153,11606fde0c814ce78e0d726e39a0a5ee,2017-07-15,59.847,30.177,0
1,2.775633,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851,30.232,5261,,59.844072,30.179153,e9647a5e1eacfb06713b6af755ccc595,2017-10-27,59.847,30.177,0
2,3.708368,St Petersburg,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851,30.232,5992,"PR.MARSHALA ZHUKOVA,31St Petersburg190000 7...",59.858198,30.229024,df06c1fcd3718a514535ae822785f716,2017-10-03,59.847,30.177,-2
3,2.787498,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851,30.232,5261,,59.844072,30.179153,6c5e5793ebc984fb72875feffff62854,2017-09-09,59.847,30.177,0
4,2.89251,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851,30.232,5261,,59.844072,30.179153,0576445d74e374c92c0902e612fca356,2017-07-06,59.847,30.177,0


### Создаём дополнительные фичи
Проходимся по всем кластерам всех пользователей и считаем удельный вклад каждого MCC по количеству операций.

На выходе получаем таблицу, в которой колонки соответствуют MCC

In [10]:
### Create features with relative contributions of each MCC code for each cluster of each customer
def get_mcc_features(gr):
    """Build per-group MCC share features.

    For every group of ``gr`` the fraction of rows falling on each ``mcc`` value is
    computed and spread into a wide table: one row per group, one ``mcc_<code>``
    column per code, with 0 where a code does not occur in the group.
    """
    shares = gr['mcc'].value_counts(normalize=True)
    wide = shares.unstack(level=-1).fillna(0)
    wide.columns = ["mcc_{}".format(code) for code in wide.columns.values]
    return wide


def get_amount_features(gr):
    """Aggregate the ``amount`` column per group and add within-customer shares.

    Parameters
    ----------
    gr : pandas DataFrameGroupBy
        Grouped transactions containing an ``amount`` column. The first grouping
        level is used as the unit for the relative features.

    Returns
    -------
    pandas.DataFrame
        One row per group with the columns ``amount_mean``, ``amount_amax``,
        ``amount_amin``, ``amount_count``, ``amount_std``, ``amount_sum`` plus
        ``amount_sum_rel`` and ``amount_count_rel`` — the group's share of the
        sum / count over all groups that share the same first index level.
    """
    # Named aggregation with string reducers pins the output column names. Passing the
    # numpy callables (np.max, np.min, ...) made the names depend on the installed numpy
    # (``amax`` vs ``max``) and is deprecated by recent pandas.
    c = gr['amount'].agg(
        amount_mean='mean',
        amount_amax='max',
        amount_amin='min',
        amount_count='count',
        amount_std='std',
        amount_sum='sum',
    )

    # Share of each group within its first index level (the customer in this notebook).
    totals = c[['amount_sum', 'amount_count']]
    rel_features = totals / totals.groupby(level=0).transform('sum')
    rel_features.columns = [name + '_rel' for name in rel_features.columns]

    return c.merge(rel_features, left_index=True, right_index=True)

def get_clust_pos(gr):
    """Return the mean POS coordinates of every group.

    Parameters
    ----------
    gr : pandas DataFrameGroupBy
        Grouped transactions with ``pos_address_lat`` and ``pos_address_lon`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per group holding the mean of ``pos_address_lat`` and ``pos_address_lon``.
    """
    # String reducer names instead of the np.mean callable, which recent pandas deprecates.
    agg_dict = {
        'pos_address_lat': 'mean',
        'pos_address_lon': 'mean',
    }
    return gr.aggregate(agg_dict)

In [13]:
# Per-(customer, cluster) feature table: amount statistics joined with MCC shares on the index
train_features = pd.merge(
    get_amount_features(t_gr),
    get_mcc_features(t_gr),
    left_index=True,
    right_index=True,
)

In [14]:
train_features

Unnamed: 0_level_0,Unnamed: 1_level_0,amount_mean,amount_amax,amount_amin,amount_count,amount_std,amount_sum,amount_sum_rel,amount_count_rel,mcc_742,mcc_763,...,mcc_8398,mcc_8661,mcc_8699,mcc_8911,mcc_8999,mcc_9211,mcc_9222,mcc_9311,mcc_9399,mcc_9402
customer_id,clust_label,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0001f322716470bf9bfc1708f06f00fc,-2,2.787178,3.143099,2.565649,6,0.241672,16.723068,0.052284,0.060000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0
0001f322716470bf9bfc1708f06f00fc,0,3.309715,4.614833,1.949756,37,0.868873,122.459461,0.382866,0.370000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0
0001f322716470bf9bfc1708f06f00fc,1,2.553616,2.973381,2.347082,7,0.233241,17.875312,0.055887,0.070000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0
0001f322716470bf9bfc1708f06f00fc,2,3.255826,4.481342,1.982628,50,0.754765,162.791306,0.508963,0.500000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0
0007297d86e14bd68bd87b1dbdefe302,-2,2.818547,4.696914,1.939114,32,0.578812,90.193492,0.130006,0.129555,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0
0007297d86e14bd68bd87b1dbdefe302,-1,2.351594,2.351594,2.351594,1,,2.351594,0.003390,0.004049,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0
0007297d86e14bd68bd87b1dbdefe302,0,2.491890,3.197908,1.859394,7,0.534511,17.443233,0.025143,0.028340,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0
0007297d86e14bd68bd87b1dbdefe302,1,2.792455,4.590551,0.706783,81,0.685721,226.188875,0.326031,0.327935,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0
0007297d86e14bd68bd87b1dbdefe302,2,2.820427,3.401110,2.359619,3,0.530993,8.461280,0.012196,0.012146,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0
0007297d86e14bd68bd87b1dbdefe302,3,2.367055,2.712781,1.483227,14,0.352710,33.138769,0.047767,0.056680,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0


In [19]:
train_df.head()

Unnamed: 0,amount,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon,clust_label
0,2.884034,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851,30.232,5261,,59.844072,30.179153,11606fde0c814ce78e0d726e39a0a5ee,2017-07-15,59.847,30.177,0
1,2.775633,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851,30.232,5261,,59.844072,30.179153,e9647a5e1eacfb06713b6af755ccc595,2017-10-27,59.847,30.177,0
2,3.708368,St Petersburg,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851,30.232,5992,"PR.MARSHALA ZHUKOVA,31St Petersburg190000 7...",59.858198,30.229024,df06c1fcd3718a514535ae822785f716,2017-10-03,59.847,30.177,-2
3,2.787498,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851,30.232,5261,,59.844072,30.179153,6c5e5793ebc984fb72875feffff62854,2017-09-09,59.847,30.177,0
4,2.89251,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851,30.232,5261,,59.844072,30.179153,0576445d74e374c92c0902e612fca356,2017-07-06,59.847,30.177,0


In [27]:
# Median of the home / work coordinates within every (customer, cluster) group
dist_dict = dict.fromkeys(
    ['home_add_lat', 'home_add_lon', 'work_add_lat', 'work_add_lon'],
    'median',
)

# Home/work coordinates per cluster, joined with the cluster's mean POS position
train_hw_pos = pd.merge(
    train_df.groupby(['customer_id', 'clust_label']).aggregate(dist_dict),
    get_clust_pos(t_gr),
    left_index=True,
    right_index=True,
)
train_hw_pos.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,home_add_lat,home_add_lon,work_add_lat,work_add_lon,pos_address_lat,pos_address_lon
customer_id,clust_label,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0001f322716470bf9bfc1708f06f00fc,-2,44.708,37.775,44.735,37.798,44.745117,37.7265
0001f322716470bf9bfc1708f06f00fc,0,44.708,37.775,44.735,37.798,44.728065,37.794945
0001f322716470bf9bfc1708f06f00fc,1,44.708,37.775,44.735,37.798,56.251347,43.446254
0001f322716470bf9bfc1708f06f00fc,2,44.708,37.775,44.735,37.798,0.0,0.0
0007297d86e14bd68bd87b1dbdefe302,-2,55.799,37.388,,,56.019628,37.070111
0007297d86e14bd68bd87b1dbdefe302,-1,55.799,37.388,,,45.102668,38.980771
0007297d86e14bd68bd87b1dbdefe302,0,55.799,37.388,,,55.809137,37.462241
0007297d86e14bd68bd87b1dbdefe302,1,55.799,37.388,,,55.801699,37.403
0007297d86e14bd68bd87b1dbdefe302,2,55.799,37.388,,,55.822604,37.524894
0007297d86e14bd68bd87b1dbdefe302,3,55.799,37.388,,,55.675554,37.505458


In [None]:
target_dist = get_target_distances(train_hw_pos)
# Per customer (index level 0) and per distance column, flag with 1 the cluster whose
# distance ranks first; every other cluster gets 0.
# The ranking runs on the NaN-containing frame: a missing distance gets a NaN rank and is
# never flagged. The previous `fillna(999)` sentinel flagged a customer's only cluster as
# the target even when the home/work address was unknown.
# Exact ties for first place share an averaged rank above 1, so none of them is flagged.
target_rank = target_dist.groupby(level=0).rank().eq(1).astype(int)

In [60]:
target_rank

Unnamed: 0_level_0,Unnamed: 1_level_0,dist_home,dist_work
customer_id,clust_label,Unnamed: 2_level_1,Unnamed: 3_level_1
0001f322716470bf9bfc1708f06f00fc,-2,0,0
0001f322716470bf9bfc1708f06f00fc,0,1,1
0001f322716470bf9bfc1708f06f00fc,1,0,0
0001f322716470bf9bfc1708f06f00fc,2,0,0
0007297d86e14bd68bd87b1dbdefe302,-2,0,0
0007297d86e14bd68bd87b1dbdefe302,-1,0,0
0007297d86e14bd68bd87b1dbdefe302,0,0,0
0007297d86e14bd68bd87b1dbdefe302,1,1,0
0007297d86e14bd68bd87b1dbdefe302,2,0,0
0007297d86e14bd68bd87b1dbdefe302,3,0,0


In [91]:
# Sanity check of the hinge loss on one customer's home-cluster labels.
test_user = '0001f322716470bf9bfc1708f06f00fc'
#test_user = '0007297d86e14bd68bd87b1dbdefe302'
y_true = target_rank.loc[test_user,'dist_home']
# Dummy prediction that puts all weight on the customer's first cluster
y_pr = np.zeros(y_true.shape)
y_pr[0] = 1
# The loss is divided by (1 - 1/n_clusters); NOTE(review): this divides by zero for a
# customer with a single cluster.
print(y_true.shape, hinge_loss(y_true, y_pr) /(1-1.0/y_true.shape[0]))

(4,) 1.6666666666666667


In [106]:
softmax(np.array([[1, 0, 0, 0.1]]))

array([[0.46678181, 0.17171943, 0.17171943, 0.18977932]])

In [220]:
# Random starting weights, one per feature column, drawn uniformly from [0, 1).
# A fixed seed makes the scores displayed below reproducible across kernel restarts.
coeffs = np.random.default_rng(42).random((train_features.shape[1], 1))

In [266]:
def softmax(x):
    """Softmax normalised along the first axis.

    The global maximum of ``x`` is subtracted before exponentiation to avoid
    overflow; the exponentials are then divided by their sum over axis 0.
    """
    shifted = x - np.max(x)
    exps = np.exp(shifted)
    return exps / exps.sum(axis=0)

In [269]:
# Linear score per (customer, cluster) row.
# Missing features (e.g. the std of a single-transaction cluster) are zero-filled BEFORE the
# product; otherwise one NaN feature turns the whole row score into NaN, which the old code
# then replaced by 0 and so discarded every other feature of that cluster.
# The frame is converted to a plain array so the product does not depend on how pandas
# dispatches np.matmul for DataFrames.
y_pr = pd.DataFrame(
    train_features.fillna(0).to_numpy() @ coeffs,
    index=train_features.index,
    columns=['y'],
)
# Turn the scores into per-customer probabilities (softmax within index level 0)
y_pr2 = y_pr.groupby(level=0).transform(softmax)
y_pr2

Unnamed: 0_level_0,Unnamed: 1_level_0,y
customer_id,clust_label,Unnamed: 2_level_1
0001f322716470bf9bfc1708f06f00fc,-2,1.928413e-14
0001f322716470bf9bfc1708f06f00fc,0,1.038394e-04
0001f322716470bf9bfc1708f06f00fc,1,2.615784e-14
0001f322716470bf9bfc1708f06f00fc,2,9.998962e-01
0007297d86e14bd68bd87b1dbdefe302,-2,8.666884e-19
0007297d86e14bd68bd87b1dbdefe302,-1,4.889978e-30
0007297d86e14bd68bd87b1dbdefe302,0,9.122089e-27
0007297d86e14bd68bd87b1dbdefe302,1,1.822909e-04
0007297d86e14bd68bd87b1dbdefe302,2,8.393982e-28
0007297d86e14bd68bd87b1dbdefe302,3,7.892106e-25


In [278]:
# Predicted probabilities side by side with the home-cluster indicator (joined on the index)
y3 = pd.merge(y_pr2, target_rank[['dist_home']], left_index=True, right_index=True)

In [280]:
def f(x):
    """Return ``x`` unchanged; identity placeholder passed to ``DataFrame.apply`` below."""
    return x

In [281]:
y3.apply(f, axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,y,dist_home
customer_id,clust_label,Unnamed: 2_level_1,Unnamed: 3_level_1
0001f322716470bf9bfc1708f06f00fc,-2,1.928413e-14,0.0
0001f322716470bf9bfc1708f06f00fc,0,1.038394e-04,1.0
0001f322716470bf9bfc1708f06f00fc,1,2.615784e-14,0.0
0001f322716470bf9bfc1708f06f00fc,2,9.998962e-01,0.0
0007297d86e14bd68bd87b1dbdefe302,-2,8.666884e-19,0.0
0007297d86e14bd68bd87b1dbdefe302,-1,4.889978e-30,0.0
0007297d86e14bd68bd87b1dbdefe302,0,9.122089e-27,0.0
0007297d86e14bd68bd87b1dbdefe302,1,1.822909e-04,1.0
0007297d86e14bd68bd87b1dbdefe302,2,8.393982e-28,0.0
0007297d86e14bd68bd87b1dbdefe302,3,7.892106e-25,0.0


In [117]:
# One random weight per feature column. The size is taken from `train_features` directly:
# the previously used `sz` is not defined in any cell, and the matrix product with
# `train_features` rows needs exactly `train_features.shape[1]` weights.
coeffs = np.random.rand(train_features.shape[1], 1)
coeffs.shape

(252, 1)

In [180]:
# Feature rows of the selected customer (a list key keeps the result a DataFrame)
d = train_features.loc[[test_user],:]
# Raw linear scores, one per cluster. The frame is converted to a plain array first so that
# `s1` is always an ndarray; the positional slice below does not work on a DataFrame.
s1 = d.to_numpy() @ coeffs
s1[:,0]

array([ 24.43966138, 136.04210817,  25.52307444, 179.55342921])

In [169]:
softmax([s1-s1.max()])

array([[[4.31465985e-68],
        [1.26845005e-19],
        [1.27487286e-67],
        [1.00000000e+00]]])

In [170]:
s1-s1.max()

array([[-155.11376783],
       [ -43.51132105],
       [-154.03035477],
       [   0.        ]])

In [159]:
def softmax2(z):
    """Row-wise softmax of a 2-D array.

    Each row is shifted by its own maximum before exponentiation and then
    normalised by its own sum, so every row of the result sums to 1.
    Raises AssertionError when ``z`` is not two-dimensional.
    """
    assert z.ndim == 2
    # keepdims keeps the reduced axis so the per-row values broadcast against z
    row_max = np.max(z, axis=1, keepdims=True)
    exps = np.exp(z - row_max)
    return exps / np.sum(exps, axis=1, keepdims=True)

In [164]:
x2 = np.array([[1.0, 2, 3, 6],  # sample 1
               [2, 4.0, 5, 6],  # sample 2
               [1, 2, 3, 6]]) # sample 1 again(!)

In [165]:
softmax(x2)

array([[0.00626879, 0.01704033, 0.04632042, 0.93037047],
       [0.01203764, 0.08894682, 0.24178252, 0.65723302],
       [0.00626879, 0.01704033, 0.04632042, 0.93037047]])

In [73]:
ranker = FedRank()

In [80]:
# Rank-transform the target distance columns; the output shows `<column>_rank` columns.
# NOTE(review): `p` is not defined in any visible cell — this fails on a fresh kernel;
# confirm which frame it is meant to be.
features = ['dist_home', 'dist_work']
y = ranker.transform(p, features)
y.head(10)

Unnamed: 0,dist_home_rank,dist_work_rank
1,3,3
2,1,2
3,2,1
5,1,2
6,2,2
7,4,2
8,3,2
11,2,2
12,1,1
15,3,2


In [83]:
# Rank-transform the predictor columns; the output shows `<column>_rank` columns.
# NOTE(review): `features` is rebound here (it held the target columns in the cell above),
# and `p` is not defined in any visible cell — confirm its source.
features = ['amount_mean', 'amount_count', 'amount_std']
X = ranker.transform(p, features)
X.head()

Unnamed: 0,amount_mean_rank,amount_count_rank,amount_std_rank
1,2,2,2
2,3,1,3
3,1,2,1
5,4,4,4
6,3,1,3


In [84]:
y.head()

Unnamed: 0,dist_home_rank,dist_work_rank
1,3,3
2,1,2
3,2,1
5,1,2
6,2,2


### Обучение и тестирование

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Подбор гиперпараметров для DBSCAN

In [None]:
# Coarse grid over DBSCAN parameters: 5 eps values between 1e-2 and 1e-1, larger min_samples.
eps_list = np.logspace(-2, -1, 5)
min_samples_list = [5, 10, 15, 20, 25, 30]

# (eps, min_samples) -> number of result rows whose `dist` is below the squared radius r2
res = {}

for min_samples in min_samples_list:
    for eps in eps_list:
        # Uses `train_gr`, the per-customer grouping defined at the top of the notebook and
        # passed to the other get_dbscan_score call; the former `customer_gr` is not
        # defined anywhere, so this cell failed on a fresh kernel.
        res_df = get_dbscan_score(train_gr, {'leaf_size': 5, 'eps': eps, 'min_samples': min_samples})
        score = len(res_df.query('dist < @r2'))
        print(f'{eps}, {min_samples}, {score}')
        res[(eps, min_samples)] = score

In [None]:
# Finer grid over DBSCAN parameters: 10 eps values between 1e-3 and 1e-1, small min_samples.
eps_list = np.logspace(-3, -1, 10)
min_samples_list = [1, 2, 3, 4, 5]

# (eps, min_samples) -> number of result rows whose `dist` is below the squared radius r2
res = {}

for min_samples in min_samples_list:
    for eps in eps_list:
        # Uses `train_gr`, the per-customer grouping defined at the top of the notebook and
        # passed to the other get_dbscan_score call; the former `customer_gr` is not
        # defined anywhere, so this cell failed on a fresh kernel.
        res_df = get_dbscan_score(train_gr, {'leaf_size': 5, 'eps': eps, 'min_samples': min_samples})
        score = len(res_df.query('dist < @r2'))
        print(f'{eps}, {min_samples}, {score}')
        res[(eps, min_samples)] = score

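Наилучшие результаты получены при eps = 0.036, min_samples = 1 (примечание: в ячейке с best_dbscan_params выше задано min_samples = 2 — значения стоит согласовать).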

In [None]:
train_pr = get_dbscan_score(train_gr, best_dbscan_params)

In [None]:
train_pr.head()

In [None]:
train_pr.clust_amount.hist(bins = 100)
plt.xlim([0, 20])

Теперь оценим реальный скор, который я могу выбить

In [None]:
0.5* train_pr.query('dist <= @r2').shape[0]/train_pr.shape[0]

На борде скор 0.206375

In [None]:
test_pr = get_dbscan_score(test_gr, best_dbscan_params, calc_dist= False)
test_pr.head()

In [None]:
submit = test_pr.loc[:,['home_post_lat', 'home_post_lon', 'best_post_lat',  'best_post_lon']]
submit.head()

In [None]:
# NOTE(review): `submit` is built in the previous cell from four columns, but six labels are
# assigned here, which pandas rejects with a length-mismatch ValueError. The first four labels
# also swap the `best_*` and `home_*` names relative to that selection — confirm the intended
# column set and order before writing the submission file.
submit.columns = ['best_post_lat', 'best_post_lon',  'home_post_lat', 'home_post_lon', 'dist', 'clust_size']

In [None]:
submit.to_csv('1.csv')

In [201]:
import pandas as pd
import numpy as np

# Shape check: a (1062, 36) frame times a (36, 36) frame gives a (1062, 36) product
columns = [f'col{i}' for i in range(36)]
x = pd.DataFrame(np.random.random(size=(1062, 36)), columns=columns)
y = pd.DataFrame(np.random.random(size=(36, 36)))

print(np.dot(x, y).shape)
# (1062, 36)

(1062, 36)
