In [282]:
import pandas as pd
import folium
import matplotlib.pyplot as plt
%matplotlib inline

import json
import numpy as np

from collections import Counter
from folium.plugins import MarkerCluster
from sklearn.cluster import DBSCAN

from utils import *
from FedRank import FedRank

In [283]:
from sklearn.metrics import hinge_loss

In [284]:
# develop mode
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [285]:
# Moscow coordinates
MOW = [55.75222, 37.61556]

r = 0.02
r2 = r**2

In [286]:
# load, clean and group train data
train_df = clean_df('./data/train_set.csv')
train_gr = train_df.groupby('customer_id')

test_df = clean_df('./data/test_set.csv')
test_gr = test_df.groupby('customer_id')

  if self.run_code(code, result):
  if self.run_code(code, result):


### MCC
Подгружаем MCC коды, вычисляем их частотность в тренировочных данных. Выбираем MMC, которые встречаются более 10 000 раз и сохраняем их в mcc_list.

In [287]:
# load mcc codes
mcc_codes = pd.read_csv('./data/mcc_codes.csv', index_col='mcc')
mcc_counts = train_df['mcc'].value_counts().to_frame(name = 'count')
mcc_codes = mcc_counts.merge(mcc_codes, left_index=True, right_index=True)
mcc_list = mcc_codes.query('count >10000').index.values

In [303]:
best_dbscan_params = {'eps': 0.036, 'min_samples': 2}
clusters = get_dbscan_clust(train_gr, best_dbscan_params, mcc_list=mcc_list)

In [304]:
train_df = train_df.merge(clusters, left_index=True, right_index=True, how='inner')
train_df.clust_label.fillna(-2, inplace=True)
t_gr = train_df.groupby(['customer_id','clust_label'])

In [305]:
train_df.shape

(1074568, 20)

### Создаём допольнительные фичи
Проходимся по всем кластерам всех пользователей и считаем удельный вклад каждого MCC по количеству операций.

На выходе получаем таблицу, в которой колонки соответсвуют MCC

In [306]:
### Create features with relative contributions of each MCC code for each cluster of each customer
def get_mcc_features(gr):
    df = gr.mcc.aggregate('value_counts', **{'normalize':True}).unstack(level=-1).fillna(0)
    df.columns = ["mcc_%s" % (n1) for n1 in df.columns.values]
    return df


def get_amount_features(gr):
    agg_dict = {
    'amount': [np.mean, np.max, np.min, 'count', 'std', 'sum']
    }
    c = gr.aggregate(agg_dict)
    newidx = []
    for (n1,n2) in c.columns.ravel():
        newidx.append("%s_%s" % (n1,n2))
    c.columns=newidx
    
    rel_features = c.loc[:,['amount_sum','amount_count']].groupby(level=0).transform(lambda x: x/x.sum())
    rel_features.columns = [x+'_rel' for x in rel_features.columns]
    
    return c.merge(rel_features, left_index=True, right_index=True)

def get_clust_pos(gr):
    agg_dict = {
        'pos_address_lat': np.mean,
        'pos_address_lon': np.mean
        }
    return gr.aggregate(agg_dict)

In [307]:
train_features = get_amount_features(t_gr).merge(get_mcc_features(t_gr), 
                                                        left_index=True, right_index=True)

In [308]:
dist_dict = {
    'home_add_lat': 'median',
    'home_add_lon': 'median',
    'work_add_lat': 'median',
    'work_add_lon': 'median'
    
}

train_hw_pos = train_df.groupby(['customer_id','clust_label']).aggregate(dist_dict)
train_hw_pos = train_hw_pos.merge(get_clust_pos(t_gr), left_index=True, right_index=True)
train_hw_pos.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,home_add_lat,home_add_lon,work_add_lat,work_add_lon,pos_address_lat,pos_address_lon
customer_id,clust_label,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0001f322716470bf9bfc1708f06f00fc,0,44.708,37.775,44.735,37.798,44.728065,37.794945
0001f322716470bf9bfc1708f06f00fc,1,44.708,37.775,44.735,37.798,56.251347,43.446254
0001f322716470bf9bfc1708f06f00fc,2,44.708,37.775,44.735,37.798,0.0,0.0
0007297d86e14bd68bd87b1dbdefe302,-1,55.799,37.388,,,45.102668,38.980771
0007297d86e14bd68bd87b1dbdefe302,0,55.799,37.388,,,55.809137,37.462241
0007297d86e14bd68bd87b1dbdefe302,1,55.799,37.388,,,55.801699,37.403
0007297d86e14bd68bd87b1dbdefe302,2,55.799,37.388,,,55.822604,37.524894
0007297d86e14bd68bd87b1dbdefe302,3,55.799,37.388,,,55.675554,37.505458
0007297d86e14bd68bd87b1dbdefe302,4,55.799,37.388,,,55.748555,37.543497
0007297d86e14bd68bd87b1dbdefe302,5,55.799,37.388,,,55.746633,37.875885


In [309]:
target_dist = get_target_distances(train_hw_pos)
target_rank = target_dist.fillna(999).groupby(level=0).transform(lambda x: (x.rank()==1).astype(int))

In [312]:
target_rank.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,dist_home,dist_work
customer_id,clust_label,Unnamed: 2_level_1,Unnamed: 3_level_1
0001f322716470bf9bfc1708f06f00fc,0,1,1
0001f322716470bf9bfc1708f06f00fc,1,0,0
0001f322716470bf9bfc1708f06f00fc,2,0,0
0007297d86e14bd68bd87b1dbdefe302,-1,0,0
0007297d86e14bd68bd87b1dbdefe302,0,0,0
0007297d86e14bd68bd87b1dbdefe302,1,1,0
0007297d86e14bd68bd87b1dbdefe302,2,0,0
0007297d86e14bd68bd87b1dbdefe302,3,0,0
0007297d86e14bd68bd87b1dbdefe302,4,0,0
0007297d86e14bd68bd87b1dbdefe302,5,0,0


In [313]:
def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

In [340]:
y_pr = pd.DataFrame(index= train_features.index, columns=['y'])
target_rank = target_rank.dist_home.to_frame()
coeffs = np.random.rand(train_features.shape[1], 1)

def step(df, coeffs):
    y_pr.y  = np.matmul(df,coeffs)
    y2 = y_pr.fillna(0).groupby(level=0).transform(lambda x: softmax(x)).\
           merge(target_rank, left_index=True, right_index=True)
    y2 = y2.assign(p = 1 + y2.y - y2.dist_home).assign(inv = np.abs(y2.dist_home-1))
    y2 = y2.assign(p2 = y2.p*y2.inv)
    gr4 = y2.p2.groupby(level = 0)
    y4 = gr4.aggregate('sum')
    y5 = gr4.aggregate('count')
    score = (y4*y5/(y5-1)).sum()
    return score

In [356]:
# params
eps = 0.01
learning_rate = 0.01
#Init coeffs


feat_num = train_features.shape[1]
#coeffs = 10*np.random.rand(feat_num, 1)
s0 = step(train_features, coeffs)
print(s0)

score_list = []
for i in range(1e5):
    # choose random feature and make a step
    feature_id = np.random.randint(feat_num)
    temp_coeffs = coeffs
    temp_coeffs[feature_id] += eps
    
    # calculate score and gradien
    s_new = step(train_features, temp_coeffs)
    
    coeffs[feature_id] += learning_rate*(s_new-s0)
    
    s0 = step(train_features, coeffs)
    print(s0)
    score_list.append(s0)
    
    if (i % 50 ==0):
        np.save('coeffs.npy',coeffs)
        print('Save coeffs to file')

66881.38045330271


TypeError: 'float' object cannot be interpreted as an integer

### Обучение и тестирование

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Подбор гиперпараметров для DBSCAN

In [None]:
eps_list = np.logspace(-2, -1, 5)
min_samples_list = [5, 10, 15, 20, 25, 30]

res = {}

for min_samples in min_samples_list:
    for eps in eps_list: 
        res_df = get_dbscan_score(customer_gr, {'leaf_size':5, 'eps': eps, 'min_samples': min_samples})
        score = res_df.query('dist < @r2').shape[0]
        print(f'{eps}, {min_samples}, {score}')
        res.update({(eps, min_samples,):score})

In [None]:
eps_list = np.logspace(-3, -1, 10)
min_samples_list = [1, 2, 3, 4, 5]

res = {}

for min_samples in min_samples_list:
    for eps in eps_list: 
        res_df = get_dbscan_score(customer_gr, {'leaf_size':5, 'eps': eps, 'min_samples': min_samples})
        score = res_df.query('dist < @r2').shape[0]
        print(f'{eps}, {min_samples}, {score}')
        res.update({(eps, min_samples,):score})

Наилучшие результаты получены при eps = 0.036, min_samples = 1

In [None]:
train_pr = get_dbscan_score(train_gr, best_dbscan_params)

In [None]:
train_pr.head()

In [None]:
train_pr.clust_amount.hist(bins = 100)
plt.xlim([0, 20])

Теперь оценим реальный скор, который я могу выбить

In [None]:
0.5* train_pr.query('dist <= @r2').shape[0]/train_pr.shape[0]

На борде скор 0.206375

In [None]:
test_pr = get_dbscan_score(test_gr, best_dbscan_params, calc_dist= False)
test_pr.head()

In [None]:
submit = test_pr.loc[:,['home_post_lat', 'home_post_lon', 'best_post_lat',  'best_post_lon']]
submit.head()

In [None]:
submit.columns = ['best_post_lat', 'best_post_lon',  'home_post_lat', 'home_post_lon', 'dist', 'clust_size']

In [None]:
submit.to_csv('1.csv')

In [201]:
import pandas as pd
import numpy as np

columns = ['col{}'.format(i) for i in range(36)]
x = pd.DataFrame(np.random.random((1062, 36)), columns=columns)
y = pd.DataFrame(np.random.random((36, 36)))

print(np.dot(x, y).shape)
# (1062, 36)

(1062, 36)


In [90]:
y_tr = [1, 0, 0]
y_pr = [1, 0, 0]
sz = len(y_pr)
s = hinge_loss(y_tr, y_pr) * sz/(sz-1)
s

1.0

In [74]:
sz = 4
s * sz/(sz-1)

1.0

3