In [1]:
import pandas as pd
import folium
import matplotlib.pyplot as plt
%matplotlib inline

import json
import numpy as np

from collections import Counter
from folium.plugins import MarkerCluster
from sklearn.cluster import DBSCAN

from utils import * 

In [2]:
# Development mode: with autoreload level 2, imported modules (e.g. the local
# `utils`) are reloaded before each cell runs, so edits are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Moscow city-centre coordinates as [latitude, longitude] in decimal degrees.
MOW = [55.75222, 37.61556]

In [4]:
# Load the MCC reference table; the `mcc` column becomes the index so rows can
# be looked up by code.
mcc_codes = pd.read_csv('./data/mcc_codes.csv', index_col='mcc')

In [None]:
# Read the training transactions and correct the misspelled coordinate column
# names ('adress' -> 'address') in one chained expression.
train_df = (
    pd.read_csv('./data/train_set.csv')
    .rename(columns={'pos_adress_lat': 'pos_address_lat',
                     'pos_adress_lon': 'pos_address_lon'})
)
# Replace the `mcc` column with the output of `clean_mcc` (imported from utils).
train_df.loc[:, 'mcc'] = clean_mcc(train_df)
# One group per customer; this is the iterable consumed by get_dbscan_score.
train_gr = train_df.groupby('customer_id')
train_df.head()

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# Load the test transactions, replace `mcc` with the output of `clean_mcc`
# (imported from utils) and group the rows per customer.
# NOTE(review): unlike the train cell, no column rename is applied here, yet
# get_dbscan_score reads `pos_address_lat` / `pos_address_lon` from this frame —
# confirm the test CSV already uses the correctly spelled names.
test_df = pd.read_csv('./data/test_set.csv')
test_df.loc[:,'mcc'] = clean_mcc(test_df)
test_gr = test_df.groupby('customer_id')

In [None]:
# Merchant category code to analyse; the same value is the default `mcc_code`
# argument of get_dbscan_score.
# NOTE(review): 5411 is presumably grocery stores / supermarkets — check `mcc_codes`.
mcc_code = 5411

In [None]:
# Visual sanity check of DBSCAN on a single customer's transactions.
# The cell builds its own inputs (`position`, `dbs`, `labels`) so that it runs on
# a fresh kernel: it takes the first customer in `train_gr` that has at least
# two located transactions with the selected `mcc_code`.
position = None
for _, customer_df in train_gr:
    candidate = (
        customer_df
        .loc[customer_df['mcc'] == mcc_code, ['pos_address_lat', 'pos_address_lon']]
        .dropna()
        .to_numpy()
    )
    if len(candidate) > 1:
        position = candidate
        break
if position is None:
    raise ValueError(f'No customer has two or more located transactions with mcc {mcc_code}')

dbs = DBSCAN(leaf_size=5, eps=0.02, min_samples=2).fit(position)
labels = dbs.labels_

# Boolean mask that is True for DBSCAN core samples.
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[dbs.core_sample_indices_] = True

unique_labels = sorted(set(labels))
# Label -1 marks noise and is not a cluster.
n_clusters = len(unique_labels) - (1 if -1 in unique_labels else 0)
colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]

fig, ax = plt.subplots()
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = (labels == k)

    # Core samples are drawn with large markers ...
    xy = position[class_member_mask & core_samples_mask]
    ax.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
            markeredgecolor='k', markersize=14)

    # ... and non-core (border / noise) samples with small ones.
    xy = position[class_member_mask & ~core_samples_mask]
    ax.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
            markeredgecolor='k', markersize=6)

ax.set(title='Estimated number of clusters: %d' % n_clusters,
       xlabel='latitude', ylabel='longitude')
plt.show()

In [None]:
def dbscan_clust(position, home_pos = np.nan, dbscan_params = None):
    """Cluster points with DBSCAN and return the centre of the largest cluster.

    Parameters
    ----------
    position : array-like of shape (n_points, 2)
        Coordinates to cluster (one row per point).
    home_pos : array-like or NaN, optional
        Reference point passed to ``haversine`` together with the cluster
        centre. When it is missing (None / NaN, the default) no distance is
        computed and ``inf`` is returned instead.
    dbscan_params : dict, optional
        Keyword arguments forwarded to ``DBSCAN``. Defaults to
        ``{'leaf_size': 5, 'eps': 0.02, 'n_jobs': 5}``.

    Returns
    -------
    best_clust : sequence of two numbers
        Mean of the points of the largest cluster, or ``[0, 0]`` when DBSCAN
        labels every point as noise.
    clust_size : int or float
        Number of points in that cluster, or ``nan`` when there is none.
    d : float
        ``haversine(best_clust, home_pos)``, or ``inf`` when there is no
        cluster or no usable ``home_pos``.
    """
    # A None sentinel avoids sharing one mutable default dict between calls.
    if dbscan_params is None:
        dbscan_params = {'leaf_size': 5, 'eps': 0.02, 'n_jobs': 5}

    position = np.asarray(position)
    dbs = DBSCAN(**dbscan_params)
    dbs.fit(position)
    labels = np.asarray(dbs.labels_)

    # Values returned when no cluster is found.
    best_clust = [0, 0]
    clust_size = np.nan
    d = np.inf

    # Negative labels are noise; only real clusters take part in the selection.
    clustered = labels[labels >= 0]
    if clustered.size == 0:
        return best_clust, clust_size, d

    # bincount/argmax picks the most populated label (lowest label on ties).
    # The previous NaN-initialised running maximum could never be exceeded,
    # because any comparison with NaN is False.
    best_label = np.bincount(clustered).argmax()
    members = position[labels == best_label]
    best_clust = members.mean(axis=0)
    clust_size = int(members.shape[0])

    # Only measure the distance when a real reference point was supplied.
    home = np.asarray(home_pos, dtype=float)
    if home.size and not np.isnan(home).any():
        d = haversine(best_clust, home_pos)

    return best_clust, clust_size, d

In [None]:
# DBSCAN keyword arguments kept at module level.
# NOTE(review): no visible cell passes this dict; every call builds its own.
dbscan_params = {'leaf_size':5, 'eps': 0.02, 'n_jobs': 5, 'min_samples': 2}
# Hit radius and its square; later cells count a prediction as correct when
# `dist` is below r2.
# NOTE(review): comparing against r**2 implies `haversine` (from utils) returns a
# squared distance in degrees, which matches the `dist` values shown in the
# outputs — confirm.
r = 0.02
r2 = r**2

def get_dbscan_score(df, dbscan_params, mcc_code = 5411, calc_dist = True):
    """Predict each customer's home as the centre of their largest DBSCAN cluster.

    Parameters
    ----------
    df : iterable of (customer_id, DataFrame) pairs
        Typically ``frame.groupby('customer_id')``. Each frame must contain the
        columns ``mcc``, ``pos_address_lat`` and ``pos_address_lon``;
        ``home_add_lat`` / ``home_add_lon`` are read only when ``calc_dist``
        is true.
    dbscan_params : dict
        Keyword arguments forwarded to ``DBSCAN``.
    mcc_code : int, optional
        Only transactions with this MCC are clustered.
    calc_dist : bool, optional
        When true, the real home position is taken from the data and
        ``haversine(predicted, real)`` is stored in ``dist``; customers without
        any home coordinates are skipped. When false, the real home is filled
        with the placeholder ``[90, 90]`` and ``dist`` is ``inf``.

    Returns
    -------
    pandas.DataFrame
        Indexed by customer id, with columns ``pr_home_lat``, ``pr_home_lon``,
        ``real_home_lat``, ``real_home_lon``, ``dist`` and ``clust_size``.
        Customers with fewer than two located transactions, or whose points are
        all noise, get the placeholders ``90, 90`` / ``inf`` / ``-1``.

    Raises
    ------
    ValueError
        If a customer has more than two distinct home positions.
    """
    columns = ['pr_home_lat', 'pr_home_lon',  'real_home_lat', 'real_home_lon', 'dist', 'clust_size']
    res = {}

    for user_id, data in df:
        test_set = data.query('mcc == @mcc_code')
        # to_numpy() replaces as_matrix(), which was removed in pandas 1.0.
        position = test_set.loc[:, ['pos_address_lat', 'pos_address_lon']].dropna().to_numpy()

        if calc_dist:
            home_rows = test_set.loc[:, ['home_add_lat', 'home_add_lon']].dropna().to_numpy()
            if len(home_rows) == 0:
                continue
            # Validate on the de-duplicated rows *before* reducing to a single
            # one; checking afterwards looks at one lat/lon pair and can never fail.
            home_rows = np.unique(home_rows, axis=0)
            if len(home_rows) > 2:
                raise ValueError('Wrong home position. Check it!!!')
            home_pos = home_rows[0]
        else:
            home_pos = np.array([90, 90])

        # Placeholders used when no cluster can be formed.
        clust_size = -1
        d = np.inf
        best_clust = np.array([90, 90])

        if len(position) > 1:
            dbs = DBSCAN(**dbscan_params)
            dbs.fit(position)
            labels = np.asarray(dbs.labels_)

            # Negative labels are noise; choose the most populated real cluster
            # (lowest label on ties).
            clustered = labels[labels >= 0]
            if clustered.size:
                best_label = np.bincount(clustered).argmax()
                members = position[labels == best_label]
                best_clust = members.mean(axis=0)
                clust_size = int(members.shape[0])
                if calc_dist:
                    d = haversine(best_clust, home_pos)

        res[user_id] = [best_clust[0], best_clust[1], home_pos[0], home_pos[1], d, clust_size]

    # Passing `columns` here keeps the schema even when `res` is empty; assigning
    # six labels to an empty frame afterwards would raise.
    return pd.DataFrame.from_dict(res, orient='index', columns=columns)

### Подбор гиперпараметров для DBSCAN

In [None]:
# Coarse grid search over DBSCAN `eps` and `min_samples` on the training customers.
# The score is the number of customers whose `dist` is below `r2`.
eps_list = np.logspace(-2, -1, 5)
min_samples_list = [5, 10, 15, 20, 25, 30]

res = {}

for min_samples in min_samples_list:
    for eps in eps_list:
        # `train_gr` is the per-customer groupby built when the training data was
        # loaded; the previously used `customer_gr` is not defined in this notebook.
        res_df = get_dbscan_score(train_gr, {'leaf_size': 5, 'eps': eps, 'min_samples': min_samples})
        score = res_df.query('dist < @r2').shape[0]
        print(f'{eps}, {min_samples}, {score}')
        res[(eps, min_samples)] = score

In [None]:
# Finer grid search: wider `eps` range and small `min_samples` values.
# The score is the number of customers whose `dist` is below `r2`.
eps_list = np.logspace(-3, -1, 10)
min_samples_list = [1, 2, 3, 4, 5]

res = {}

for min_samples in min_samples_list:
    for eps in eps_list:
        # `train_gr` is the per-customer groupby built when the training data was
        # loaded; the previously used `customer_gr` is not defined in this notebook.
        res_df = get_dbscan_score(train_gr, {'leaf_size': 5, 'eps': eps, 'min_samples': min_samples})
        score = res_df.query('dist < @r2').shape[0]
        print(f'{eps}, {min_samples}, {score}')
        res[(eps, min_samples)] = score

Наилучшие результаты получены при eps = 0.036, min_samples = 1

In [24]:
# Best combination reported by the grid search; eps = 0.036 is the rounded
# np.logspace(-3, -1, 10) grid point 0.0359.
best_dbscan_params = {'eps': 0.036, 'min_samples': 1}

In [36]:
# Predict home locations for the training customers with the selected parameters;
# calc_dist defaults to True, so `dist` to the real home is computed as well.
train_pr = get_dbscan_score(train_gr, best_dbscan_params)

In [37]:
# Preview predicted vs. real home coordinates for the first customers.
train_pr.head()

Unnamed: 0,pr_home_lat,pr_home_lon,real_home_lat,real_home_lon,dist,clust_size
0001f322716470bf9bfc1708f06f00fc,56.251347,43.446254,44.708,37.775,165.411979,7
0007297d86e14bd68bd87b1dbdefe302,55.800632,37.400692,55.799,37.388,0.000164,40
000b709c6c6fb1e8efcfd95e57c2a9de,55.357739,86.075208,54.994,82.864,10.444162,2
0027a7618d97cc9fbda55fac457eaeb7,55.636283,37.21854,55.742,37.575,0.13824,5
002b9f6e118c54f1292e03d1a04d516e,55.804936,37.501629,55.693,37.594,0.021062,3


Теперь оценим реальный скор, который я могу выбить

In [38]:
# Share of training customers whose predicted home is within `r2` of the real
# one, scaled by 0.5.
# NOTE(review): the 0.5 factor presumably reflects that only the home half of
# the competition metric is predicted here — confirm.
0.5 * len(train_pr.query('dist <= @r2')) / len(train_pr)

0.198006576243321

На лидерборде скор 0.206375

In [39]:
# Predict home locations for the test customers. With calc_dist=False no real
# home is read: the real_home_* columns hold the placeholder 90 and `dist` is inf.
test_pr = get_dbscan_score(test_gr, best_dbscan_params, calc_dist= False)
test_pr.head()

Unnamed: 0,pr_home_lat,pr_home_lon,real_home_lat,real_home_lon,dist,clust_size
00021683ccb416637fe9a4cd35e4606e,55.038795,82.97785,90,90,inf,20
0002d0f8a642272b41c292c12ab6e602,53.199818,50.173374,90,90,inf,21
0004d182d9fede3ba2534b2d5e5ad27e,43.57671,39.733473,90,90,inf,17
0008c2445518c9392cb356c5c3db3392,51.533332,46.023607,90,90,inf,2
000b373cc4969c0be8e0933c08da67e1,56.236275,43.460448,90,90,inf,26


In [33]:
# Build the submission frame from the prediction table. get_dbscan_score names
# its columns pr_home_* / real_home_*, so they are first mapped to the submission
# names by label (labels that are already renamed are left untouched) and then
# the four output columns are selected. Selecting the submission names directly
# from `test_pr` raises KeyError when the notebook is run top to bottom.
submit = (
    test_pr
    .rename(columns={'pr_home_lat': 'best_post_lat', 'pr_home_lon': 'best_post_lon',
                     'real_home_lat': 'home_post_lat', 'real_home_lon': 'home_post_lon'})
    .loc[:, ['home_post_lat', 'home_post_lon', 'best_post_lat', 'best_post_lon']]
)
submit.head()

Unnamed: 0,home_post_lat,home_post_lon,best_post_lat,best_post_lon
00021683ccb416637fe9a4cd35e4606e,90,90,55.038795,82.97785
0002d0f8a642272b41c292c12ab6e602,90,90,53.199818,50.173374
0004d182d9fede3ba2534b2d5e5ad27e,90,90,43.57671,39.733473
0008c2445518c9392cb356c5c3db3392,90,90,51.533332,46.023607
000b373cc4969c0be8e0933c08da67e1,90,90,56.236275,43.460448


In [None]:
# Normalise column names to the submission schema by label. A positional
# `submit.columns = [...]` with six names raises ValueError on the four-column
# `submit` frame; a label-based rename keeps the same old -> new mapping for a
# get_dbscan_score result and is a no-op for columns that are already renamed.
submit = submit.rename(columns={'pr_home_lat': 'best_post_lat', 'pr_home_lon': 'best_post_lon',
                                'real_home_lat': 'home_post_lat', 'real_home_lon': 'home_post_lon'})

In [None]:
# Write the submission to 1.csv; the customer-id index is written as the first
# column (pandas default index=True).
submit.to_csv('1.csv')