<h2 align="center" style="color:Black"> Геоаналитика </h2>

#### Данное решение является модификацией решения, предоставленного участником <b>Evgeny Kazenov</b>.

<b>Решение:</b>





1. Получаем матрицу признаков по активности клиентов в локациях. Основное отличие от оригинала — здесь: в оригинальном решении на пересечении «пользователь — место» ставится статистика активности или -1. В моём решении, если пересечения не произошло, я пошагово поднимаюсь на менее точный уровень детализации (ячейки S2 уровней с 15 по 7) и, если на этом уровне пересечение произошло, ставлю число, пропорциональное точности (от 0.9 до 0.1). Если пересечения нет, иду дальше, к ещё менее точным уровням; если его нет ни на одном уровне, ставлю -1.
2. Используем кросс-валидацию <b>MultilabelStratifiedKFold</b>
3. Для загрузки решения обучаем и сохраняем 7 моделей <b>XGBClassifier</b>
4. Валидация <b>8.788</b>, Паблик <b>8.8686166</b>

<b>Замечание:</b>
Для получения конечных ячеек были опробованы пакеты <b>h3, geohash, s2cell</b>.
В итоге остановился на <b>s2cell</b>.












In [1]:
from pathlib import Path
import pandas as pd
import h3
import numpy as np
from tqdm import tqdm
#import json
import joblib
from typing import List
from xgboost import XGBClassifier
from statistics import mean
import warnings
warnings.filterwarnings('ignore')
#import geohash
#import geohash_base as gb
from multiprocessing import Pool
import time
import s2cell


In [2]:
# метрика контеста
def mean_binary_cross_entropy(predictuion, target):
    """Contest metric: binary cross-entropy summed over labels, averaged over rows.

    Probabilities and their complements are clipped to [1e-8, 1 - 1e-8]
    before the logarithm is taken, so exact 0/1 predictions stay finite.

    Args:
        predictuion: 2-D array-like of predicted probabilities (rows x labels).
        target: 2-D array-like of 0/1 labels with the same shape.

    Returns:
        The mean over rows of the per-row sum of label-wise cross-entropies.
    """
    eps = 1e-8
    log_p = np.log(np.clip(predictuion, eps, 1 - eps))
    log_not_p = np.log(np.clip(1 - predictuion, eps, 1 - eps))
    row_losses = (-log_p * target - log_not_p * (1 - target)).sum(axis=1)
    return row_losses.mean()

In [3]:
# Input locations (relative to the notebook's working directory).
# NOTE(review): data_root is not referenced anywhere in the visible notebook.
data_root = ''
# Text files that are read line by line into lists of H3 cell ids.
hexses_target_path = '../data/hexses_target.lst'
hexses_data_path = '../data/hexses_data.lst'

# Parquet files with the training transactions and the training targets.
train_data_fn   = '../data/transactions.parquet'
train_target_fn =  '../data/target.parquet'

In [4]:
# Load the target H3 cell ids, one per line, with surrounding whitespace removed.
with open(hexses_target_path, "r") as f:
    hexses_target = [line.strip() for line in f]

In [5]:
# Load the feature H3 cell ids, one per line, with surrounding whitespace removed.
with open(hexses_data_path, "r") as f:
    hexses_data = [line.strip() for line in f]

In [6]:
# Load the training transactions table from parquet.
transactions = pd.read_parquet(train_data_fn)

In [7]:
# Union of the H3 cells from both lists, de-duplicated through a set
# (so the order of the resulting list is arbitrary).
all_possible_hexses = list(set(hexses_data + hexses_target))
# Show the size of the union and of each input list.
len(all_possible_hexses), len(hexses_data), len(hexses_target)

(8157, 8154, 1657)

In [8]:
%%time
# Map every H3 cell id to its list of S2 cell ids, finest level first.
hexses_dic = {}

for hexs_base in tqdm(all_possible_hexses):
    # The coordinates of the H3 cell do not depend on the S2 level,
    # so resolve them once per cell instead of once per level.
    lat, lng = h3.h3_to_geo(hexs_base)
    # S2 levels 15 down to 7: index 0 is the most precise, index 8 the coarsest.
    h_levels = [s2cell.lat_lon_to_cell_id(lat, lng, precision)
                for precision in range(15, 6, -1)]
    hexses_dic[hexs_base] = h_levels
    

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8157/8157 [00:00<00:00, 16766.34it/s]

CPU times: user 490 ms, sys: 748 µs, total: 491 ms
Wall time: 490 ms





In [10]:
len(hexses_dic['8911aa7a177ffff'])

9

In [11]:
#for i in range(len(hexses_dic['8911aa7a177ffff'])):
print(len(hexses_dic['8911aa7a177ffff']))

9


In [12]:
hexses_dic['8911aa7a177ffff'][0]

5095061855404032000

In [13]:
# Persist the H3 -> S2 lookup table in the working directory
# (presumably for reuse when the solution is loaded elsewhere - TODO confirm).
joblib.dump(hexses_dic, f'hexses_dic.pkl')

['hexses_dic.pkl']

In [14]:
hexses_dic['8911aa7a177ffff']

[5095061855404032000,
 5095061854330290176,
 5095061858625257472,
 5095061910164865024,
 5095061841445388288,
 5095061016811667456,
 5095062116323295232,
 5095066514369806336,
 5095048922183761920]

In [15]:
transactions.head()

Unnamed: 0,h3_09,customer_id,datetime_id,count,sum,avg,min,max,std,count_distinct,mcc_code
0,8911aa4c62fffff,1,3,1,3346.65,3346.65,3346.65,3346.65,,1,13
1,8911aa7b5b3ffff,4,3,1,450.0,450.0,450.0,450.0,,1,8
2,8911aa63623ffff,5,3,10,11035.69,1103.569,59.0,3620.18,1190.530333,6,13
3,8911aa48577ffff,9,2,2,628.0,314.0,295.0,333.0,26.870058,2,5
4,8911aa78297ffff,11,2,1,4155.0,4155.0,4155.0,4155.0,,1,10


In [16]:
# Peek at the raw target table (long format: one row per h3_09 / customer_id pair).
# The name `target` is rebound to the pivoted label matrix in the next cell.
target = pd.read_parquet(train_target_fn)
target

Unnamed: 0,h3_09,customer_id
0,8911aa6ac3bffff,23172
1,8911aa7a857ffff,95640
2,8911aa70b97ffff,60350
3,8911aa70b97ffff,69521
4,891181b69abffff,29437
...,...,...
157801,8911aa61cc7ffff,73888
157802,8911aa4ecafffff,81775
157803,8911aa4ecafffff,38508
157804,8911aa63657ffff,14794


In [17]:
# Build the multilabel target matrix: one row per customer_id, one column per
# h3_09 cell, 1.0 where the (customer, cell) pair occurs in the file and 0 elsewhere.
target = (
    pd.read_parquet(train_target_fn)
    .assign(customer_id = lambda x: x.customer_id.astype(int))
    .pipe(lambda x: pd.pivot(x.assign(v = 1.), index='customer_id', columns='h3_09', values='v'))
    .pipe(lambda x: x.reindex(sorted(x.columns), axis=1)) # put the columns in sorted order
    .sort_values(by='customer_id') # sort the rows
    .fillna(0)
)
target.shape

(69337, 1657)

In [18]:
def comb_F(n_entry):
    """Build one feature row for the customer at position ``n_entry``.

    Reads three module-level globals instead of taking them as arguments, so
    that it can be used as the single-argument worker of ``Pool.map``:
    ``dump_list`` (list of ``(customer_id, transactions_frame)`` pairs),
    ``hexses_data`` (ordered list of H3 cells that become feature columns) and
    ``hexses_dic`` (H3 cell -> list of S2 cell ids, finest level first).

    For every cell in ``hexses_data`` the feature is:
      * the summed ``count`` of the customer's transactions in that cell, if
        the customer was active there;
      * otherwise 0.9, 0.8, ..., 0.1 for the finest S2 level (index 0..8 of
        the ``hexses_dic`` entry) at which the cell shares an S2 cell with any
        cell the customer was active in;
      * otherwise -1.

    Args:
        n_entry: index into ``dump_list``.

    Returns:
        ``[customer_id, feature_1, ..., feature_n]`` with one feature per
        entry of ``hexses_data`` (just ``[customer_id]`` when it is empty).

    Raises:
        KeyError: if a visited or feature cell is missing from ``hexses_dic``.
    """
    # Fallback value per S2 level, finest level first.
    level_scores = (0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1)

    customer_id, data = dump_list[n_entry]

    # Total activity per visited cell, computed once instead of filtering the
    # frame again for every feature cell.
    visit_counts = data.groupby('h3_09', observed=True)['count'].sum().to_dict()

    # For each level, the set of S2 cells covering any visited location.
    # Sets make the membership tests below O(1).
    user_cells_by_level = [set() for _ in level_scores]
    for user_loc in visit_counts:
        for level_cells, cell_id in zip(user_cells_by_level, hexses_dic[user_loc]):
            level_cells.add(cell_id)

    loc_feature = []
    for bank_loc in hexses_data:
        if bank_loc in visit_counts:
            # Direct hit: use the summed activity in this location.
            loc_feature.append(visit_counts[bank_loc])
            continue

        # No direct hit: walk from the finest to the coarsest level and stop
        # at the first one shared with a visited location.
        for score, level_cells, cell_id in zip(level_scores,
                                               user_cells_by_level,
                                               hexses_dic[bank_loc]):
            if cell_id in level_cells:
                loc_feature.append(score)
                break
        else:
            loc_feature.append(-1)

    # Assemble the row once, after all features are known.
    return [customer_id] + loc_feature

In [19]:
def count_transform(df,
                    hexses_data,
                    hexses_dic,
                    n_workers=18,
                   ):
    """Build the per-customer location feature matrix.

    Groups ``df`` by ``customer_id`` and computes one row per customer with
    ``comb_F`` in a pool of worker processes. Each feature column holds the
    activity count in a location, a fractional fallback value, or -1 when the
    customer has no activity in or near it (see ``comb_F``).

    Note that ``comb_F`` reads the module-level ``hexses_data`` and
    ``hexses_dic`` rather than the arguments passed here, and it reads the
    customer groups from the module-level ``dump_list`` that this function
    sets. The workers therefore rely on inheriting the parent's globals
    (fork start method).

    Args:
        df: transactions frame with at least ``customer_id``, ``h3_09`` and
            ``count`` columns.
        hexses_data: ordered list of H3 cells; defines the feature columns.
        hexses_dic: H3 cell -> S2 cell ids lookup (unused here, kept for
            interface compatibility).
        n_workers: number of worker processes.

    Returns:
        DataFrame with a ``customer_id`` column followed by one
        ``<cell>_count`` column per entry of ``hexses_data``, sorted by
        ``customer_id``.
    """
    global dump_list

    # One (customer_id, transactions) pair per customer; comb_F looks these up
    # by position.
    dump_list = list(df.groupby("customer_id"))

    # Column names depend only on hexses_data, so build them once.
    col_names = ['customer_id'] + [loc + '_count' for loc in hexses_data]

    # map() blocks until every row is computed, so leaving the context
    # manager afterwards is safe and always releases the workers.
    with Pool(n_workers) as pool:
        rows = pool.map(comb_F, range(len(dump_list)))

    train_data = pd.DataFrame(rows, columns=col_names)
    return train_data.sort_values(by=['customer_id'], ascending=True)


In [20]:
%%time
# Reuse the cached feature matrix when it exists; otherwise build it (the
# recorded run took about 25 minutes of wall time) and cache it for next time.
try:
    train_data = joblib.load('train_data_S2.pkl')
except FileNotFoundError:
    # Only a missing cache triggers a rebuild. Any other failure (corrupt
    # pickle, keyboard interrupt, ...) is surfaced instead of silently
    # starting the long recomputation.
    train_data = count_transform(transactions,
                                 hexses_data,
                                 hexses_dic)
    joblib.dump(train_data, 'train_data_S2.pkl')

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 69337/69337 [00:48<00:00, 1428.41it/s]


CPU times: user 4min 41s, sys: 19.9 s, total: 5min 1s
Wall time: 25min 54s


In [22]:
len(train_data)

69337

In [23]:
train_data

Unnamed: 0,customer_id,89118180927ffff_count,89118180d27ffff_count,891181820abffff_count,891181840a7ffff_count,891181844c3ffff_count,89118184c93ffff_count,891181854b7ffff_count,89118186067ffff_count,8911818610bffff_count,...,8911aa7b6b3ffff_count,8911aa7b6bbffff_count,8911aa7b6c3ffff_count,8911aa7b6c7ffff_count,8911aa7b6cbffff_count,8911aa7b6cfffff_count,8911aa7b6d3ffff_count,8911aa7b6d7ffff_count,8911aa7b6dbffff_count,8911aaccacfffff_count
0,1,0.2,0.2,0.2,0.3,0.3,0.3,0.2,0.3,0.3,...,0.7,0.7,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.2
1,4,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0.7,0.7,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.2
2,5,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.2
3,9,0.2,0.2,0.2,0.3,0.3,0.3,0.2,0.3,0.3,...,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.2
4,11,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0.7,0.7,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69332,98388,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.2
69333,98397,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.2
69334,98409,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.2
69335,98432,0.2,0.2,0.2,0.3,0.3,0.4,0.2,0.3,0.3,...,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.2


In [24]:
# Keep only the feature columns. errors='ignore' makes the cell re-runnable:
# a second execution no longer fails on the already-removed column.
train_data = train_data.drop(columns=['customer_id'], errors='ignore')
train_data

Unnamed: 0,89118180927ffff_count,89118180d27ffff_count,891181820abffff_count,891181840a7ffff_count,891181844c3ffff_count,89118184c93ffff_count,891181854b7ffff_count,89118186067ffff_count,8911818610bffff_count,89118186173ffff_count,...,8911aa7b6b3ffff_count,8911aa7b6bbffff_count,8911aa7b6c3ffff_count,8911aa7b6c7ffff_count,8911aa7b6cbffff_count,8911aa7b6cfffff_count,8911aa7b6d3ffff_count,8911aa7b6d7ffff_count,8911aa7b6dbffff_count,8911aaccacfffff_count
0,0.2,0.2,0.2,0.3,0.3,0.3,0.2,0.3,0.3,0.3,...,0.7,0.7,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.2
1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0.7,0.7,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.2
2,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.2
3,0.2,0.2,0.2,0.3,0.3,0.3,0.2,0.3,0.3,0.3,...,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.2
4,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0.7,0.7,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69332,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.2
69333,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.2
69334,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.2
69335,0.2,0.2,0.2,0.3,0.3,0.4,0.2,0.3,0.3,0.3,...,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.2


In [25]:
from iterstrat.ml_stratifiers import RepeatedMultilabelStratifiedKFold, MultilabelStratifiedKFold
from sklearn.metrics import log_loss

SEED = 42

# Per-fold scores: sklearn log_loss and the contest metric (MBCE).
train_scores = []
val_scores = []

MBCE_train_scores = []
MBCE_val_scores = []

# Features and labels are matched by row position below, so they must at
# least have the same number of rows.
assert len(train_data) == len(target), "train_data and target row counts differ"

rmskf = MultilabelStratifiedKFold(n_splits=7, shuffle=True, random_state=SEED)

for fold, (train_index, test_index) in enumerate(rmskf.split(train_data, target)):

    start = time.time()
    print(f'Fold: {fold}')
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_val = train_data.iloc[train_index], train_data.iloc[test_index]
    y_train, y_val = target.iloc[train_index], target.iloc[test_index]

    # NOTE(review): this call used to pass `depth=4`, which is not an XGBoost
    # parameter name (XGBoost uses `max_depth`), so it was ignored and the
    # trees were grown with XGBoost's default max_depth of 6. The effective
    # value is now spelled out so the reported validation score stays
    # reproducible; set max_depth=4 to get the shallower trees that were
    # presumably intended.
    model = XGBClassifier(n_estimators=400,
                          max_depth=6,
                          nthread = -1,
                          sampling_method='gradient_based',
                          objective="binary:logistic",
                          tree_method='gpu_hist',
                          learning_rate = 0.1,
                          reg_lambda = 100
                         )

    # The validation fold doubles as the early-stopping set.
    model.fit(X_train, y_train,eval_set=[(X_val, y_val)],
          verbose=5,early_stopping_rounds=5)
    joblib.dump(model, f'model_{SEED}_{fold}.pkl')

    train_preds = model.predict_proba(X_train)
    valid_preds = model.predict_proba(X_val)

    train_score = log_loss(y_train, train_preds)
    val_score = log_loss(y_val, valid_preds)

    # Compute the contest metric once per split and reuse it for both the
    # log line and the score list.
    mbce_train = mean_binary_cross_entropy(train_preds, y_train)
    mbce_val = mean_binary_cross_entropy(valid_preds, y_val)

    print(f'Результат log_loss на трейн: {train_score}')
    print(f'Результат log_loss на тесте: {val_score}')
    print('----------->')
    print(f'Результат MBCE на трейн: {mbce_train}')
    print(f'Результат MBCE на тесте: {mbce_val}')

    train_scores.append(train_score)
    val_scores.append(val_score)

    MBCE_train_scores.append(mbce_train)
    MBCE_val_scores.append(mbce_val)

    end = time.time()
    print('fold:', fold, 'time:', round((end-start)/60, 3))
    print()
    print()
    

Fold: 0
TRAIN: [    0     1     2 ... 69333 69334 69336] TEST: [    4     8    13 ... 69321 69331 69335]
[0]	validation_0-logloss:0.11772
[5]	validation_0-logloss:0.07212
[10]	validation_0-logloss:0.04560
[15]	validation_0-logloss:0.02991
[20]	validation_0-logloss:0.02054
[25]	validation_0-logloss:0.01492
[30]	validation_0-logloss:0.01151
[35]	validation_0-logloss:0.00943
[40]	validation_0-logloss:0.00813
[45]	validation_0-logloss:0.00731
[50]	validation_0-logloss:0.00677
[55]	validation_0-logloss:0.00641
[60]	validation_0-logloss:0.00616
[65]	validation_0-logloss:0.00597
[70]	validation_0-logloss:0.00584
[75]	validation_0-logloss:0.00574
[80]	validation_0-logloss:0.00566
[85]	validation_0-logloss:0.00560
[90]	validation_0-logloss:0.00555
[95]	validation_0-logloss:0.00551
[100]	validation_0-logloss:0.00547
[105]	validation_0-logloss:0.00544
[110]	validation_0-logloss:0.00542
[115]	validation_0-logloss:0.00540
[120]	validation_0-logloss:0.00538
[125]	validation_0-logloss:0.00537
[130]	v

In [26]:
# Average the contest metric over the cross-validation folds.
print(f'MBCE на валидации: {sum(MBCE_val_scores)/len(MBCE_val_scores)}')

MBCE на валидации: 8.788004509264512
