# Геоаналитика

В качестве данных предоставлена гео-активность (geostream) по части клиентов Банка за 12 месяцев.  
    

Цель: предсказать для каждого пользователя взятие/ невзятие каждого из четырех продуктов **в течение месяца после отчетной даты**, исторические данные по ним находятся в targets

## Данные
Гео данные представляют собой гео-активность (geostream) по части клиентов Банка за 12 месяцев.  


### Geostream
|title|description|
|---|---|
|client_id|id клиента|
|event_time|Дата-время события|
|geohash_4|Геохеш уровня 4|
|geohash_5|Геохеш уровня 5|
|geohash_6|Геохеш уровня 6|

In [2]:
import pandas as pd
from pandas.api.types import is_float_dtype, is_integer_dtype
import numpy as np
from datetime import datetime
from dateutil.relativedelta import relativedelta
from scipy.stats import skew, mode
import gc
import glob
import pyarrow.parquet as pq
from tqdm import trange, tqdm

  from pandas.core import (


In [3]:
from typing import List

In [4]:
# Включаем tqdm для pandas, чтобы можно было запускать progress_apply() вместо простого apply()
tqdm.pandas() 
pd.options.display.max_columns = None
pd.options.display.max_rows = 200

In [5]:
PATH = ''
PATH_DATASET = PATH + 'datasets/sber_source/'
PATH_DATASET_OUTPUT = PATH + 'datasets/'
PATH_DATASET_GEO_TRAIN = PATH_DATASET + 'geo_train.parquet/'
PATH_DATASET_GEO_TEST = PATH_DATASET + 'geo_test.parquet/'

PATH_DATASET_TARGET_TRAIN = PATH_DATASET + 'train_target.parquet/'
PATH_DATASET_TARGET_TEST = PATH_DATASET + 'test_target_b.parquet/'


In [6]:
# Файлы ГЕО паркетов тест и трейн
train_geo_files = glob.glob(PATH_DATASET_GEO_TRAIN + '/*.parquet')
test_geo_files = glob.glob(PATH_DATASET_GEO_TEST + '/*.parquet')

# Файлы таргеты
train_target_files = glob.glob(PATH_DATASET_TARGET_TRAIN + '/*.parquet')
test_target_files = glob.glob(PATH_DATASET_TARGET_TEST + '/*.parquet')

len(train_geo_files), len(test_geo_files)

(31, 6)

In [7]:
%%time
# загружаем сэмплерированные данные, для которых будем рассчитывать фичи
smpl_Client_Month_df = pq.read_table(PATH_DATASET_OUTPUT + 'result_sample_Client_Month_df_12_06_2024.parquet').to_pandas()
smpl_Client_Month_df = smpl_Client_Month_df.set_index(['client_id', 'report_next_end'])
smpl_Client_Month_df.shape

CPU times: total: 1.89 s
Wall time: 2.52 s


(747847, 1)

In [8]:
# Загрузка списка файлов (типа паркет) в один датафрейм
def load_df_by_files(files:list[str]) -> pd.DataFrame:
    union_df = pd.DataFrame()
    for file in tqdm(files):
        current_df = pq.read_table(file).to_pandas()    
        union_df = pd.concat([union_df, current_df])
    return union_df

In [11]:
%%time
# Загружаем все таргеты
all_target_df = load_df_by_files(train_target_files + test_target_files)
all_target_df.shape

100%|██████████| 22/22 [00:06<00:00,  3.23it/s]

CPU times: total: 7.17 s
Wall time: 6.81 s





(11686066, 6)

In [12]:
all_target_df = all_target_df.drop_duplicates(subset=['mon', 'client_id'])
all_target_df = all_target_df.rename(columns={'mon': 'report_next_end'})
all_target_df.shape

(11654375, 6)

In [1]:
%%time
# Загружаем все гео
all_geo_df = load_df_by_files(train_geo_files + test_geo_files)
all_geo_df.shape

In [None]:
gc.collect()

## Блок расчета статистики, Определяем клиентов по которым нет данных вообще

In [32]:
%%time
buy_clinets_df = all_target_df.groupby('client_id').agg(
    target_1 = ('target_1', max),
    target_2 = ('target_2', max),
    target_3 = ('target_3', max),
    target_4 = ('target_4', max),
).reset_index()
buy_clinets_df['is_target'] = buy_clinets_df[['target_1', 'target_2', 'target_3', 'target_4']].max(axis=1)
buy_clinets_df



CPU times: total: 4.7 s
Wall time: 5.43 s


Unnamed: 0,client_id,target_1,target_2,target_3,target_4,is_target
0,000006265d27d1166ed67506682be7380007a5bead4362...,0,0,1,0,1
1,00000c9536a42b45ca93288862cddcbb52a3e1e76f8684...,0,0,0,0,0
2,00002ba5d2cf0fe2e9ad4a4397f4634f26e84f58ed48e3...,0,0,0,0,0
3,000030a4067420da425d21ea72d5e647d26cf279e55179...,0,0,0,0,0
4,00004427740977a56f391bc2bbc636803ed933205228f8...,0,0,0,0,0
...,...,...,...,...,...,...
994375,ffffa99ee602d379ea65e0fbdbfb0c82ed074e28cd3ada...,0,0,0,0,0
994376,ffffa9af8a057b55b18af946e157391cd9f5a5fd9b61cb...,0,0,0,1,1
994377,ffffab5f6ae1c8d04d83ef12e2ad803298737992698079...,1,0,0,0,1
994378,ffffda41a92ae10c8ae3920828129ef09c1517ba7c74cb...,0,0,0,0,0


In [33]:
target_clients_class_1 = set(buy_clinets_df[buy_clinets_df['target_1'] == 1]['client_id'].unique())
target_clients_class_2 = set(buy_clinets_df[buy_clinets_df['target_2'] == 1]['client_id'].unique())
target_clients_class_3 = set(buy_clinets_df[buy_clinets_df['target_3'] == 1]['client_id'].unique())
target_clients_class_4 = set(buy_clinets_df[buy_clinets_df['target_4'] == 1]['client_id'].unique())
target_clients_class_any = set(buy_clinets_df[buy_clinets_df['is_target'] == 1]['client_id'].unique())
target_clients_class_null_2 = set(buy_clinets_df['client_id'].unique()) - target_clients_class_any
target_clients_class_null = set(buy_clinets_df[buy_clinets_df['is_target'] == 0]['client_id'].unique())
target_clients = set(buy_clinets_df['client_id'].unique())

len(target_clients_class_1), len(target_clients_class_2), len(target_clients_class_3), len(target_clients_class_4), len(target_clients_class_any), len(target_clients_class_null), len(target_clients_class_null_2), len(target_clients)

(93048, 10056, 80481, 48172, 199770, 794610, 794610, 994380)

In [34]:
# set(target_clients) - set(pd.read_csv('list_uniq_client_target.csv'))
# target_clients
# target_clients.to_csv('list_uniq_client_target.csv', index=False)

In [35]:
geo_clients = set(pd.read_csv('list_uniq_client_geo.csv')['client_id'].unique())
dialog_clients = set(pd.read_csv('list_uniq_client_dialog.csv')['client_id'].unique())
target_clients = set(pd.read_csv('list_uniq_client_target.csv')['client_id'].unique())
trx_clients = set(pd.read_csv('list_uniq_client_trx.csv')['client_id'].unique())
submitе_clients = set(pd.read_csv('sample_submission.csv')['client_id'].unique())
# ((789051, 1), (408228, 1), (994380, 1), (1081371, 1), (140488, 6))
len(geo_clients), len(dialog_clients), len(target_clients), len(trx_clients), len(submitе_clients)

(789051, 408228, 994380, 1081371, 140488)

In [36]:
all_clients = set(geo_clients | dialog_clients | target_clients | trx_clients | submitе_clients)
all_clients_with_out_submit = set(geo_clients | dialog_clients | target_clients | trx_clients )

print(f'geo_clients: {len(geo_clients)}')
print(f'dialog_clients: {len(dialog_clients)}')
print(f'target_clients: {len(target_clients)}')
print(f'trx_clients: {len(trx_clients)}')
print(f'submitе_clients: {len(submitе_clients)}')
print(f'all_clients: {len(all_clients)}')
print(f'all_clients_with_out_submit: {len(all_clients_with_out_submit)}')

geo_clients: 789051
dialog_clients: 408228
target_clients: 994380
trx_clients: 1081371
submitе_clients: 140488
all_clients: 1439716
all_clients_with_out_submit: 1439716


In [37]:
clients_dict = {'geo_clients': geo_clients, 'dialog_clients': dialog_clients, 'target_clients': target_clients, 'trx_clients': trx_clients, 'submitе_clients': submitе_clients, }
for f_type_cl in clients_dict:
    for s_type_cl in clients_dict:
        # print(f'{f_type_cl} - {s_type_cl} => {len(clients_dict[f_type_cl]& clients_dict[s_type_cl])}')
        pass
#         print()

In [38]:
clients_first = {'target_1': target_clients_class_1, 'target_2': target_clients_class_2, 'target_3': target_clients_class_3, 'target_4': target_clients_class_4}
clients_dict = {'geo_clients': geo_clients, 'dialog_clients': dialog_clients, 'target_clients': target_clients, 'trx_clients': trx_clients, 'submitе_clients': submitе_clients, }
for f_type_cl in clients_first:
# for f_type_cl in clients_dict:
    for s_type_cl in clients_dict:
        # print(f'{f_type_cl} - {s_type_cl} => {len(clients_first[f_type_cl]& clients_dict[s_type_cl])}')
        pass
#         print()

In [21]:
clients_dict_2 = {'geo_clients': geo_clients, 'dialog_clients': dialog_clients, 'target_clients': target_clients, 'trx_clients': trx_clients, 'submiе_clients': submiе_clients, }
clients_dict = {'geo_clients': geo_clients, 'dialog_clients': dialog_clients, 'trx_clients': trx_clients, }
for f_type_cl in clients_dict_2:
    check_set = clients_dict_2[f_type_cl].copy()
    for s_type_cl in clients_dict:
#         print(f'{f_type_cl} - {s_type_cl} => {len(clients_dict[f_type_cl]& clients_dict[s_type_cl])}')
        check_set -= clients_dict[s_type_cl]
    print(f'{f_type_cl}: {len(check_set)}')

NameError: name 'submiе_clients' is not defined

In [None]:
# len(geo_clients - trx_clients - geo_clients)
len(target_clients - trx_clients - geo_clients - dialog_clients)
# submiе_clients
# 29327
# 129477

#### Определяем клиентов по которым нет данных вообще

In [39]:
len(target_clients_class_any), len(target_clients_class_null)

(199770, 794610)

In [40]:
print(f'geo_clients: {len(geo_clients)}')
print(f'dialog_clients: {len(dialog_clients)}')
print(f'target_clients: {len(target_clients)}')
print(f'trx_clients: {len(trx_clients)}')
print(f'submitе_clients: {len(submitе_clients)}')
print(f'all_clients: {len(all_clients)}')
print(f'all_clients_with_out_submit: {len(all_clients_with_out_submit)}')

geo_clients: 789051
dialog_clients: 408228
target_clients: 994380
trx_clients: 1081371
submitе_clients: 140488
all_clients: 1439716
all_clients_with_out_submit: 1439716


In [41]:
# Определяем клиентов по которым нет таргетов, таких надо выбрасывать из обучения сразу
print(f'geo_clients - all_client: {len(geo_clients - all_clients)}')
print(f'dialog_clients - all_client: {len(dialog_clients - all_clients)}')
print(f'target_clients - all_client: {len(target_clients - all_clients)}')
print(f'trx_clients - all_client: {len(trx_clients - all_clients)}')
print(f'submitе_clients - all_client: {len(submitе_clients - all_clients)}')
print(f'all_clients - all_client: {len(all_clients - all_clients)}')
print(f'all_clients_with_out_submit - all_client: {len(all_clients_with_out_submit - all_clients)}')

geo_clients - all_client: 0
dialog_clients - all_client: 0
target_clients - all_client: 0
trx_clients - all_client: 0
submitе_clients - all_client: 0
all_clients - all_client: 0
all_clients_with_out_submit - all_client: 0


In [43]:
len(target_clients_class_any | target_clients_class_null)

AttributeError: 'set' object has no attribute '__name__'

In [57]:
not_geo = all_clients - geo_clients
not_trx = all_clients - trx_clients
not_dlg = all_clients - dialog_clients
not_trg = all_clients - target_clients 

not_geo_trx = (all_clients - geo_clients) - trx_clients
not_geo_dlg = (all_clients - geo_clients) - dialog_clients
not_dlg_trx = (all_clients - dialog_clients) - trx_clients
not_geo_dlg_trx = ((all_clients - geo_clients) - dialog_clients) - trx_clients


type_data = 'not_geo'
# type_data = 'trg'
select_set_clients = not_geo.copy()
print(f'Всего: {len(select_set_clients)}')
print(f'Нет в {type_data} и они НЕ покупали любой Продукт: {len(select_set_clients - target_clients_class_any)}')
print(f'Нет в {type_data} и они покупали любой Продукт: {len(target_clients_class_any & select_set_clients)}')
print(f'Нет в {type_data} и они НЕ покупали Продукт 1: {len(select_set_clients - target_clients_class_1)}')
print(f'Нет в {type_data} и они покупали Продукт 1: {len(target_clients_class_1 & select_set_clients)}')
print(f'Нет в {type_data} и они НЕ покупали Продукт 2: {len(select_set_clients - target_clients_class_2)}')
print(f'Нет в {type_data} и они покупали Продукт 2: {len(target_clients_class_2 & select_set_clients)}')
print(f'Нет в {type_data} и они НЕ покупали Продукт 3: {len(select_set_clients - target_clients_class_3)}')
print(f'Нет в {type_data} и они покупали Продукт 3: {len(target_clients_class_3 & select_set_clients)}')
print(f'Нет в {type_data} и они НЕ покупали Продукт 4: {len(select_set_clients - target_clients_class_4)}')
print(f'Нет в {type_data} и они покупали Продукт 4: {len(target_clients_class_4 & select_set_clients)}')


# print(f'Нет в {type_data} и они НЕ покупали любой Продукт: {len(target_clients_class_null & select_set_clients)}')
len(select_set_clients), len(select_set_clients - target_clients_class_any), len(target_clients_class_any & select_set_clients), len(select_set_clients - target_clients_class_1), len(target_clients_class_1 & select_set_clients), len(select_set_clients - target_clients_class_2), len(target_clients_class_2 & select_set_clients), len(select_set_clients - target_clients_class_3), len(target_clients_class_3 & select_set_clients), len(select_set_clients - target_clients_class_4), len(target_clients_class_4 & select_set_clients), 

Всего: 650665
Нет в not_geo и они НЕ покупали любой Продукт: 610078
Нет в not_geo и они покупали любой Продукт: 40587
Нет в not_geo и они НЕ покупали Продукт 1: 630174
Нет в not_geo и они покупали Продукт 1: 20491
Нет в not_geo и они НЕ покупали Продукт 2: 648693
Нет в not_geo и они покупали Продукт 2: 1972
Нет в not_geo и они НЕ покупали Продукт 3: 635912
Нет в not_geo и они покупали Продукт 3: 14753
Нет в not_geo и они НЕ покупали Продукт 4: 641812
Нет в not_geo и они покупали Продукт 4: 8853


(650665,
 610078,
 40587,
 630174,
 20491,
 648693,
 1972,
 635912,
 14753,
 641812,
 8853)

In [None]:
# pd.DataFrame(not_trx, columns=['client_id']).to_csv('client_without_trx.csv', index=False)
# pd.DataFrame(not_geo, columns=['client_id']).to_csv('client_without_geo.csv', index=False)
# pd.DataFrame(not_dlg, columns=['client_id']).to_csv('client_without_dlg.csv', index=False)

In [None]:
len(not_trx & submitе_clients)

In [None]:
print(f'Клиенты у которых нет гео {not_geo}, нет гео из тех кто покупали проудкт: {} и из тех кто не покупал: {} ')

In [None]:
len(all_clients - target_clients_class_any)

In [None]:
clients_first = {'target_1': target_clients_class_1, 'target_2': target_clients_class_2, 'target_3': target_clients_class_3, 'target_4': target_clients_class_4}
clients_dict = {'geo_clients': geo_clients, 'dialog_clients': dialog_clients, 'target_clients': target_clients, 'trx_clients': trx_clients, 'submitе_clients': submitе_clients, }
for f_type_cl in clients_first:
# for f_type_cl in clients_dict:
    for s_type_cl in clients_dict:
        # print(f'{f_type_cl} - {s_type_cl} => {len(clients_first[f_type_cl]& clients_dict[s_type_cl])}')
        pass
#         print()

In [None]:
# Определяем клиентов по которым нет данных вообще

#### Завершили статистику возвращаемся обратно в геофичи

In [12]:
%%time
# Уменьшение размера датафрейма, для таргетов, транзакцй и для фичей
def series_to_int(col_df:pd.Series):
    """
    Перевод в целочисленные типы
    """
    min_val = col_df.min()
    max_val = col_df.max()
    if min_val >= -128 and max_val <= 127:
        col_df = col_df.astype('int8')
    elif min_val >= -32768 and max_val <= 32767:
        col_df = col_df.astype('int16')
    elif min_val >= -2147483648 and max_val <= 2147483647:
        col_df = col_df.astype('int32')
    else:
        col_df = col_df.astype('int64')
    return col_df

def compression_df(df:pd.DataFrame(), datetime_cols:List[str]=[], category_cols:List[str]=[]):
    """
    Уменьшение размера датафрейма, для таргетов, транзакцй и для фичей
    """
    float64_cols = list(df.select_dtypes(include='float64'))  
    df[float64_cols] = df[float64_cols].astype('float32')
    for col in df.columns:
        if col in category_cols:
            df[col] = df[col].astype('category')
        elif col in datetime_cols:
            if df[col].dtypes == 'object':
                df[col] = pd.to_datetime(df[col])
        # Если колонка содержит числа 
        elif is_integer_dtype(df[col]):
            if df[col].dtypes == 'int8':
                continue
            else:
                df[col] = series_to_int(df[col])
        elif is_float_dtype(df[col]):
            # Возможно ли перевести в число
            if np.array_equal(df[col].fillna(0), df[col].fillna(0).astype(int)):
                df[col] = df[col].fillna(0)
                df[col] = series_to_int(df[col])
    return df


CPU times: total: 0 ns
Wall time: 9.61 ms


In [13]:
all_geo_df = compression_df(all_geo_df, 
                            datetime_cols=['report_end' ,'report_next_end', 'event_time'],
                           )
all_target_df = compression_df(all_target_df, 
                            datetime_cols=['report_end' ,'report_next_end'],
                           )
all_geo_df.shape, all_target_df.shape

((667326271, 5), (11654375, 6))

In [11]:
# Сравнение распределения ответов таргета и теста (бейзлайн)
# vc_ser = train_target_df['target_4'].value_counts()
# vc_ser.iloc[1]/vc_ser.iloc[0]*100 # 1=0,87 / 2=0.09 / 3=0.7 / 4=0,47
# vc_ser = test_target_df['target_4'].value_counts()
# vc_ser.iloc[1]/vc_ser.iloc[0]*100 # 1=1.3 / 2=0.14 / 3=1.12 / 4=0,72

In [12]:
# %%time
# geo_train_df = load_df_by_files(train_geo_files)
# geo_train_df.shape

In [13]:
# %%time
# geo_test_df = load_df_by_files(test_geo_files)
# geo_test_df.shape

In [14]:
# %%time
# Объединяем все известные данные по геообъектам
# all_geo_train_df = pd.concat([geo_train_df, geo_test_df])
# del geo_train_df
# del geo_test_df
# all_geo_train_df.shape

In [14]:
# min_date = all_geo_train_df['event_time'].min()
# max_date = all_geo_train_df['event_time'].max()
# min_date, max_date
# all_geo_df['event_time'].min(), all_geo_df['event_time'].max()

(Timestamp('2021-12-31 21:00:00.018790'),
 Timestamp('2022-12-20 22:53:44.163401'))

In [15]:
# Добавляем временные колонки в сэпмплирвоанные данные, чтобы легче искать по индексу и отдельно по клиенту и по месяцу
smpl_Client_Month_df = smpl_Client_Month_df.reset_index()
smpl_Client_Month_df['col_client_id'] = smpl_Client_Month_df['client_id']
smpl_Client_Month_df['col_report_next_end'] = smpl_Client_Month_df['report_next_end']
smpl_Client_Month_df = smpl_Client_Month_df.set_index(['client_id', 'report_next_end'])
smpl_Client_Month_df.shape

(747847, 3)

In [69]:
%%time
# Формируем статистику посещений каждого терминала уникальными клиентами и общее кол-во посещений
start_date = datetime(2022, 1, 1, 0, 0, 0)
# end_date = datetime(2023, 1, 1, 0, 0, 0)
end_date = all_geo_df['event_time'].max()

union_geohash_4_df = pd.DataFrame()
# union_geohash_5_df = pd.DataFrame()
# union_geohash_6_df = pd.DataFrame()

for i in trange(((end_date - start_date).days//30 + 1)):
    end_date = start_date + relativedelta(months=1) - relativedelta(days=1)
    print(f'start: {start_date}, end: {end_date}')
    # Начальная дата за прошедшие полгода
    begin_date = end_date - relativedelta(months=6) - relativedelta(days=1)
    # Определяем только тех клиентов которые есть в сэмлере для указанной отчетной даты
    report_next_end = start_date + relativedelta(months=2) - relativedelta(days=1)
    good_slct_clients = smpl_Client_Month_df[smpl_Client_Month_df['col_report_next_end'] == report_next_end]['col_client_id'].unique()
    
    # Берем транзакйции за последний месяц и фильтруем по нужным клиентам 
    select_mon_geo_df = all_geo_df[all_geo_df['event_time'].between(start_date, end_date)]
    select_mon_geo_df = select_mon_geo_df[select_mon_geo_df['client_id'].isin(good_slct_clients)]

    # Берем транзакйции за последние полгода начиная от даты begin_date и фильтруем по нужным клиентам 
    select_ftime_geo_df = all_geo_df[all_geo_df['event_time'].between(begin_date, end_date)]
    select_ftime_geo_df = select_ftime_geo_df[select_ftime_geo_df['client_id'].isin(good_slct_clients)]
    
    print(select_mon_geo_df.shape, select_ftime_geo_df.shape)
    
    def calc_aggregate_by_geohash(select_geo_df:pd.DataFrame, bgst_time:str, geo_index:str='geohash_4',):
        geohash_df = select_geo_df.groupby(geo_index).agg(
                                        count_trx = ('client_id', len),
                                        uniq_clients = ('client_id', pd.Series.nunique),
                                              )
        geohash_df['report_next_end'] = start_date + relativedelta(months=2) - relativedelta(days=1)
        
        # Рассчитываем уровень "продоваемости продукта" относительно геопозиции
        current_mon_train_df = all_target_df[pd.to_datetime(all_target_df['report_next_end']).between(bgst_time, start_date + relativedelta(months=1) - relativedelta(days=1))]
        # current_mon_train_df = current_mon_train_df.dd_prefix('pre_mon_')
        # current_mon_train_df = train_client_by_cur_mon_df.rename(columns={'mon':'mon_report'}).add_prefix('cur_mon_')
        client_geo_df = select_geo_df[['client_id', geo_index]].drop_duplicates()
        # Формируем для пар клиент-месяц информацию о купленных продуктах
        geo_target_df = client_geo_df.merge(current_mon_train_df, 
                                                            left_on='client_id', 
                                                            right_on='client_id', 
                                                            how='left').fillna(0)
        
        # Используем только месячный client_geo_df
        # client_fulltime_geo_df = select_mon_geo_df[['client_id', geo_index]].drop_duplicates()
        # cur_fulltime_select_geo_target_df = client_fulltime_geo_df.merge(current_fulltime_train_df, left_on='client_id', right_on='pre_fulltime_client_id', how='left').fillna(0)
        # cur_fulltime_select_geo_target_df = client_geo_df.merge(current_fulltime_train_df, left_on='client_id', right_on='pre_fulltime_client_id', how='left').fillna(0)
        
        popular_product_by_geohash_df = geo_target_df.groupby(geo_index).agg(
                                        cur_sum_target_1 = ('target_1', sum),
                                        cur_sum_target_2 = ('target_2', sum),
                                        cur_sum_target_3 = ('target_3', sum),
                                        cur_sum_target_4 = ('target_4', sum),
                                        cur_uniq_clients = ('client_id', pd.Series.nunique),
        )
        # Расчитываем значение "популярности" геохешей относительно кол-ва клиентов
        columns = ['cur_sum_target_1', 'cur_sum_target_2', 'cur_sum_target_3', 'cur_sum_target_4', ]
        for col in columns:
            popular_product_by_geohash_df[f'{col}__by_clients'] = popular_product_by_geohash_df[col] / popular_product_by_geohash_df['cur_uniq_clients']
            
        geohash_df = geohash_df.reset_index()
        geohash_df = geohash_df.merge(popular_product_by_geohash_df.reset_index(), on=geo_index, how='left')
        return geohash_df
    # расчитываем популярыне продукты для геохешей в определенный месяц только для geohash_4
    premon_geohash_4_df = calc_aggregate_by_geohash(select_geo_df=select_mon_geo_df, bgst_time=start_date, geo_index='geohash_4').fillna(0)
    premon_geohash_4_df = premon_geohash_4_df.set_index(['geohash_4', 'report_next_end']).add_prefix('pre_mon_')
    preftime_geohash_4_df = calc_aggregate_by_geohash(select_geo_df=select_ftime_geo_df, bgst_time=begin_date, geo_index='geohash_4').fillna(0)
    preftime_geohash_4_df = preftime_geohash_4_df.set_index(['geohash_4', 'report_next_end']).add_prefix('pre_ftime_')
    preftime_geohash_4_df = preftime_geohash_4_df.merge(premon_geohash_4_df,
                                                    left_index=True,
                                                    right_index=True,
                                                    how='left',
                                                   ).fillna(0)
    union_geohash_4_df = pd.concat([union_geohash_4_df, preftime_geohash_4_df])

#     geohash_5_df = calc_aggregate_by_geohash(geo_index='geohash_5').fillna(0)
#     union_geohash_5_df = pd.concat([union_geohash_5_df, geohash_5_df])

#     geohash_6_df = calc_aggregate_by_geohash(geo_index='geohash_6').fillna(0)
#     union_geohash_6_df = pd.concat([union_geohash_6_df, geohash_6_df])
    start_date = start_date + relativedelta(months=1)
    
union_geohash_4_df.shape # , union_geohash_5_df.shape, union_geohash_6_df.shape

  0%|          | 0/13 [00:00<?, ?it/s]

start: 2022-01-01 00:00:00, end: 2022-01-31 00:00:00
(2220126, 5) (2221685, 5)


  8%|▊         | 1/13 [00:19<03:53, 19.43s/it]

start: 2022-02-01 00:00:00, end: 2022-02-28 00:00:00
(3232696, 5) (5979563, 5)


 15%|█▌        | 2/13 [00:39<03:38, 19.90s/it]

start: 2022-03-01 00:00:00, end: 2022-03-31 00:00:00
(3829433, 5) (9880770, 5)


 23%|██▎       | 3/13 [01:04<03:42, 22.23s/it]

start: 2022-04-01 00:00:00, end: 2022-04-30 00:00:00
(2865996, 5) (11318771, 5)


 31%|███       | 4/13 [01:32<03:39, 24.37s/it]

start: 2022-05-01 00:00:00, end: 2022-05-31 00:00:00
(3004419, 5) (14428562, 5)


 38%|███▊      | 5/13 [02:04<03:37, 27.13s/it]

start: 2022-06-01 00:00:00, end: 2022-06-30 00:00:00
(2844441, 5) (16568041, 5)


 46%|████▌     | 6/13 [02:54<04:05, 35.03s/it]

start: 2022-07-01 00:00:00, end: 2022-07-31 00:00:00
(3166611, 5) (18457727, 5)


 54%|█████▍    | 7/13 [03:32<03:35, 35.85s/it]

start: 2022-08-01 00:00:00, end: 2022-08-31 00:00:00
(3037408, 5) (17855414, 5)


 62%|██████▏   | 8/13 [04:23<03:24, 40.85s/it]

start: 2022-09-01 00:00:00, end: 2022-09-30 00:00:00
(2303264, 5) (16625114, 5)


 69%|██████▉   | 9/13 [05:09<02:49, 42.30s/it]

start: 2022-10-01 00:00:00, end: 2022-10-31 00:00:00
(3850217, 5) (27842802, 5)


 77%|███████▋  | 10/13 [05:58<02:13, 44.55s/it]

start: 2022-11-01 00:00:00, end: 2022-11-30 00:00:00
(1944024, 5) (14849702, 5)


 85%|████████▍ | 11/13 [06:39<01:26, 43.24s/it]

start: 2022-12-01 00:00:00, end: 2022-12-31 00:00:00
(792749, 5) (23373631, 5)


 92%|█████████▏| 12/13 [07:16<00:41, 41.36s/it]

start: 2023-01-01 00:00:00, end: 2023-01-31 00:00:00
(0, 5) (0, 5)


100%|██████████| 13/13 [07:30<00:00, 34.68s/it]

CPU times: total: 7min 30s
Wall time: 7min 30s





(138849, 22)

In [75]:
# union_geohash_4_df[['pre_ftime_report_next_end', 'pre_mon_report_next_end']]# [union_geohash_4_df.index == 3]
# union_geohash_4_df.info()

In [None]:
# union_geohash_4_df = compression_df(union_geohash_4_df, 
#                             datetime_cols=['report_end' ,'report_next_end', 'event_time'],)
# union_geohash_4_df.shape

In [157]:
%%time
# Сохраняем популярность таргетов в гео данных в файлы 
union_geohash_4_df.to_parquet(PATH_DATASET_OUTPUT + 'popular_geohash_4_df_13_06_2024.parquet')

CPU times: total: 141 ms
Wall time: 150 ms


In [18]:
# # Загружаем ранее сохраненные данные: популярность таргетов в гео данных в файлы 
# union_geohash_4_df = pq.read_table(PATH_DATASET_OUTPUT + f'popular_geohash_4_df_13_06_2024.parquet').to_pandas()
# union_geohash_4_df.shape

In [19]:
# (end_date - start_date).days//30 + 1
# (datetime(2023, 1, 1, 0, 0, 0) - datetime(2022, 1, 1, 0, 0, 0)).days//30 + 1

# # start_date

In [27]:
%%time
# Популярные геохеши у клиентов
# Расчет топ-5 популярных хешей для каждого клиента, и также расчет процента посещения этих топ-5 относительно всех посещаемых геохешей (по аналогии с софтмакс)
start_date = datetime(2022, 1, 1, 0, 0, 0)
# end_date = datetime(2023, 1, 1, 0, 0, 0)
end_date = all_geo_df['event_time'].max()

union_geo_by_clients_df = pd.DataFrame()

for i in trange(((end_date - start_date).days//30 + 1)):
    end_date = start_date + relativedelta(months=1) - relativedelta(days=1)
    print(f'start: {start_date}, end: {end_date}')
    # Начальная дата за прошедшие полгода
    begin_date = end_date - relativedelta(months=6) - relativedelta(days=1)
    
    # Определяем только тех клиентов которые есть в сэмлере для указанной отчетной даты
    report_next_end = start_date + relativedelta(months=2) - relativedelta(days=1)
    good_slct_clients = smpl_Client_Month_df[smpl_Client_Month_df['col_report_next_end'] == report_next_end]['col_client_id'].unique()
    
    # Берем транзакйции за последний месяц и фильтруем по нужным клиентам 
    select_mon_geo_df = all_geo_df[all_geo_df['event_time'].between(start_date, end_date)]
    select_mon_geo_df = select_mon_geo_df[select_mon_geo_df['client_id'].isin(good_slct_clients)]

    # Берем транзакйции за последние полгода начиная от даты begin_date и фильтруем по нужным клиентам 
    select_ftime_geo_df = all_geo_df[all_geo_df['event_time'].between(begin_date, end_date)]
    select_ftime_geo_df = select_ftime_geo_df[select_ftime_geo_df['client_id'].isin(good_slct_clients)]
    
    print(select_mon_geo_df.shape, select_ftime_geo_df.shape)
    if len(select_ftime_geo_df) == 0:
        start_date = start_date + relativedelta(months=1)
        continue
    
    # Создаем массив с клиентами по уникальным клиентам за весь период а не только за месяц, т.к. могло не быть транзакцй за последний месяц
    client_agg_df = pd.DataFrame(select_ftime_geo_df['client_id'].unique(), columns=['client_id'])
    client_agg_df['report_next_end'] = report_next_end
    
    def calc_aggregate_client_by_geohash(select_geo_df:pd.DataFrame, bgst_time:str):
        # считаем самые кол-во посещенных геохешей и самые популярныегеохеши
        stat_geohash_df = select_geo_df.groupby(['client_id']).agg(
            cnt_geohash_4 = ('geohash_4', len),
            sum_geohash_4 = ('geohash_4', sum),
            mode_geohash_4 = ('geohash_4', lambda s: mode(s)[0][0]), 
            nunique_geohash_4 = ('geohash_4', pd.Series.nunique), 
             
            cnt_geohash_5 = ('geohash_5', len),
            sum_geohash_5 = ('geohash_5', sum),
            mode_geohash_5 = ('geohash_5', lambda s: mode(s)[0][0]), 
            nunique_geohash_5 = ('geohash_5', pd.Series.nunique), 
            
            cnt_geohash_6 = ('geohash_6', len),
            sum_geohash_6 = ('geohash_6', sum),
            mode_geohash_6 = ('geohash_6', lambda s: mode(s)[0][0]), 
            nunique_geohash_6 = ('geohash_6', pd.Series.nunique), 
         )        
        stat_geohash_df = stat_geohash_df.reset_index()
        stat_geohash_df['report_next_end'] = report_next_end
        
        # считаем кол-во посещений популярного геохеша и процент относительно посещений всех геохешей
        # geohash_4 кол-во посещений по популярным геохешам
        for geo_index in ['geohash_4', 'geohash_5', 'geohash_6']:
            stat_geohash_df = stat_geohash_df.merge(
                select_geo_df.groupby(['client_id', geo_index]).agg(cnt_mode_geo_agg = ('event_time', len)).reset_index(),
                left_on=['client_id', f'mode_{geo_index}'],
                right_on=['client_id', geo_index],
                how='left',
            ).drop(columns=geo_index,errors='ignore').rename(columns={'cnt_mode_geo_agg': f'cnt_mode_{geo_index}'})
            stat_geohash_df[f'prc_visit_{geo_index}'] = stat_geohash_df[f'cnt_mode_{geo_index}'] / stat_geohash_df[f'cnt_{geo_index}']
        return stat_geohash_df.fillna(0)

    pop_mon_stat_geohash_df = calc_aggregate_client_by_geohash(select_geo_df=select_mon_geo_df, 
                                                                   bgst_time=start_date, 
                                                                  ).fillna(0)
    pop_mon_stat_geohash_df = pop_mon_stat_geohash_df.set_index(['client_id', 'report_next_end'])
    
    pop_ftime_stat_geohash_df = calc_aggregate_client_by_geohash(select_geo_df=select_ftime_geo_df, 
                                                                    bgst_time=begin_date, 
                                                                   ).fillna(0)
    pop_ftime_stat_geohash_df = pop_ftime_stat_geohash_df.set_index(['client_id', 'report_next_end']).add_prefix('ftime_geo_')
    
    # Объединяем фичи по месяцам и за полгода
    pop_ftime_stat_geohash_df = pop_ftime_stat_geohash_df.merge(pop_mon_stat_geohash_df, left_index=True, right_index=True, how='left')
    
    # Сводим в единый датафрейм
    union_geo_by_clients_df = pd.concat([union_geo_by_clients_df, pop_ftime_stat_geohash_df])
    
    start_date = start_date + relativedelta(months=1)
    
union_geo_by_clients_df.shape

  0%|          | 0/12 [00:00<?, ?it/s]

start: 2022-01-01 00:00:00, end: 2022-01-31 00:00:00
(2220126, 5) (2221685, 5)


  8%|▊         | 1/12 [00:52<09:35, 52.29s/it]

start: 2022-02-01 00:00:00, end: 2022-02-28 00:00:00
(3232696, 5) (5979563, 5)


 17%|█▋        | 2/12 [02:01<10:23, 62.32s/it]

start: 2022-03-01 00:00:00, end: 2022-03-31 00:00:00
(3829433, 5) (9880770, 5)


 25%|██▌       | 3/12 [03:23<10:40, 71.12s/it]

start: 2022-04-01 00:00:00, end: 2022-04-30 00:00:00
(2865996, 5) (11318771, 5)


 33%|███▎      | 4/12 [04:43<09:57, 74.66s/it]

start: 2022-05-01 00:00:00, end: 2022-05-31 00:00:00
(3004419, 5) (14428562, 5)


 42%|████▏     | 5/12 [06:13<09:20, 80.13s/it]

start: 2022-06-01 00:00:00, end: 2022-06-30 00:00:00
(2844441, 5) (16568041, 5)


 50%|█████     | 6/12 [08:03<09:01, 90.24s/it]

start: 2022-07-01 00:00:00, end: 2022-07-31 00:00:00
(3166611, 5) (18457727, 5)


 58%|█████▊    | 7/12 [09:51<08:00, 96.04s/it]

start: 2022-08-01 00:00:00, end: 2022-08-31 00:00:00
(3037408, 5) (17855414, 5)


 67%|██████▋   | 8/12 [11:47<06:50, 102.60s/it]

start: 2022-09-01 00:00:00, end: 2022-09-30 00:00:00
(2303264, 5) (16625114, 5)


 75%|███████▌  | 9/12 [13:33<05:10, 103.49s/it]

start: 2022-10-01 00:00:00, end: 2022-10-31 00:00:00
(3850217, 5) (27842802, 5)


 83%|████████▎ | 10/12 [16:07<03:58, 119.27s/it]

start: 2022-11-01 00:00:00, end: 2022-11-30 00:00:00
(1944024, 5) (14849702, 5)


 92%|█████████▏| 11/12 [17:42<01:51, 111.90s/it]

start: 2022-12-01 00:00:00, end: 2022-12-31 00:00:00
(792749, 5) (23373631, 5)


100%|██████████| 12/12 [19:43<00:00, 98.64s/it] 

CPU times: total: 19min 43s
Wall time: 19min 44s





(435751, 36)

In [33]:
# сводим сведения о популярном полигоне 4 с данными о статистике таргетов относительно полигонов
union_geo_by_clients_df = union_geo_by_clients_df.reset_index().merge(
    union_geohash_4_df.reset_index().rename(columns={'geohash_4': 'mode_geohash_4'}),
    left_on=['report_next_end', 'mode_geohash_4'],
    right_on=['report_next_end', 'mode_geohash_4'],
    how='left',
)
union_geo_by_clients_df = union_geo_by_clients_df.set_index(['client_id', 'report_next_end'])
union_geo_by_clients_df.shape

(435751, 58)

In [34]:
%%time
# Сохраняем популярность таргетов в гео данных в файлы 
union_geo_by_clients_df.to_parquet(PATH_DATASET_OUTPUT + 'geo_by_clients_df_13_06_2024.parquet')

CPU times: total: 1.36 s
Wall time: 993 ms


# Подвал

In [118]:
%%time
tmp_df = stat_geohash_df.reset_index().merge(
    select_mon_geo_df.groupby(['client_id', 'geohash_4']).agg(cnt_mode_geo_4 = ('event_time', len)).reset_index(),
    left_on=['client_id', 'mode_geohash_4'],
    right_on=['client_id', 'geohash_4'],
    how='left',
)
tmp_df

CPU times: total: 1.52 s
Wall time: 1.51 s


Unnamed: 0,client_id,cnt_geohash_4,sum_geohash_4,mode_geohash_4,nunique_geohash_4,cnt_geohash_5,sum_geohash_5,mode_geohash_5,nunique_geohash_5,cnt_geohash_6,sum_geohash_6,mode_geohash_6,nunique_geohash_6,geohash_4,cnt_mode_geo_4
0,00019eed9ed218e2c59d668c46d3ff841fee03ab225b20...,40,391009,208,6,40,6520568,40507,12,40,46198495,165362,14,208,13
1,0001dba6db98c85f40b1c222d4c0ffc7a3a0194b98ee02...,36,1511314,45879,4,36,11167199,355866,7,36,46589593,873784,17,45879,13
2,00026372ff1da433dccb75b7eeb25e363175b84d5bb9e2...,118,635537,453,6,118,12305049,57498,9,118,106743664,495381,17,453,75
3,0003d906f12e8e6ced0182f61c1baf72b1f752ddbd1bb1...,46,1003242,21062,8,46,8590935,349208,17,46,81249690,2574609,23,21062,35
4,0003eb73b92a696ab1cf020e751470e68e9a8fa690c71a...,76,748944,7427,5,76,9696502,140021,9,76,124701295,2402257,14,7427,56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29359,fff359fd59220023c1127267b8b2b15142d97caa70322b...,35,329671,5243,4,35,6376804,156356,7,35,31602047,453010,9,5243,18
29360,fff4d981e02573eabc0ed9a5c67af62289cf4734db21df...,24,430771,17663,4,24,2193049,24211,6,24,37353115,1790488,7,17663,17
29361,fff66eddaeebb430a743379b325c596de674657b56be1a...,147,1772625,9475,2,147,42854940,353142,6,147,265721069,2050750,18,9475,97
29362,fff9ca251a51dd97419ce3e1561310c1807fc235a5d55b...,235,3689281,5169,12,235,37069061,103821,30,235,303450948,671856,67,5169,149


In [130]:
# tmp_df
# tmp_df_2['client_id'][:1].item()

# tmp_df[tmp_df['client_id'] == '00019eed9ed218e2c59d668c46d3ff841fee03ab225b20060c88cdc4776ed70e']

In [131]:
# tmp_df_2[tmp_df_2['client_id'] == '00019eed9ed218e2c59d668c46d3ff841fee03ab225b20060c88cdc4776ed70e']

In [119]:
tmp_df_2 = select_mon_geo_df.groupby(['client_id', 'geohash_4']).agg(cnt_mode_geo_4 = ('event_time', len)).reset_index()
tmp_df_2

Unnamed: 0,client_id,geohash_4,cnt_mode_geo_4
0,00019eed9ed218e2c59d668c46d3ff841fee03ab225b20...,208,13
1,00019eed9ed218e2c59d668c46d3ff841fee03ab225b20...,3531,10
2,00019eed9ed218e2c59d668c46d3ff841fee03ab225b20...,8218,1
3,00019eed9ed218e2c59d668c46d3ff841fee03ab225b20...,17663,13
4,00019eed9ed218e2c59d668c46d3ff841fee03ab225b20...,21434,1
...,...,...,...
107186,fff9ca251a51dd97419ce3e1561310c1807fc235a5d55b...,36866,5
107187,fff9ca251a51dd97419ce3e1561310c1807fc235a5d55b...,45313,2
107188,fffb420c5d863f2d1c6aacde923b8b42730f951957953b...,17616,1
107189,fffb420c5d863f2d1c6aacde923b8b42730f951957953b...,31732,5


In [106]:
# select_mon_geo_df
# stat_geohash_df.reset_index().groupby(['client_id', 'mode_geohash_4']).agg(len)
stat_geohash_df.reset_index().set_index(['client_id', 'mode_geohash_4'])

# stat_geohash_df

Unnamed: 0_level_0,Unnamed: 1_level_0,cnt_geohash_4,sum_geohash_4,nunique_geohash_4,cnt_geohash_5,sum_geohash_5,mode_geohash_5,nunique_geohash_5,cnt_geohash_6,sum_geohash_6,mode_geohash_6,nunique_geohash_6
client_id,mode_geohash_4,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
00019eed9ed218e2c59d668c46d3ff841fee03ab225b20060c88cdc4776ed70e,208,40,391009,6,40,6520568,40507,12,40,46198495,165362,14
0001dba6db98c85f40b1c222d4c0ffc7a3a0194b98ee02ab50c88d963b5514f4,45879,36,1511314,4,36,11167199,355866,7,36,46589593,873784,17
00026372ff1da433dccb75b7eeb25e363175b84d5bb9e21ac36fae722128f478,453,118,635537,6,118,12305049,57498,9,118,106743664,495381,17
0003d906f12e8e6ced0182f61c1baf72b1f752ddbd1bb101437b68c699da5298,21062,46,1003242,8,46,8590935,349208,17,46,81249690,2574609,23
0003eb73b92a696ab1cf020e751470e68e9a8fa690c71a865c9fc6a3041a759c,7427,76,748944,5,76,9696502,140021,9,76,124701295,2402257,14
...,...,...,...,...,...,...,...,...,...,...,...,...
fff359fd59220023c1127267b8b2b15142d97caa70322b23198787791695000a,5243,35,329671,4,35,6376804,156356,7,35,31602047,453010,9
fff4d981e02573eabc0ed9a5c67af62289cf4734db21df76d5b54639eaec7230,17663,24,430771,4,24,2193049,24211,6,24,37353115,1790488,7
fff66eddaeebb430a743379b325c596de674657b56be1a60a0b66ce512e9a649,9475,147,1772625,2,147,42854940,353142,6,147,265721069,2050750,18
fff9ca251a51dd97419ce3e1561310c1807fc235a5d55bff5df5b79283255b36,5169,235,3689281,12,235,37069061,103821,30,235,303450948,671856,67


In [97]:
stat_geohash_df.reset_index().set_index(['client_id', 'mode_geohash_4'])

Unnamed: 0_level_0,Unnamed: 1_level_0,cnt_geohash_4,sum_geohash_4,nunique_geohash_4,cnt_geohash_5,sum_geohash_5,mode_geohash_5,nunique_geohash_5,cnt_geohash_6,sum_geohash_6,mode_geohash_6,nunique_geohash_6
client_id,mode_geohash_4,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
00019eed9ed218e2c59d668c46d3ff841fee03ab225b20060c88cdc4776ed70e,208,40,391009,6,40,6520568,40507,12,40,46198495,165362,14
0001dba6db98c85f40b1c222d4c0ffc7a3a0194b98ee02ab50c88d963b5514f4,45879,36,1511314,4,36,11167199,355866,7,36,46589593,873784,17
00026372ff1da433dccb75b7eeb25e363175b84d5bb9e21ac36fae722128f478,453,118,635537,6,118,12305049,57498,9,118,106743664,495381,17
0003d906f12e8e6ced0182f61c1baf72b1f752ddbd1bb101437b68c699da5298,21062,46,1003242,8,46,8590935,349208,17,46,81249690,2574609,23
0003eb73b92a696ab1cf020e751470e68e9a8fa690c71a865c9fc6a3041a759c,7427,76,748944,5,76,9696502,140021,9,76,124701295,2402257,14
...,...,...,...,...,...,...,...,...,...,...,...,...
fff359fd59220023c1127267b8b2b15142d97caa70322b23198787791695000a,5243,35,329671,4,35,6376804,156356,7,35,31602047,453010,9
fff4d981e02573eabc0ed9a5c67af62289cf4734db21df76d5b54639eaec7230,17663,24,430771,4,24,2193049,24211,6,24,37353115,1790488,7
fff66eddaeebb430a743379b325c596de674657b56be1a60a0b66ce512e9a649,9475,147,1772625,2,147,42854940,353142,6,147,265721069,2050750,18
fff9ca251a51dd97419ce3e1561310c1807fc235a5d55bff5df5b79283255b36,5169,235,3689281,12,235,37069061,103821,30,235,303450948,671856,67


In [66]:
# %%time
# # Популярные геохеши у клиентов
# # Расчет топ-5 популярных хешей для каждого клиента, и также расчет процента посещения этих топ-5 относительно всех посещаемых геохешей (по аналогии с софтмакс)
# begin_date = datetime(2021, 11, 1, 0, 0, 0)
# start_date = datetime(2022, 1, 1, 0, 0, 0)
# end_date = datetime(2023, 2, 1, 0, 0, 0)

# union_geo_by_clients_4_df = pd.DataFrame()
# # union_geo_by_clients_5_df = pd.DataFrame()
# # union_geo_by_clients_6_df = pd.DataFrame()

# clients_top_geohash_4_df = pd.DataFrame()
# # clients_top_geohash_5_df = pd.DataFrame()
# # clients_top_geohash_6_df = pd.DataFrame()


# for i in trange(13):
#     end_date = start_date + relativedelta(months=1) - relativedelta(days=1)
#     print(f'start: {start_date}, end: {end_date}')
    
#     select_mon_geo_df = all_geo_df[all_geo_df['event_time'].between(start_date, end_date)]
#     select_fulltime_geo_df = all_geo_df[all_geo_df['event_time'].between(end_date - relativedelta(months=6) - relativedelta(days=1), end_date)]
#     # select_fulltime_geo_df = all_geo_df[all_geo_df['event_time'].between(begin_date, end_date)]
#     print(select_mon_geo_df.shape, select_fulltime_geo_df.shape)
    
#     def calc_aggregate_client_by_geohash(select_time_geo_df:pd.DataFrame, geo_index:str='geohash_4'):
#         # Используем только месячный client_geo_df
#         client_geo_df = select_time_geo_df.groupby(['client_id',geo_index]).size().reset_index(name='freq')
        
#         geohash_df = client_geo_df.groupby(['client_id']).agg(
#             cnt_geo = ('freq', len),
#             sum_geo_trx = ('freq', sum),
#         )

#         geo_by_clients = geohash_df.reset_index().merge(client_geo_df, on='client_id', how='left')
#         geo_by_clients['prc_use'] = geo_by_clients['freq'] / geo_by_clients['sum_geo_trx']

#         # geo_by_clients['report_end'] = end_date
#         geo_by_clients['report_next_end'] = start_date + relativedelta(months=2) - relativedelta(days=1)
        
#         popular_geo_by_clients = geo_by_clients.loc[geo_by_clients.groupby('client_id')['prc_use'].transform('max').eq(geo_by_clients['prc_use'])]
#         # Могут быть ситуации когда одинаковые геохеши имеют одинаковый вес, тогда выбираем первый
#         popular_geo_by_clients = popular_geo_by_clients.drop_duplicates(subset=['client_id', 'prc_use'])
        
#         # popular_geo_by_clients['report_end'] = end_date
#         popular_geo_by_clients['report_next_end'] = start_date + relativedelta(months=2) - relativedelta(days=1)
        
#         # return geo_by_clients.fillna(0), popular_geo_by_clients.fillna(0)
#         return popular_geo_by_clients.fillna(0)
    
#     # Расчет для 4 гео-индекса
#     pop_geo_by_clients_4_df = calc_aggregate_client_by_geohash(
#                                                             select_time_geo_df=select_mon_geo_df,
#                                                             geo_index='geohash_4')
#     pop_geo_by_clients_ftime_4_df = calc_aggregate_client_by_geohash(
#                                                             select_time_geo_df=select_fulltime_geo_df,
#                                                             geo_index='geohash_4')
    
#     pop_geo_by_clients_4_df = pop_geo_by_clients_4_df.set_index(['client_id', 'report_next_end'])
#     pop_geo_by_clients_ftime_4_df = pop_geo_by_clients_ftime_4_df.set_index(['client_id', 'report_next_end']).add_suffix('_ftime')
#     pop_geo_by_clients_ftime_4_df = pop_geo_by_clients_ftime_4_df.merge(pop_geo_by_clients_4_df, left_index=True, right_index=True, how='left')
#     del pop_geo_by_clients_4_df
#     gc.collect()
    
#     # добавляем сведения о самом популярном геохеше
#     pop_geo_by_clients_ftime_4_df = pop_geo_by_clients_ftime_4_df.reset_index().set_index(['geohash_4', 'report_next_end']).merge(
#                                                             union_geohash_4_df, 
#                                                             left_index=True,
#                                                             right_index=True,
#                                                             how='left',)
    
#     pop_geo_by_clients_ftime_4_df = pop_geo_by_clients_ftime_4_df.reset_index().set_index(['geohash_4_ftime', 'report_next_end'])
#     pop_geo_by_clients_ftime_4_df = pop_geo_by_clients_ftime_4_df.reset_index().rename(columns={'geohash_4': 'curmon_geohash_4', 'geohash_4_ftime': 'geohash_4'}).set_index(['geohash_4', 'report_next_end'])
#     pop_geo_by_clients_ftime_4_df = pop_geo_by_clients_ftime_4_df.merge(
#                                                             union_geohash_4_df, #.add_suffix('_ftime_ind'), 
#                                                             left_index=True,
#                                                             right_index=True,
#                                                             how='left',)
    
#     pop_geo_by_clients_ftime_4_df = pop_geo_by_clients_ftime_4_df.reset_index().set_index(['client_id', 'report_next_end'])
#     pop_geo_by_clients_ftime_4_df = pop_geo_by_clients_ftime_4_df.fillna(pop_geo_by_clients_ftime_4_df.median())
#     clients_top_geohash_4_df = pd.concat([clients_top_geohash_4_df, pop_geo_by_clients_ftime_4_df])
#     del pop_geo_by_clients_ftime_4_df
#     gc.collect()
#     clients_top_geohash_4_df = compression_df(clients_top_geohash_4_df, 
#                             datetime_cols=['report_end' ,'report_next_end', 'event_time'],)

# #     print(4)    
# #     # Такой же расчет для 5 гео-индекса
# #     pop_geo_by_clients_5_df = calc_aggregate_client_by_geohash(
# #                                                             select_time_geo_df=select_mon_geo_df,
# #                                                             geo_index='geohash_5')
# #     pop_geo_by_clients_ftime_5_df = calc_aggregate_client_by_geohash(
# #                                                             select_time_geo_df=select_fulltime_geo_df,
# #                                                             geo_index='geohash_5')    
# #     print(4)    
# #     pop_geo_by_clients_5_df = pop_geo_by_clients_5_df.set_index(['client_id', 'report_next_end'])
# #     pop_geo_by_clients_ftime_5_df = pop_geo_by_clients_ftime_5_df.set_index(['client_id', 'report_next_end']).add_suffix('_ftime')
# #     pop_geo_by_clients_ftime_5_df = pop_geo_by_clients_ftime_5_df.merge(pop_geo_by_clients_5_df, left_index=True, right_index=True, how='left')
# #     print(5)    
# #     # добавляем сведения о самом популярном геохеше
# #     pop_geo_by_clients_ftime_5_df = pop_geo_by_clients_ftime_5_df.reset_index().set_index(['geohash_5', 'report_next_end']).merge(
# #                                                             union_geohash_5_df.set_index(['geohash_5', 'report_next_end']), 
# #                                                             left_index=True,
# #                                                             right_index=True,
# #                                                             how='left',)
# #     print(6)    
# #     pop_geo_by_clients_ftime_5_df = pop_geo_by_clients_ftime_5_df.reset_index().set_index(['geohash_5_ftime', 'report_next_end']).merge(
# #                                                             union_geohash_5_df.set_index(['geohash_5', 'report_next_end']).add_suffix('_ftime_ind'), 
# #                                                             left_index=True,
# #                                                             right_index=True,
# #                                                             how='left',)
# #     pop_geo_by_clients_ftime_5_df = pop_geo_by_clients_ftime_5_df.reset_index().set_index(['client_id', 'report_next_end'])
# #     pop_geo_by_clients_ftime_5_df = pop_geo_by_clients_ftime_5_df.fillna(pop_geo_by_clients_ftime_5_df.median())
    

# #     clients_top_geohash_5_df = pd.concat([clients_top_geohash_5_df, pop_geo_by_clients_ftime_5_df])
# #     del pop_geo_by_clients_ftime_5_df
# #     gc.collect()

#     # clients_top_geohash_4_df = pd.concat([clients_top_geohash_4_df, pop_geo_by_clients_4_df])
# #     break
# #     geo_by_clients_4_df, pop_geo_by_clients_4_df = calc_aggregate_client_by_geohash(geo_index='geohash_4')
# #     union_geo_by_clients_4_df = pd.concat([union_geo_by_clients_4_df, geo_by_clients_4_df])
# #     clients_top_geohash_4_df = pd.concat([clients_top_geohash_4_df, pop_geo_by_clients_4_df])

# #     geo_by_clients_5_df, pop_geo_by_clients_5_df = calc_aggregate_client_by_geohash(geo_index='geohash_5')
# #     union_geo_by_clients_5_df = pd.concat([union_geo_by_clients_5_df, geo_by_clients_5_df])
# #     clients_top_geohash_5_df = pd.concat([clients_top_geohash_5_df, pop_geo_by_clients_5_df])

# #     geo_by_clients_6_df, pop_geo_by_clients_6_df = calc_aggregate_client_by_geohash(geo_index='geohash_6')
# #     union_geo_by_clients_6_df = pd.concat([union_geo_by_clients_6_df, geo_by_clients_6_df])
# #     clients_top_geohash_6_df = pd.concat([clients_top_geohash_6_df, pop_geo_by_clients_6_df])

#     start_date = start_date + relativedelta(months=1)
    
# clients_top_geohash_4_df.shape#, clients_top_geohash_5_df.shape

  0%|          | 0/13 [00:00<?, ?it/s]

start: 2022-01-01 00:00:00, end: 2022-01-31 00:00:00
(46236373, 3) (46269676, 3)


  8%|▊         | 1/13 [00:45<09:05, 45.46s/it]

start: 2022-02-01 00:00:00, end: 2022-02-28 00:00:00
(56463257, 3) (104711397, 3)


 15%|█▌        | 2/13 [01:44<09:47, 53.44s/it]

start: 2022-03-01 00:00:00, end: 2022-03-31 00:00:00
(67114247, 3) (174037792, 3)


 23%|██▎       | 3/13 [02:57<10:25, 62.53s/it]

start: 2022-04-01 00:00:00, end: 2022-04-30 00:00:00
(59103493, 3) (235255358, 3)


 31%|███       | 4/13 [04:23<10:46, 71.85s/it]

start: 2022-05-01 00:00:00, end: 2022-05-31 00:00:00
(61546843, 3) (298790948, 3)


 38%|███▊      | 5/13 [06:02<10:51, 81.40s/it]

start: 2022-06-01 00:00:00, end: 2022-06-30 00:00:00
(61781089, 3) (362706678, 3)


 46%|████▌     | 6/13 [07:53<10:40, 91.47s/it]

start: 2022-07-01 00:00:00, end: 2022-07-31 00:00:00
(65913203, 3) (386470162, 3)


 54%|█████▍    | 7/13 [09:50<09:58, 99.76s/it]

start: 2022-08-01 00:00:00, end: 2022-08-31 00:00:00
(67921454, 3) (398466561, 3)


 62%|██████▏   | 8/13 [11:46<08:44, 105.00s/it]

start: 2022-09-01 00:00:00, end: 2022-09-30 00:00:00
(53479282, 3) (386744153, 3)


 69%|██████▉   | 9/13 [13:42<07:14, 108.59s/it]

start: 2022-10-01 00:00:00, end: 2022-10-31 00:00:00
(52287223, 3) (377170775, 3)


 77%|███████▋  | 10/13 [15:36<05:30, 110.08s/it]

start: 2022-11-01 00:00:00, end: 2022-11-30 00:00:00
(44756291, 3) (362515673, 3)


 85%|████████▍ | 11/13 [17:28<03:41, 110.66s/it]

start: 2022-12-01 00:00:00, end: 2022-12-31 00:00:00
(9262981, 3) (306741299, 3)


 92%|█████████▏| 12/13 [19:00<01:45, 105.00s/it]

start: 2023-01-01 00:00:00, end: 2023-01-31 00:00:00
(0, 3) (238866865, 3)


100%|██████████| 13/13 [20:11<00:00, 93.20s/it] 

CPU times: total: 20min 8s
Wall time: 20min 11s





(9444490, 30)

In [71]:
# clients_top_geohash_4_df#['report_next_end']#.max()
# clients_top_geohash_4_df
# import gc
# gc.collect()
# clients_top_geohash_4_df.info()
# clients_top_geohash_4_df

In [72]:
%%time
clients_top_geohash_4_df.to_parquet(PATH_DATASET_OUTPUT + 'clients_top_geohash_4_df_10_06_2024_v2.parquet')
# clients_top_geohash_4_df.info()
# clients_top_geohash_4_df.info()

CPU times: total: 7.94 s
Wall time: 8.32 s


In [43]:
# clients_top_geohash_4_df

In [44]:
# pop_geo_by_clients_ftime_4_df = pop_geo_by_clients_ftime_4_df.reset_index().rename(columns={'geohash_4': 'curmon_geohash_4', 'geohash_4_ftime': 'geohash_4'}).set_index(['geohash_4', 'report_next_end'])
# # pop_geo_by_clients_ftime_4_df = pop_geo_by_clients_ftime_4_df.reset_index().rename(columns={'geohash_4_ftime': 'geohash_4'})
# # .set_index(['geohash_4', 'report_next_end'])
# # pop_geo_by_clients_ftime_4_df
# # pop_geo_by_clients_ftime_4_df['geohash_4']
# pop_geo_by_clients_ftime_4_df

In [45]:
# union_geohash_4_df

In [46]:
# pop_geo_by_clients_ftime_4_df

In [47]:
# %%time
# # pop_geo_by_clients_ftime_4_df = pop_geo_by_clients_ftime_4_df.reset_index().set_index(['geohash_4_ftime', 'report_next_end'])
# pop_geo_by_clients_ftime_4_df.merge(
#                                                         union_geohash_4_df, #.add_suffix('_ftime_ind'), 
#                                                         left_index=True,
#                                                         right_index=True,
#                                                         how='left'
# )

In [48]:
# # # clients_top_geohash_4_df
# # # geohash_4_ftime, geohash_4
# # # union_geohash_4_df.set_index(['geohash_4', 'report_next_end'])
# # # geohash_4
# # tmp_df = clients_top_geohash_4_df.reset_index().set_index(['geohash_4', 'report_next_end']).merge(
# #                                                         union_geohash_4_df.set_index(['geohash_4', 'report_next_end']), 
# #                                                         left_index=True,
# #                                                         right_index=True,
# #                                                         how='left',)
# # tmp_df = tmp_df.reset_index().set_index(['client_id', 'report_next_end'])
# # tmp_df = tmp_df.fillna(tmp_df.median())
# # tmp_df
# clients_top_geohash_4_df.columns

In [54]:
pop_geo_by_clients_4_df[['client_id', 'report_next_end']].shape, pop_geo_by_clients_4_df[['client_id', 'report_next_end']].drop_duplicates().shape

((648436, 2), (648436, 2))

In [55]:
pop_geo_by_clients_ftime_4_df[['client_id', 'report_next_end']].shape, pop_geo_by_clients_ftime_4_df[['client_id', 'report_next_end']].drop_duplicates().shape

((718850, 2), (718850, 2))

In [67]:
# tmp_pop_df = tmp_pop_df.fillna(tmp_pop_df.median())
# tmp_pop_df = tmp_pop_df.drop(columns=['report_end', 'report_end_ftime'], errors='ignore')
# tmp_pop_df = tmp_pop_df.fillna(tmp_pop_df.median())

In [58]:
pop_geo_by_clients_4_df = pop_geo_by_clients_4_df.set_index(['client_id', 'report_next_end'])
pop_geo_by_clients_ftime_4_df = pop_geo_by_clients_ftime_4_df.set_index(['client_id', 'report_next_end']).add_suffix('_ftime')
tmp_pop_df = pop_geo_by_clients_ftime_4_df.merge(pop_geo_by_clients_4_df, left_index=True, right_index=True, how='left')
tmp_pop_df = tmp_pop_df.fillna(tmp_pop_df.median())


In [64]:
tmp_pop_df = pop_geo_by_clients_ftime_4_df.merge(pop_geo_by_clients_4_df, left_index=True, right_index=True, how='left')
tmp_pop_df = tmp_pop_df.fillna(tmp_pop_df.median())
# tmp_pop_df

Unnamed: 0_level_0,Unnamed: 1_level_0,cnt_geo_ftime,sum_geo_trx_ftime,geohash_4_ftime,freq_ftime,prc_use_ftime,cnt_geo,sum_geo_trx,geohash_4,freq,prc_use
client_id,report_next_end,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
000006265d27d1166ed67506682be7380007a5bead4362f0a9795f7d97fb08e3,2022-05-31,2,6,36164,5,0.833333,,,,,
00002ba5d2cf0fe2e9ad4a4397f4634f26e84f58ed48e3c8489fab384184d5ee,2022-05-31,9,535,3531,396,0.740187,3.0,148.0,3531.0,132.0,0.891892
000030a4067420da425d21ea72d5e647d26cf279e551796ca54d5c4385f1d476,2022-05-31,1,7,28902,7,1.000000,,,,,
0001183b2178ef534e5a50004957e3bd0cdb1b270cfbf00f1db32ec939403bf4,2022-05-31,1,12,21062,12,1.000000,,,,,
00011c01bb22d8f62d9655f32d123dcca5ae55179f8266bdb8676e25321e8477,2022-05-31,5,233,45801,201,0.862661,3.0,59.0,45801.0,56.0,0.949153
...,...,...,...,...,...,...,...,...,...,...,...
ffffa99ee602d379ea65e0fbdbfb0c82ed074e28cd3adac2978cd07d26fb8bf4,2022-05-31,1,105,40338,105,1.000000,1.0,36.0,40338.0,36.0,1.000000
ffffa9af8a057b55b18af946e157391cd9f5a5fd9b61cbbdef96c26116f04379,2022-05-31,4,102,40056,46,0.450980,2.0,34.0,18951.0,21.0,0.617647
ffffd4051d83a66a3dd0047c4f3c79902a4ef5df0878efdcefc1381f9907d993,2022-05-31,1,40,1480,40,1.000000,1.0,9.0,1480.0,9.0,1.000000
ffffda41a92ae10c8ae3920828129ef09c1517ba7c74cbe1ccc15ab6b519efb3,2022-05-31,2,8,10137,6,0.750000,2.0,2.0,10137.0,1.0,0.500000


In [None]:
pop_geo_by_clients_ftime_4_df

In [47]:
geo_by_clients_4_df

Unnamed: 0,client_id,cnt_geo,sum_geo_trx,geohash_4,freq,prc_use,report_end,report_next_end
0,00002ba5d2cf0fe2e9ad4a4397f4634f26e84f58ed48e3...,3,148,3531,132,0.891892,2022-04-30,2022-05-31
1,00002ba5d2cf0fe2e9ad4a4397f4634f26e84f58ed48e3...,3,148,21721,13,0.087838,2022-04-30,2022-05-31
2,00002ba5d2cf0fe2e9ad4a4397f4634f26e84f58ed48e3...,3,148,46370,3,0.020270,2022-04-30,2022-05-31
3,00011c01bb22d8f62d9655f32d123dcca5ae55179f8266...,3,59,9434,2,0.033898,2022-04-30,2022-05-31
4,00011c01bb22d8f62d9655f32d123dcca5ae55179f8266...,3,59,25679,1,0.016949,2022-04-30,2022-05-31
...,...,...,...,...,...,...,...,...
2653321,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,9,183,13471,4,0.021858,2022-04-30,2022-05-31
2653322,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,9,183,17708,21,0.114754,2022-04-30,2022-05-31
2653323,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,9,183,18784,15,0.081967,2022-04-30,2022-05-31
2653324,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,9,183,24250,136,0.743169,2022-04-30,2022-05-31


In [48]:
geo_by_clients_ftime_4_df

Unnamed: 0,client_id,cnt_geo,sum_geo_trx,geohash_4,freq,prc_use,report_end,report_next_end
0,000006265d27d1166ed67506682be7380007a5bead4362...,2,6,34108,1,0.166667,2022-04-30,2022-05-31
1,000006265d27d1166ed67506682be7380007a5bead4362...,2,6,36164,5,0.833333,2022-04-30,2022-05-31
2,00002ba5d2cf0fe2e9ad4a4397f4634f26e84f58ed48e3...,9,535,3531,396,0.740187,2022-04-30,2022-05-31
3,00002ba5d2cf0fe2e9ad4a4397f4634f26e84f58ed48e3...,9,535,17663,16,0.029907,2022-04-30,2022-05-31
4,00002ba5d2cf0fe2e9ad4a4397f4634f26e84f58ed48e3...,9,535,21434,1,0.001869,2022-04-30,2022-05-31
...,...,...,...,...,...,...,...,...
4612368,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,10,765,13471,24,0.031373,2022-04-30,2022-05-31
4612369,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,10,765,17708,76,0.099346,2022-04-30,2022-05-31
4612370,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,10,765,18784,17,0.022222,2022-04-30,2022-05-31
4612371,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,10,765,24250,621,0.811765,2022-04-30,2022-05-31


In [28]:
# geo_by_clients_4_df, pop_geo_by_clients_4_df
# geo_by_clients_ftime_4_df, pop_geo_by_clients_ftime_4_df 
geo_by_clients_4_df

Unnamed: 0,client_id,cnt_geo,sum_geo_trx,geohash_4,freq,prc_use,report_end,report_next_end
0,00002ba5d2cf0fe2e9ad4a4397f4634f26e84f58ed48e3...,3,148,3531,132,0.891892,2022-04-30,2022-05-31
1,00002ba5d2cf0fe2e9ad4a4397f4634f26e84f58ed48e3...,3,148,21721,13,0.087838,2022-04-30,2022-05-31
2,00002ba5d2cf0fe2e9ad4a4397f4634f26e84f58ed48e3...,3,148,46370,3,0.020270,2022-04-30,2022-05-31
3,00011c01bb22d8f62d9655f32d123dcca5ae55179f8266...,3,59,9434,2,0.033898,2022-04-30,2022-05-31
4,00011c01bb22d8f62d9655f32d123dcca5ae55179f8266...,3,59,25679,1,0.016949,2022-04-30,2022-05-31
...,...,...,...,...,...,...,...,...
2653321,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,9,183,13471,4,0.021858,2022-04-30,2022-05-31
2653322,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,9,183,17708,21,0.114754,2022-04-30,2022-05-31
2653323,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,9,183,18784,15,0.081967,2022-04-30,2022-05-31
2653324,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,9,183,24250,136,0.743169,2022-04-30,2022-05-31


In [49]:
geo_by_clients_4_df['client_id'][-2:-1].item()
geo_by_clients_4_df[geo_by_clients_4_df['client_id'] == 'fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053ab70fb6c7e740a70300']

Unnamed: 0,client_id,cnt_geo,sum_geo_trx,geohash_4,freq,prc_use,report_end,report_next_end
2653317,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,9,183,5038,2,0.010929,2022-04-30,2022-05-31
2653318,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,9,183,8780,1,0.005464,2022-04-30,2022-05-31
2653319,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,9,183,10002,1,0.005464,2022-04-30,2022-05-31
2653320,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,9,183,11870,1,0.005464,2022-04-30,2022-05-31
2653321,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,9,183,13471,4,0.021858,2022-04-30,2022-05-31
2653322,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,9,183,17708,21,0.114754,2022-04-30,2022-05-31
2653323,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,9,183,18784,15,0.081967,2022-04-30,2022-05-31
2653324,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,9,183,24250,136,0.743169,2022-04-30,2022-05-31
2653325,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,9,183,30040,2,0.010929,2022-04-30,2022-05-31


In [50]:
# geo_by_clients_4_df['client_id'][:1].item()
geo_by_clients_ftime_4_df[geo_by_clients_ftime_4_df['client_id'] == 'fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053ab70fb6c7e740a70300']

Unnamed: 0,client_id,cnt_geo,sum_geo_trx,geohash_4,freq,prc_use,report_end,report_next_end
4612363,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,10,765,4465,2,0.002614,2022-04-30,2022-05-31
4612364,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,10,765,5038,8,0.010458,2022-04-30,2022-05-31
4612365,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,10,765,8780,7,0.00915,2022-04-30,2022-05-31
4612366,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,10,765,10002,2,0.002614,2022-04-30,2022-05-31
4612367,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,10,765,11870,3,0.003922,2022-04-30,2022-05-31
4612368,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,10,765,13471,24,0.031373,2022-04-30,2022-05-31
4612369,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,10,765,17708,76,0.099346,2022-04-30,2022-05-31
4612370,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,10,765,18784,17,0.022222,2022-04-30,2022-05-31
4612371,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,10,765,24250,621,0.811765,2022-04-30,2022-05-31
4612372,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,10,765,30040,5,0.006536,2022-04-30,2022-05-31


In [31]:
geo_by_clients_ftime_4_df.set_index('client_id').add_suffix('_ftime')

Unnamed: 0_level_0,cnt_geo_ftime,sum_geo_trx_ftime,geohash_4_ftime,freq_ftime,prc_use_ftime,report_end_ftime,report_next_end_ftime
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
000006265d27d1166ed67506682be7380007a5bead4362f0a9795f7d97fb08e3,2,6,34108,1,0.166667,2022-04-30,2022-05-31
000006265d27d1166ed67506682be7380007a5bead4362f0a9795f7d97fb08e3,2,6,36164,5,0.833333,2022-04-30,2022-05-31
00002ba5d2cf0fe2e9ad4a4397f4634f26e84f58ed48e3c8489fab384184d5ee,9,535,3531,396,0.740187,2022-04-30,2022-05-31
00002ba5d2cf0fe2e9ad4a4397f4634f26e84f58ed48e3c8489fab384184d5ee,9,535,17663,16,0.029907,2022-04-30,2022-05-31
00002ba5d2cf0fe2e9ad4a4397f4634f26e84f58ed48e3c8489fab384184d5ee,9,535,21434,1,0.001869,2022-04-30,2022-05-31
...,...,...,...,...,...,...,...
fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053ab70fb6c7e740a70300,10,765,13471,24,0.031373,2022-04-30,2022-05-31
fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053ab70fb6c7e740a70300,10,765,17708,76,0.099346,2022-04-30,2022-05-31
fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053ab70fb6c7e740a70300,10,765,18784,17,0.022222,2022-04-30,2022-05-31
fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053ab70fb6c7e740a70300,10,765,24250,621,0.811765,2022-04-30,2022-05-31


In [110]:
# %%time
# # Популярные геохеши у клиентов
# # Расчет топ-5 популярных хешей для каждого клиента, и также расчет процента посещения этих топ-5 относительно всех посещаемых геохешей (по аналогии с софтмакс)
# start_date = datetime(2022, 1, 1, 0, 0, 0)
# end_date = datetime(2023, 1, 1, 0, 0, 0)

# union_geo_by_clients_4_df = pd.DataFrame()
# union_geo_by_clients_5_df = pd.DataFrame()
# union_geo_by_clients_6_df = pd.DataFrame()

# clients_top_geohash_4_df = pd.DataFrame()
# clients_top_geohash_5_df = pd.DataFrame()
# clients_top_geohash_6_df = pd.DataFrame()


# for i in trange(13):
#     end_date = start_date + relativedelta(months=1) - relativedelta(days=1)
#     print(f'start: {start_date}, end: {end_date}')
    
#     select_mon_geo_df = all_geo_df[all_geo_df['event_time'].between(start_date, end_date)]
#     print(select_mon_geo_df.shape)
    
#     def calc_aggregate_client_by_geohash(geo_index:str='geohash_4'):
#         client_geo_df = select_mon_geo_df.groupby(['client_id',geo_index]).size().reset_index(name='freq')

#         geohash_df = client_geo_df.groupby(['client_id']).agg(
#             cnt_geo = ('freq', len),
#             sum_geo_trx = ('freq', sum),
#         )

#         geo_by_clients = geohash_df.reset_index().merge(client_geo_df, on='client_id', how='left')
#         geo_by_clients['prc_use'] = geo_by_clients['freq'] / geo_by_clients['sum_geo_trx']

#         geo_by_clients['report_end'] = end_date
#         geo_by_clients['report_next_end'] = start_date + relativedelta(months=2) - relativedelta(days=1)
        
#         popular_geo_by_clients = geo_by_clients.loc[geo_by_clients.groupby('client_id')['prc_use'].transform('max').eq(geo_by_clients['prc_use'])]
#         # Могут быть ситуации когда одинаковые геохеши имеют одинаковый вес, тогда выбираем первый
#         popular_geo_by_clients = popular_geo_by_clients.drop_duplicates(subset=['client_id', 'prc_use'])
        
#         popular_geo_by_clients['report_end'] = end_date
#         popular_geo_by_clients['report_next_end'] = start_date + relativedelta(months=2) - relativedelta(days=1)
        
#         return geo_by_clients.fillna(0), popular_geo_by_clients.fillna(0)
    
#     geo_by_clients_4_df, pop_geo_by_clients_4_df = calc_aggregate_client_by_geohash(geo_index='geohash_4')
#     union_geo_by_clients_4_df = pd.concat([union_geo_by_clients_4_df, geo_by_clients_4_df])
#     clients_top_geohash_4_df = pd.concat([clients_top_geohash_4_df, pop_geo_by_clients_4_df])

#     geo_by_clients_5_df, pop_geo_by_clients_5_df = calc_aggregate_client_by_geohash(geo_index='geohash_5')
#     union_geo_by_clients_5_df = pd.concat([union_geo_by_clients_5_df, geo_by_clients_5_df])
#     clients_top_geohash_5_df = pd.concat([clients_top_geohash_5_df, pop_geo_by_clients_5_df])

#     geo_by_clients_6_df, pop_geo_by_clients_6_df = calc_aggregate_client_by_geohash(geo_index='geohash_6')
#     union_geo_by_clients_6_df = pd.concat([union_geo_by_clients_6_df, geo_by_clients_6_df])
#     clients_top_geohash_6_df = pd.concat([clients_top_geohash_6_df, pop_geo_by_clients_6_df])

#     start_date = start_date + relativedelta(months=1)
    
# union_geo_by_clients_4_df.shape, union_geo_by_clients_5_df.shape, union_geo_by_clients_6_df.shape, clients_top_geohash_4_df.shape, clients_top_geohash_5_df.shape, clients_top_geohash_6_df.shape, 

  0%|          | 0/13 [00:00<?, ?it/s]

start: 2022-01-01 00:00:00, end: 2022-01-31 00:00:00
(46236373, 5)


  8%|▊         | 1/13 [00:58<11:46, 58.89s/it]

start: 2022-02-01 00:00:00, end: 2022-02-28 00:00:00
(56463257, 5)


 15%|█▌        | 2/13 [02:04<11:29, 62.70s/it]

start: 2022-03-01 00:00:00, end: 2022-03-31 00:00:00
(67114247, 5)


 23%|██▎       | 3/13 [03:17<11:15, 67.57s/it]

start: 2022-04-01 00:00:00, end: 2022-04-30 00:00:00
(59103493, 5)


 31%|███       | 4/13 [04:28<10:18, 68.69s/it]

start: 2022-05-01 00:00:00, end: 2022-05-31 00:00:00
(61546843, 5)


 38%|███▊      | 5/13 [05:42<09:26, 70.87s/it]

start: 2022-06-01 00:00:00, end: 2022-06-30 00:00:00
(61781089, 5)


 46%|████▌     | 6/13 [06:59<08:29, 72.83s/it]

start: 2022-07-01 00:00:00, end: 2022-07-31 00:00:00
(65913203, 5)


 54%|█████▍    | 7/13 [08:18<07:28, 74.79s/it]

start: 2022-08-01 00:00:00, end: 2022-08-31 00:00:00
(67921454, 5)


 62%|██████▏   | 8/13 [09:39<06:24, 76.83s/it]

start: 2022-09-01 00:00:00, end: 2022-09-30 00:00:00
(53479282, 5)


 69%|██████▉   | 9/13 [10:51<05:01, 75.42s/it]

start: 2022-10-01 00:00:00, end: 2022-10-31 00:00:00
(52287223, 5)


 77%|███████▋  | 10/13 [12:07<03:46, 75.64s/it]

start: 2022-11-01 00:00:00, end: 2022-11-30 00:00:00
(44756291, 5)


 85%|████████▍ | 11/13 [13:12<02:24, 72.19s/it]

start: 2022-12-01 00:00:00, end: 2022-12-31 00:00:00
(9262981, 5)


 92%|█████████▏| 12/13 [13:47<01:00, 60.88s/it]

start: 2023-01-01 00:00:00, end: 2023-01-31 00:00:00
(0, 5)


100%|██████████| 13/13 [13:58<00:00, 64.49s/it]

CPU times: total: 13min 59s
Wall time: 13min 58s





((31130494, 8),
 (65577394, 8),
 (130019962, 8),
 (7472391, 8),
 (7472391, 8),
 (7472391, 8))

In [None]:
assert False, "Биннанизация гео индексов"

In [112]:
%%time
union_geo_by_clients_4_df.to_csv(PATH_DATASET_OUTPUT + 'union_geo_by_clients_4_df.csv', index=False)
union_geo_by_clients_5_df.to_csv(PATH_DATASET_OUTPUT + 'union_geo_by_clients_5_df.csv', index=False)
union_geo_by_clients_6_df.to_csv(PATH_DATASET_OUTPUT + 'union_geo_by_clients_6_df.csv', index=False)

clients_top_geohash_4_df.to_csv(PATH_DATASET_OUTPUT + 'clients_top_geohash_4_df.csv', index=False)
clients_top_geohash_5_df.to_csv(PATH_DATASET_OUTPUT + 'clients_top_geohash_5_df.csv', index=False)
clients_top_geohash_6_df.to_csv(PATH_DATASET_OUTPUT + 'clients_top_geohash_6_df.csv', index=False)

CPU times: total: 14min 50s
Wall time: 15min 4s


# Подвал

In [38]:
%%time
# geohash_df = select_mon_geo_df.groupby('client_id').agg(
#                                 count_trx = ('client_id', len),
#                                 uniq_clients = ('client_id', pd.Series.nunique),
#     geohash_4
#                                       )
tmp_df = select_mon_geo_df.groupby(['client_id','geohash_4']).size().reset_index(name='freq')
tmp_df

CPU times: total: 9 s
Wall time: 9.02 s


Unnamed: 0,client_id,geohash_4,freq
0,000006265d27d1166ed67506682be7380007a5bead4362...,36164,1
1,00002ba5d2cf0fe2e9ad4a4397f4634f26e84f58ed48e3...,3531,97
2,00002ba5d2cf0fe2e9ad4a4397f4634f26e84f58ed48e3...,17663,9
3,00002ba5d2cf0fe2e9ad4a4397f4634f26e84f58ed48e3...,21721,4
4,00002ba5d2cf0fe2e9ad4a4397f4634f26e84f58ed48e3...,22008,2
...,...,...,...
2452602,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,5038,2
2452603,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,8780,3
2452604,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,13471,7
2452605,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,17708,22


In [39]:
# tmp_df['client_id'][-1:].item()
tmp_df[tmp_df['client_id'] == 'fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053ab70fb6c7e740a70300']

Unnamed: 0,client_id,geohash_4,freq
2452601,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,4465,1
2452602,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,5038,2
2452603,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,8780,3
2452604,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,13471,7
2452605,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,17708,22
2452606,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,24250,151


In [47]:
tmp_df_2 = tmp_df.groupby(['client_id']).agg(
    cnt_geo = ('freq', len),
    sum_geo_trx = ('freq', sum),
)
tmp_df_2

  tmp_df_2 = tmp_df.groupby(['client_id']).agg(


Unnamed: 0_level_0,cnt_geo,sum_geo_trx
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1
000006265d27d1166ed67506682be7380007a5bead4362f0a9795f7d97fb08e3,1,1
00002ba5d2cf0fe2e9ad4a4397f4634f26e84f58ed48e3c8489fab384184d5ee,8,153
00011c01bb22d8f62d9655f32d123dcca5ae55179f8266bdb8676e25321e8477,4,57
000120ff37fb0179d54e94c6e7266c92f3e3c3c630223fb480807226e6024101,4,55
000128505752a589529a45682553d457fe827dc5250f8664b94354c6ecc7f6f6,1,3
...,...,...
ffffa99ee602d379ea65e0fbdbfb0c82ed074e28cd3adac2978cd07d26fb8bf4,1,32
ffffa9af8a057b55b18af946e157391cd9f5a5fd9b61cbbdef96c26116f04379,3,28
ffffd4051d83a66a3dd0047c4f3c79902a4ef5df0878efdcefc1381f9907d993,1,7
ffffda41a92ae10c8ae3920828129ef09c1517ba7c74cbe1ccc15ab6b519efb3,1,3


In [54]:
tmp_df_3 = tmp_df.merge(tmp_df_2.reset_index(), on='client_id', how='left')
tmp_df_3['prc_use'] = tmp_df_3['freq'] / tmp_df_3['sum_geo_trx']
tmp_df_3

Unnamed: 0,client_id,geohash_4,freq,cnt_geo,sum_geo_trx,prc_use
0,000006265d27d1166ed67506682be7380007a5bead4362...,36164,1,1,1,1.000000
1,00002ba5d2cf0fe2e9ad4a4397f4634f26e84f58ed48e3...,3531,97,8,153,0.633987
2,00002ba5d2cf0fe2e9ad4a4397f4634f26e84f58ed48e3...,17663,9,8,153,0.058824
3,00002ba5d2cf0fe2e9ad4a4397f4634f26e84f58ed48e3...,21721,4,8,153,0.026144
4,00002ba5d2cf0fe2e9ad4a4397f4634f26e84f58ed48e3...,22008,2,8,153,0.013072
...,...,...,...,...,...,...
2452602,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,5038,2,6,186,0.010753
2452603,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,8780,3,6,186,0.016129
2452604,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,13471,7,6,186,0.037634
2452605,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,17708,22,6,186,0.118280


In [55]:
# tmp_df_3.iloc[2]['client_id']
tmp_df_3[tmp_df_3['client_id'] == '00002ba5d2cf0fe2e9ad4a4397f4634f26e84f58ed48e3c8489fab384184d5ee']

Unnamed: 0,client_id,geohash_4,freq,cnt_geo,sum_geo_trx,prc_use
1,00002ba5d2cf0fe2e9ad4a4397f4634f26e84f58ed48e3...,3531,97,8,153,0.633987
2,00002ba5d2cf0fe2e9ad4a4397f4634f26e84f58ed48e3...,17663,9,8,153,0.058824
3,00002ba5d2cf0fe2e9ad4a4397f4634f26e84f58ed48e3...,21721,4,8,153,0.026144
4,00002ba5d2cf0fe2e9ad4a4397f4634f26e84f58ed48e3...,22008,2,8,153,0.013072
5,00002ba5d2cf0fe2e9ad4a4397f4634f26e84f58ed48e3...,24286,7,8,153,0.045752
6,00002ba5d2cf0fe2e9ad4a4397f4634f26e84f58ed48e3...,41582,14,8,153,0.091503
7,00002ba5d2cf0fe2e9ad4a4397f4634f26e84f58ed48e3...,46370,13,8,153,0.084967
8,00002ba5d2cf0fe2e9ad4a4397f4634f26e84f58ed48e3...,50130,7,8,153,0.045752


In [106]:
%%time
client_geo_df = select_mon_geo_df.groupby(['client_id','geohash_4']).size().reset_index(name='freq')

geohash_4_df = client_geo_df.groupby(['client_id']).agg(
    cnt_geo = ('freq', len),
    sum_geo_trx = ('freq', sum),
)

geo_4_by_clients = geohash_4_df.reset_index().merge(client_geo_df, on='client_id', how='left')
geo_4_by_clients['prc_use'] = geo_4_by_clients['freq'] / geo_4_by_clients['sum_geo_trx']

popular_geo_4_by_clients = geo_4_by_clients.loc[geo_4_by_clients.groupby('client_id')['prc_use'].transform('max').eq(geo_4_by_clients['prc_use'])]
# Могут быть ситуации когда одинаковые геохеши имеют одинаковый вес, тогда выбираем первый
popular_geo_4_by_clients = popular_geo_4_by_clients.drop_duplicates(subset=['client_id', 'prc_use'])
popular_geo_4_by_clients.shape



CPU times: total: 16.6 s
Wall time: 16.6 s


(661187, 6)

In [107]:
# geo_4_by_clients = geohash_4_df.reset_index().merge(client_geo_df, on='client_id', how='left')
# geo_4_by_clients['prc_use'] = geo_4_by_clients['freq'] / geo_4_by_clients['sum_geo_trx']

# popular_geo_4_by_clients = geo_4_by_clients.loc[geo_4_by_clients.groupby('client_id')['prc_use'].transform('max').eq(geo_4_by_clients['prc_use'])]
# # # Могут быть ситуации когда одинаковые геохеши имеют одинаковый вес, тогда выбираем первый
# # popular_geo_4_by_clients = popular_geo_4_by_clients.drop_duplicates(subset=['client_id', 'prc_use'])
# # popular_geo_4_by_clients.shape

In [90]:
# popular_geo_4_by_clients = geo_4_by_clients.loc[geo_4_by_clients.groupby('client_id')['prc_use'].transform('max').eq(geo_4_by_clients['prc_use'])]
# popular_geo_4_by_clients.shape

(678469, 6)

In [65]:
popular_geo_4_by_clients['client_id'].shape, popular_geo_4_by_clients['client_id'].unique().shape

((678469,), (661187,))

In [81]:
# geo_4_by_clients.shape
geo_4_by_clients = geohash_4_df.reset_index().merge(client_geo_df, on='client_id', how='left')
geohash_4_df.shape, client_geo_df.shape, geo_4_by_clients.shape
# geohash_4_df.reset_index()['client_id'].unique().shape

((661187, 2), (2452607, 3), (2452607, 5))

In [75]:
client_geo_df = select_mon_geo_df.groupby(['client_id','geohash_4']).size().reset_index(name='freq')
geohash_4_df = client_geo_df.groupby(['client_id']).agg(
    cnt_geo = ('freq', len),
    sum_geo_trx = ('freq', sum),
)
geohash_4_df.shape

  geohash_4_df = client_geo_df.groupby(['client_id']).agg(


(661187, 2)

In [98]:
popular_geo_4_by_clients[popular_geo_4_by_clients.duplicated(subset=['client_id', 'prc_use'])]['client_id'][:1].item()

'000336c3931ed18c3f7802910502fc0d5f00b0befa48f47e33c7e04e9a6babde'

In [101]:
popular_geo_4_by_clients[popular_geo_4_by_clients['client_id'] == '000336c3931ed18c3f7802910502fc0d5f00b0befa48f47e33c7e04e9a6babde']

Unnamed: 0,client_id,cnt_geo,sum_geo_trx,geohash_4,freq,prc_use
98,000336c3931ed18c3f7802910502fc0d5f00b0befa48f4...,2,2,3531,1,0.5
99,000336c3931ed18c3f7802910502fc0d5f00b0befa48f4...,2,2,16410,1,0.5


In [100]:
ttt = popular_geo_4_by_clients.drop_duplicates(subset=['client_id', 'prc_use'])
ttt[ttt['client_id'] == '000336c3931ed18c3f7802910502fc0d5f00b0befa48f47e33c7e04e9a6babde']

Unnamed: 0,client_id,cnt_geo,sum_geo_trx,geohash_4,freq,prc_use
98,000336c3931ed18c3f7802910502fc0d5f00b0befa48f4...,2,2,3531,1,0.5


In [40]:
tmp_df_2 = tmp_df.groupby(['client_id']).size().reset_index(name='cnt_geo')
tmp_df_2

Unnamed: 0,client_id,cnt_geo
0,000006265d27d1166ed67506682be7380007a5bead4362...,1
1,00002ba5d2cf0fe2e9ad4a4397f4634f26e84f58ed48e3...,8
2,00011c01bb22d8f62d9655f32d123dcca5ae55179f8266...,4
3,000120ff37fb0179d54e94c6e7266c92f3e3c3c630223f...,4
4,000128505752a589529a45682553d457fe827dc5250f86...,1
...,...,...
661182,ffffa99ee602d379ea65e0fbdbfb0c82ed074e28cd3ada...,1
661183,ffffa9af8a057b55b18af946e157391cd9f5a5fd9b61cb...,3
661184,ffffd4051d83a66a3dd0047c4f3c79902a4ef5df0878ef...,1
661185,ffffda41a92ae10c8ae3920828129ef09c1517ba7c74cb...,1


In [None]:
tmp_df_2

In [41]:
# tmp_df['client_id'][-1:].item()
tmp_df_2[tmp_df_2['client_id'] == 'fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053ab70fb6c7e740a70300']

Unnamed: 0,client_id,cnt_geo
661186,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,6


In [43]:
# tmp_df_2[tmp_df_2['cnt_geo'] < 10].hist(bins=100)
tmp_df_2['cnt_geo'].median()

3.0

In [46]:
tmp_df['freq'].median()

5.0

In [101]:
# start_date = start_date + relativedelta(months=1)
# end_date = start_date + relativedelta(months=1) - relativedelta(days=1)
print(f'start: {start_date}, end: {end_date}')
print(f'start: {start_date  + relativedelta(months=1)}, end: {end_date  + relativedelta(months=1)}')

start: 2022-02-01 00:00:00, end: 2022-02-28 00:00:00
start: 2022-03-01 00:00:00, end: 2022-03-28 00:00:00


In [None]:
select_mon_geo_df

In [108]:
train_client_by_cur_mon_df = train_target_df[pd.to_datetime(train_target_df['mon']).between(start_date, start_date + relativedelta(months=1) - relativedelta(days=1))]
train_client_by_next_mon_df = train_target_df[pd.to_datetime(train_target_df['mon']).between(start_date + relativedelta(months=1) , start_date + relativedelta(months=2) - relativedelta(days=1) )]
current_mon_train_df = train_client_by_cur_mon_df.rename(columns={'mon':'mon_report'}).add_prefix('cur_mon_')
next_mon_train_df = train_client_by_next_mon_df.rename(columns={'mon':'mon_report'}).add_prefix('next_mon_')

train_client_by_cur_mon_df.shape, train_client_by_next_mon_df.shape

((853892, 6), (853892, 6))

In [162]:
start_date, start_date + relativedelta(months=1) - relativedelta(days=1)

(datetime.datetime(2022, 2, 1, 0, 0), datetime.datetime(2022, 2, 28, 0, 0))

In [121]:
# Рассчитываем уровень "продоваемости продукта" относительно геопозиции
# Связываем
client_geo_4_df = select_mon_geo_df[['client_id', 'geohash_4']].drop_duplicates()
cur_mon_select_geo4__target_df = client_geo_4_df.merge(current_mon_train_df, left_on='client_id', right_on='cur_mon_client_id', how='left')
cur_mon_select_geo4__target_df = cur_mon_select_geo4__target_df.fillna(0)
client_geo_4_df.shape, cur_mon_select_geo4__target_df.shape

((2452607, 2), (2452607, 8))

In [169]:
cur_mon_select_geo4__target_df[cur_mon_select_geo4__target_df['geohash_4'] == 50183]['cur_mon_target_1'].sum()

8.0

In [173]:
uniq_list = cur_mon_select_geo4__target_df[cur_mon_select_geo4__target_df['geohash_4'] == 50183]['client_id'].unique()
len(uniq_list)

764

In [183]:
%%time
tmp_df = cur_mon_select_geo4__target_df.groupby('geohash_4').agg(
                                    cur_sum_target_1 = ('cur_mon_target_1', sum),
                                    cur_sum_target_2 = ('cur_mon_target_2', sum),
                                    cur_sum_target_3 = ('cur_mon_target_3', sum),
                                    cur_sum_target_4 = ('cur_mon_target_4', sum),
    
                                    cur_median_target_1 = ('cur_mon_target_1', np.median),
                                    cur_median_target_2 = ('cur_mon_target_2', np.median),
                                    cur_median_target_3 = ('cur_mon_target_3', np.median),
                                    cur_median_target_4 = ('cur_mon_target_4', np.median),
    
                                    cur_var_target_1 = ('cur_mon_target_1', 'var'),
                                    cur_var_target_2 = ('cur_mon_target_2', 'var'),
                                    cur_var_target_3 = ('cur_mon_target_3', 'var'),
                                    cur_var_target_4 = ('cur_mon_target_4', 'var'),
                                    
                                    uniq_clients = ('client_id', pd.Series.nunique),
)
tmp_df = tmp_df.reset_index()
tmp_df = tmp_df.fillna(0)
# Расчитываем значение "популярности" геохешей относительно кол-ва клиентов
columns = ['cur_sum_target_1', 'cur_sum_target_2', 'cur_sum_target_3', 'cur_sum_target_4', 'cur_median_target_1', 'cur_median_target_2', 'cur_median_target_3', 'cur_median_target_4', 'cur_var_target_1', 'cur_var_target_2', 'cur_var_target_3', 'cur_var_target_4', ]
for col in columns:
    tmp_df[f'{col}__by_clients'] = tmp_df[col] / tmp_df['uniq_clients']
    
tmp_df.shape



CPU times: total: 1.33 s
Wall time: 1.35 s


(19728, 26)

In [191]:
union_geohash_4_df.reset_index().merge(tmp_df, on='geohash_4', how='left')

Unnamed: 0,geohash_4,count_trx,uniq_clients_x,report_end,report_next_end,cur_sum_target_1,cur_sum_target_2,cur_sum_target_3,cur_sum_target_4,cur_median_target_1,cur_median_target_2,cur_median_target_3,cur_median_target_4,cur_var_target_1,cur_var_target_2,cur_var_target_3,cur_var_target_4,uniq_clients_y,cur_sum_target_1__by_clients,cur_sum_target_2__by_clients,cur_sum_target_3__by_clients,cur_sum_target_4__by_clients,cur_median_target_1__by_clients,cur_median_target_2__by_clients,cur_median_target_3__by_clients,cur_median_target_4__by_clients,cur_var_target_1__by_clients,cur_var_target_2__by_clients,cur_var_target_3__by_clients,cur_var_target_4__by_clients
0,2,26,11,2022-02-28,2022-03-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,11,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
1,3,64,29,2022-02-28,2022-03-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,29,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
2,4,34,3,2022-02-28,2022-03-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,3,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
3,6,17,1,2022-02-28,2022-03-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,1,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
4,11,28,3,2022-02-28,2022-03-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,3,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19723,50180,1918,123,2022-02-28,2022-03-31,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008130,0.000000,0.000000,0.000000,123,0.008130,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000066,0.000000,0.000000,0.000000
19724,50183,5135,764,2022-02-28,2022-03-31,8.0,2.0,8.0,2.0,0.0,0.0,0.0,0.0,0.010375,0.002614,0.010375,0.002614,764,0.010471,0.002618,0.010471,0.002618,0.0,0.0,0.0,0.0,0.000014,0.000003,0.000014,0.000003
19725,50184,2,1,2022-02-28,2022-03-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,1,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
19726,50185,143,3,2022-02-28,2022-03-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,3,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000


In [189]:
tmp_df

Unnamed: 0,geohash_4,cur_sum_target_1,cur_sum_target_2,cur_sum_target_3,cur_sum_target_4,cur_median_target_1,cur_median_target_2,cur_median_target_3,cur_median_target_4,cur_var_target_1,cur_var_target_2,cur_var_target_3,cur_var_target_4,uniq_clients,cur_sum_target_1__by_clients,cur_sum_target_2__by_clients,cur_sum_target_3__by_clients,cur_sum_target_4__by_clients,cur_median_target_1__by_clients,cur_median_target_2__by_clients,cur_median_target_3__by_clients,cur_median_target_4__by_clients,cur_var_target_1__by_clients,cur_var_target_2__by_clients,cur_var_target_3__by_clients,cur_var_target_4__by_clients
0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,11,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
1,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,29,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
2,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,3,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
3,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,1,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
4,11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,3,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19723,50180,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008130,0.000000,0.000000,0.000000,123,0.008130,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000066,0.000000,0.000000,0.000000
19724,50183,8.0,2.0,8.0,2.0,0.0,0.0,0.0,0.0,0.010375,0.002614,0.010375,0.002614,764,0.010471,0.002618,0.010471,0.002618,0.0,0.0,0.0,0.0,0.000014,0.000003,0.000014,0.000003
19725,50184,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,1,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
19726,50185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,3,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000


In [181]:
columns = ['cur_sum_target_1', 'cur_sum_target_2', 'cur_sum_target_3', 'cur_sum_target_4', 'cur_median_target_1', 'cur_median_target_2', 'cur_median_target_3', 'cur_median_target_4', 'cur_var_target_1', 'cur_var_target_2', 'cur_var_target_3', 'cur_var_target_4', ]
for col in columns:
    tmp_df[f'{col}__by_clients'] = tmp_df[col] / tmp_df['uniq_clients']

In [133]:
# client_geo_4_df['geohash_4'].unique()#.shape

array([22879, 39879, 21721, ..., 23719, 30793,  6675])

In [178]:
# tmp_df[tmp_df['geohash_4'] == 21721]
# 50183

select_mon_geo_df.loc[50183]['client_id'].values

array(['b36b56a54f0523659354b074803721616fc4653a0726ca0b2af9b34c5d48ec9d',
       '4470eb7942a777492c724e49d65d43359aa952414d8892f8b0e4e19a5340dc0f',
       'c7d48ecc5c1316a998a4703305d840456d9570ed6a31ca836b5c9aad44bf885a',
       '149e6e0908465d21b5fe7bf91b4716b5503fcd9dec0ae1052d8f865a2af69232',
       'c14c25caab4b5996315ca6c4d0220d2eb461006f43bd462220668be75c341cef',
       'db476883253d903704f932c9be597da3dcbad32871cac2df220b8df6bc6f1833',
       '6dfd8527f394794dd185af835857c5ad39d711ae1542ca5c7f6357ac07d93bb8'],
      dtype=object)

In [176]:
train_target_df[
    (train_target_df['mon'] == '2022-02-28')&
    (train_target_df['client_id'].isin(uniq_list))]['target_1'].sum()

8

In [182]:
tmp_df.fillna(0)

Unnamed: 0,geohash_4,cur_sum_target_1,cur_sum_target_2,cur_sum_target_3,cur_sum_target_4,cur_median_target_1,cur_median_target_2,cur_median_target_3,cur_median_target_4,cur_var_target_1,cur_var_target_2,cur_var_target_3,cur_var_target_4,count_trx,uniq_clients,cur_sum_target_1__by_clients,cur_sum_target_2__by_clients,cur_sum_target_3__by_clients,cur_sum_target_4__by_clients,cur_median_target_1__by_clients,cur_median_target_2__by_clients,cur_median_target_3__by_clients,cur_median_target_4__by_clients,cur_var_target_1__by_clients,cur_var_target_2__by_clients,cur_var_target_3__by_clients,cur_var_target_4__by_clients
0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,11,11,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
1,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,29,29,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
2,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,3,3,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
3,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,1,1,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
4,11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,3,3,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19723,50180,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008130,0.000000,0.000000,0.000000,123,123,0.008130,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000066,0.000000,0.000000,0.000000
19724,50183,8.0,2.0,8.0,2.0,0.0,0.0,0.0,0.0,0.010375,0.002614,0.010375,0.002614,764,764,0.010471,0.002618,0.010471,0.002618,0.0,0.0,0.0,0.0,0.000014,0.000003,0.000014,0.000003
19725,50184,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,1,1,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
19726,50185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,3,3,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000


In [93]:
train_client_by_mon_df

Unnamed: 0,mon,target_1,target_2,target_3,target_4,client_id
8,2022-02-28,0,0,0,0,1d55174bce3ef488233380aca206ca2fb51661b1c46495...
20,2022-02-28,0,0,0,0,1d5d052f87d6bd22a30b5df160fffdcdd11014feffe75c...
32,2022-02-28,0,0,0,0,1d68b588164639d64879b33da867818ea7401b6eb7caff...
44,2022-02-28,0,0,0,0,1d817e82e1cc594e32e10217b716faa3ad079261975097...
56,2022-02-28,0,0,0,0,1d87528929da4c0107c498bbac5bec8bf43d0b99440097...
...,...,...,...,...,...,...
1024614,2022-02-28,0,0,0,0,ffade23e54114463d2271842ab88b38c0225cbe934f365...
1024626,2022-02-28,0,0,0,0,ffbba08772b67a2a7b057a3f616f90b0601c34d5032757...
1024638,2022-02-28,0,0,0,0,ffc09c3decd0f53763e4737accf686bade81f1fc86da94...
1024650,2022-02-28,0,0,0,0,ffc0a010061b30747ae785141ba9cba2f149f331a48f7f...


In [92]:
geohash_4_df

Unnamed: 0_level_0,count_trx,uniq_clients,report_end,report_next_end
geohash_4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,10,8,2022-01-31,2022-02-28
3,58,26,2022-01-31,2022-02-28
4,80,2,2022-01-31,2022-02-28
5,1,1,2022-01-31,2022-02-28
6,12,2,2022-01-31,2022-02-28
...,...,...,...,...
50180,1345,143,2022-01-31,2022-02-28
50183,4472,734,2022-01-31,2022-02-28
50185,160,2,2022-01-31,2022-02-28
50186,290,66,2022-01-31,2022-02-28


In [None]:
train_target_df

In [214]:
union_geohash_4_df.to_csv('union_geohash_4_df.csv')
union_geohash_5_df.to_csv('union_geohash_5_df.csv')
union_geohash_6_df.to_csv('union_geohash_6_df.csv')

In [209]:
%%time
# select_mon_geo_df.shape


CPU times: total: 9.25 s
Wall time: 9.26 s


In [210]:
geohash_4

Unnamed: 0_level_0,count_trx,uniq_clients,report_end,report_next_end
geohash_4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,10,8,2022-01-31,2022-03-31
3,58,26,2022-01-31,2022-03-31
4,80,2,2022-01-31,2022-03-31
5,1,1,2022-01-31,2022-03-31
6,12,2,2022-01-31,2022-03-31
...,...,...,...,...
50180,1345,143,2022-01-31,2022-03-31
50183,4472,734,2022-01-31,2022-03-31
50185,160,2,2022-01-31,2022-03-31
50186,290,66,2022-01-31,2022-03-31


In [None]:
19642 

In [207]:
select_mon_geo_df['geohash_4'].unique().shape

(19642,)

In [205]:
select_mon_geo_df

Unnamed: 0,client_id,event_time,geohash_4,geohash_5,geohash_6
62,309c0e909835757db908884e80f28f2fc76b803904b4b5...,2022-01-28 06:43:46.598910,39879,144891,1959174
198,309c0e909835757db908884e80f28f2fc76b803904b4b5...,2022-01-17 18:54:27.214346,21721,317754,616291
199,309c0e909835757db908884e80f28f2fc76b803904b4b5...,2022-01-13 19:48:49.476810,21721,317754,616291
202,309c0e909835757db908884e80f28f2fc76b803904b4b5...,2022-01-16 14:17:40.073260,21721,317754,616291
203,309c0e909835757db908884e80f28f2fc76b803904b4b5...,2022-01-29 13:29:11.601840,21721,317754,616291
...,...,...,...,...,...
3523308,6957505c29fb2ad50eec3512e9fc416527a3b1844d6d39...,2022-01-24 10:31:17.144302,22879,88678,1466484
3523381,6957505c29fb2ad50eec3512e9fc416527a3b1844d6d39...,2022-01-18 12:14:23.198445,21721,365300,9079
3523382,6957505c29fb2ad50eec3512e9fc416527a3b1844d6d39...,2022-01-21 14:48:50.397522,22879,88678,1466484
3523568,6957505c29fb2ad50eec3512e9fc416527a3b1844d6d39...,2022-01-24 11:12:30.132430,22879,88678,1466484


In [121]:
len(set(test_target_df['client_id']) & set(train_target_df['client_id']))

0

In [123]:
test_target_df['mon'].min(), test_target_df['mon'].max()

('2022-02-28', '2022-12-31')

In [124]:
train_target_df['mon'].min(), train_target_df['mon'].max()

('2022-02-28', '2023-01-31')

In [132]:
# train_0_df

In [126]:
train_0_df = pq.read_table(train_geo_files[0]).to_pandas()
train_1_df = pq.read_table(train_geo_files[1]).to_pandas()
train_0_df.shape, train_1_df.shape

((17331967, 5), (17331967, 5))

In [135]:
set(train_0_df['client_id'].values) & set(train_1_df['client_id'].values)

{'154eb088f55cad189cb2e1ef24cd37b5948e18da74fba9693f5282659a90975f',
 'b0b0254cbf57319bf956e9c1fee69f21e8be624876305f7b99decb51d528bb3c',
 'c973ab601639ae5abca8140ca1ed7db84138c43e3bdf3c8b37e76951e2e5ef59'}

In [145]:
# %%time
# geo_target_test_df = load_df_by_files(train_geo_files)
# geo_target_test_df.shape

In [88]:
528_278_383
geo_target_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 528278383 entries, 0 to 17331966
Data columns (total 5 columns):
 #   Column      Dtype         
---  ------      -----         
 0   client_id   object        
 1   event_time  datetime64[us]
 2   geohash_4   int32         
 3   geohash_5   int32         
 4   geohash_6   int32         
dtypes: datetime64[us](1), int32(3), object(1)
memory usage: 17.7+ GB


In [89]:
geo_target_df

Unnamed: 0,client_id,event_time,geohash_4,geohash_5,geohash_6
0,309c0e909835757db908884e80f28f2fc76b803904b4b5...,2022-08-27 09:56:36.271169,39879,144891,1959174
1,309c0e909835757db908884e80f28f2fc76b803904b4b5...,2022-08-14 07:13:23.011804,39879,144891,1959174
2,309c0e909835757db908884e80f28f2fc76b803904b4b5...,2022-08-02 07:46:18.278369,39879,144891,1959174
3,309c0e909835757db908884e80f28f2fc76b803904b4b5...,2022-08-19 08:47:39.973788,39879,144891,1959174
4,309c0e909835757db908884e80f28f2fc76b803904b4b5...,2022-08-19 10:15:14.676360,39879,144891,1959174
...,...,...,...,...,...
17331962,3ec6d4ae2429af5415106045c48d0794bb28980167260c...,2022-03-14 16:55:11.458273,14052,217129,363467
17331963,3ec6d4ae2429af5415106045c48d0794bb28980167260c...,2022-02-28 14:11:40.768307,14052,217129,363467
17331964,3ec6d4ae2429af5415106045c48d0794bb28980167260c...,2022-03-18 17:35:14.064008,14052,217129,363467
17331965,3ec6d4ae2429af5415106045c48d0794bb28980167260c...,2022-09-20 14:15:09.265986,14052,217129,363467


In [90]:
geo_target_df['geohash_4'].unique().shape

(35302,)

In [86]:
train_target_df['mon'].min()

'2022-02-28'

In [None]:
train_0_df

In [None]:
# target_df


In [11]:
train_0_df = pq.read_table(train_geo_files[0]).to_pandas()
train_1_df = pq.read_table(train_geo_files[1]).to_pandas()
train_0_df.shape, train_1_df.shape

((17331967, 5), (17331967, 5))

In [13]:
train_0_df['client_id'].unique().shape, train_1_df['client_id'].unique().shape

((20393,), (20324,))

In [32]:
select_client_geo_df = train_0_df[train_0_df['client_id'] == '309c0e909835757db908884e80f28f2fc76b803904b4b57299b5a5c1bed2c572']
# int('309c0e909835757db908884e80f28f2fc76b803904b4b5', 16)
# train_0_df[0:1]['client_id'].item()
select_client_geo_df['geohash_4'].value_counts()

geohash_4
21721    2513
39879     312
22879     186
25679     152
17663      87
13692      61
7289       52
3531       44
46512      42
25447      25
8026       18
22008      17
23039      17
41370      16
20350      13
21434      13
38599      11
49191      10
31142       9
15361       8
3162        8
11163       7
45178       6
33405       6
46370       4
28756       4
29254       3
41797       2
8119        2
14950       2
29025       2
25933       1
6632        1
Name: count, dtype: int64

In [41]:
select_client_geo_df['geohash_4'].value_counts()

geohash_4
21721    2513
39879     312
22879     186
25679     152
17663      87
13692      61
7289       52
3531       44
46512      42
25447      25
8026       18
22008      17
23039      17
41370      16
20350      13
21434      13
38599      11
49191      10
31142       9
15361       8
3162        8
11163       7
45178       6
33405       6
46370       4
28756       4
29254       3
41797       2
8119        2
14950       2
29025       2
25933       1
6632        1
Name: count, dtype: int64

In [46]:
select_client_geo_df[select_client_geo_df['geohash_5'] == 257065].sort_values(by='event_time')

Unnamed: 0,client_id,event_time,geohash_4,geohash_5,geohash_6
292753,309c0e909835757db908884e80f28f2fc76b803904b4b5...,2022-01-12 12:19:29.404373,7289,257065,991114
292751,309c0e909835757db908884e80f28f2fc76b803904b4b5...,2022-01-17 13:11:31.912387,7289,257065,991114
292380,309c0e909835757db908884e80f28f2fc76b803904b4b5...,2022-01-23 12:27:04.631470,7289,257065,1162282
292757,309c0e909835757db908884e80f28f2fc76b803904b4b5...,2022-01-25 14:53:16.784231,7289,257065,991114
292755,309c0e909835757db908884e80f28f2fc76b803904b4b5...,2022-02-03 14:30:49.753075,7289,257065,991114
292785,309c0e909835757db908884e80f28f2fc76b803904b4b5...,2022-03-10 08:14:59.524077,7289,257065,991114
292807,309c0e909835757db908884e80f28f2fc76b803904b4b5...,2022-03-24 10:23:01.589790,7289,257065,991114
292801,309c0e909835757db908884e80f28f2fc76b803904b4b5...,2022-03-29 10:06:25.562505,7289,257065,991114
292805,309c0e909835757db908884e80f28f2fc76b803904b4b5...,2022-04-06 09:29:14.411359,7289,257065,991114
292797,309c0e909835757db908884e80f28f2fc76b803904b4b5...,2022-04-06 10:00:46.380238,7289,257065,991114


In [19]:
select_client_geo_df['geohash_4'].hist()

NameError: name 'select_client_geo_df' is not defined

In [None]:
select_client_geo_df['geohash_5'].hist()

In [None]:
select_client_geo_df['geohash_6'].hist()

In [34]:
select_client_geo_df['geohash_6'].value_counts()

geohash_6
159536     894
444951     438
2746097    164
1959174    115
1445334    113
          ... 
605872       1
2233635      1
1272303      1
2159809      1
625622       1
Name: count, Length: 251, dtype: int64

In [15]:
import Geohash

ModuleNotFoundError: No module named 'Geohash'

In [None]:
print 'Geohash for 42.6, -5.6:', Geohash.encode(42.6, -5.6)