In [1]:
import pandas as pd
import numpy as np
import gc, os, re

In [2]:
from datetime import datetime

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [16]:
def icecream(text):
    """Print ``text`` (converted with ``str``) prefixed by the current local timestamp.

    Output format is ``<datetime.now()>> <text>``; used as a lightweight progress log.
    """
    stamp = datetime.now()
    message = str(text)
    print(f'{stamp}> {message}')

In [5]:
def reduce_mem_usage(df, verbose=True):
    """Downcast every numeric column of ``df`` to the narrowest dtype that holds its range.

    Integer columns are cast to the first of int8/int16/int32/int64 whose limits
    contain the column's min and max (limits inclusive). Float columns are cast to
    float16 or float32 when their range fits, otherwise to float64. Columns whose
    dtype is not one of the listed signed-int/float dtypes are left untouched.

    NOTE: the float downcast only checks the value *range*; casting to
    float16/float32 reduces precision of the stored values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Columns are reassigned on this object (modified in place).
    verbose : bool, default True
        When true, print the resulting memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    numerics = ('int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64')
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = str(df[col].dtypes)
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if col_type[:3] == 'int':
            # Inclusive bounds: a value equal to the dtype's limit still fits.
            # If min/max are NaN (empty column) no candidate matches and the
            # column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Fall back to float64 when the range fits neither narrower type
            # (this also covers all-NaN columns, where the comparisons are False).
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero-byte starting frame to avoid ZeroDivisionError.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [17]:
def reduce_column(df):
    """Downcast a single numeric column to the narrowest dtype that holds its range.

    Despite the parameter name, ``df`` is a one-dimensional pandas Series (the
    function reads a single dtype from ``df.dtypes``); it is written to be used
    with ``Series.pipe``.

    Integer data is cast to the first of int8/int16/int32/int64 whose limits
    contain the min and max (limits inclusive). Float data is cast to float16 or
    float32 when its range fits, otherwise to float64; note this checks range
    only, so precision may be reduced. Any other dtype is returned unchanged.

    Parameters
    ----------
    df : pandas.Series
        Column to shrink.

    Returns
    -------
    pandas.Series
        A downcast copy, or the input object itself when no cast applies.
    """
    numerics = ('int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64')
    col_type = str(df.dtypes)
    if col_type not in numerics:
        return df

    c_min = df.min()
    c_max = df.max()
    if col_type[:3] == 'int':
        # Inclusive bounds: a value equal to the dtype's limit still fits.
        for candidate in (np.int8, np.int16, np.int32, np.int64):
            limits = np.iinfo(candidate)
            if limits.min <= c_min and c_max <= limits.max:
                return df.astype(candidate)
        # min/max were NaN (empty column): nothing fits, keep the dtype.
        return df

    for candidate in (np.float16, np.float32):
        limits = np.finfo(candidate)
        if limits.min <= c_min and c_max <= limits.max:
            return df.astype(candidate)
    # Range too wide for float32, or all values NaN.
    return df.astype(np.float64)

In [7]:
# clickstream.zip - behavioral data sample from digital channels

# timestamp - date and time of the event
# application_id - application identifier
# client - client identifier
# session_id - session identifier
# event_type - event type
# event_category - event category
# event_name - event name
# event_label - additional event attribute
# device_screen_name - name of the screen on which the event occurred
# timezone - time zone
# device_is_webview - flag indicating the page was opened inside a webview
# page_urlhost - page domain
# page_urlpath_full - page path
# net_connection_type - connection type
# net_connection_tech - connection technology

# prediction_session_timestamp.csv - sample to build the prediction for
# abattle_train_target.csv - training sample

# client - client identifier
# session_id - session identifier
# timestamp - session start time
# target - target action within the session, a multi-class variable

# abattle_sample_prediction.csv - sample submission.

In [19]:
# Make sure the working folders exist: one for the converted clickstream
# parts, one for the per-column encoding tables.
for folder in ('./abattle_clickstream', './encode'):
    os.makedirs(folder, exist_ok=True)

In [20]:
### Split the timezone field into continent and town

In [15]:
%%time
# Convert every raw parquet part into a pickle that additionally carries the
# first two '/'-separated components of the lower-cased timezone string.
for src_dir, _, part_names in os.walk('./alfabattle2_abattle_clickstream'):
    for part_name in sorted(part_names):
        icecream(part_name)
        df = pd.read_parquet(os.path.join(src_dir, part_name))
        # str() also turns missing values into text, so every row yields a string.
        tz_lower = df['timezone'].apply(lambda tz: str(tz).lower())
        df['zone_continent'] = tz_lower.apply(lambda tz: tz.split('/')[0])
        # The appended '/' guarantees a second component ('' when there is none).
        df['zone_town'] = tz_lower.apply(lambda tz: (tz + '/').split('/')[1])
        target_path = os.path.join('./abattle_clickstream', part_name + '.pkl')
        df.to_pickle(target_path)
        # Free the frame before loading the next part.
        del df
        gc.collect()

2021-01-23 20:27:00.309096> part-00000.parquet
2021-01-23 20:28:20.794470> part-00001.parquet
2021-01-23 20:29:38.508911> part-00002.parquet
2021-01-23 20:30:53.877683> part-00003.parquet
2021-01-23 20:32:12.429172> part-00004.parquet
2021-01-23 20:33:30.122682> part-00005.parquet
2021-01-23 20:34:46.522572> part-00006.parquet
2021-01-23 20:36:05.789445> part-00007.parquet
2021-01-23 20:37:24.946721> part-00008.parquet
2021-01-23 20:38:40.875639> part-00009.parquet
CPU times: user 13min 7s, sys: 2min 11s, total: 15min 18s
Wall time: 12min 57s


In [None]:
# Encoding that takes the sort order into account. As a result of this step the dataset files
# are rewritten with the fields encoded. For decoding and/or analysis, the encode folder will contain
# files with the string-to-code mapping for each field (14 files)

In [18]:
%%time
# Columns to replace with integer codes. For each column the loop below
#   1) counts every distinct value across all pickled parts,
#   2) assigns codes 1..N in ascending order of frequency (rarest value -> 1),
#   3) saves the value/count/code table to ./encode/<col>_encode.pkl,
#   4) rewrites every part with <col> replaced by <col>_encode
#      (values without a code, e.g. missing ones, become -1).
# NOTE: the parts are overwritten in place and the source column is dropped, so
# this cell cannot be re-run on its own output without regenerating the parts.
cols = ['application_id', 'event_type', 'event_category', 'event_name',
        'event_label', 'device_screen_name', 'timezone', 'device_is_webview',
        'page_urlhost', 'page_urlpath_full', 'net_connection_type',
        'net_connection_tech', 'zone_continent', 'zone_town']
for col in cols:
    icecream(col)
    col_encode = col + '_encode'

    # Pass 1: per-file value counts. Summing these small tables gives the same
    # totals as counting over all rows at once, without holding every row in
    # memory and without DataFrame.append (removed in pandas 2.0).
    partial_counts = []
    for dirname, _, filenames in os.walk('./abattle_clickstream'):
        for filename in sorted(filenames):
            values = pd.read_pickle(os.path.join(dirname, filename))[col]
            partial_counts.append(values.value_counts())
            del values
            gc.collect()

    if partial_counts:
        totals = pd.concat(partial_counts).groupby(level=0, sort=False).sum()
    else:
        totals = pd.Series(dtype='int64')

    # Build the table explicitly so column names do not depend on how the
    # installed pandas version names the output of value_counts().
    encode = pd.DataFrame({col: totals.index, 'count': totals.to_numpy()})
    # Ascending frequency; a stable sort keeps ties in first-seen order.
    encode = encode.sort_values('count', kind='stable').reset_index(drop=True)
    # Values are unique here, so consecutive integers starting at 1 are the codes.
    encode[col_encode] = np.arange(1, len(encode) + 1)
    encode = reduce_mem_usage(encode)
    encode.to_pickle(f'./encode/{col_encode}.pkl')

    # Pass 2: swap the original column for its code in every part.
    for dirname, _, filenames in os.walk('./abattle_clickstream'):
        for filename in sorted(filenames):
            part_path = os.path.join(dirname, filename)
            df = pd.read_pickle(part_path)
            df = df.merge(encode[[col, col_encode]], on=col, how='left')
            # Rows whose value has no entry in the table get the sentinel -1.
            df[col_encode] = df[col_encode].fillna(-1)
            del df[col]
            df[col_encode] = df[col_encode].astype('int').pipe(reduce_column)
            df.to_pickle(part_path)
            del df
            gc.collect()

2021-01-23 20:42:00.531505> application_id
Mem. usage decreased to  0.00 Mb (37.2% reduction)
2021-01-23 20:52:47.772727> event_type
Mem. usage decreased to  0.00 Mb (22.2% reduction)
2021-01-23 21:03:28.560809> event_category
Mem. usage decreased to  0.01 Mb (41.1% reduction)
2021-01-23 21:13:48.099531> event_name
Mem. usage decreased to  1.27 Mb (33.3% reduction)
2021-01-23 21:23:57.809298> event_label
Mem. usage decreased to  0.56 Mb (33.3% reduction)
2021-01-23 21:33:59.212915> device_screen_name
Mem. usage decreased to  0.01 Mb (41.3% reduction)
2021-01-23 21:43:42.705174> timezone
Mem. usage decreased to  0.00 Mb (40.9% reduction)
2021-01-23 21:53:04.629486> page_urlhost
Mem. usage decreased to  0.00 Mb (32.5% reduction)
2021-01-23 22:02:14.851101> page_urlpath_full
Mem. usage decreased to  2.01 Mb (33.3% reduction)
2021-01-23 22:11:09.865121> net_connection_type
Mem. usage decreased to  0.00 Mb (16.5% reduction)
2021-01-23 22:20:01.155479> net_connection_tech
Mem. usage decrease

In [25]:
# Show what ended up in the two output folders (converted parts, encoding tables).
tuple(os.listdir(folder) for folder in ('./abattle_clickstream', './encode'))

(['part-00000.parquet.pkl',
  'part-00008.parquet.pkl',
  'part-00009.parquet.pkl',
  'part-00004.parquet.pkl',
  'part-00006.parquet.pkl',
  'part-00003.parquet.pkl',
  'part-00005.parquet.pkl',
  'part-00007.parquet.pkl',
  'part-00002.parquet.pkl',
  'part-00001.parquet.pkl'],
 ['event_label_encode.pkl',
  'event_type_encode.pkl',
  'zone_town_encode.pkl',
  'application_id_encode.pkl',
  'zone_continent_encode.pkl',
  'device_screen_name_encode.pkl',
  'timezone_encode.pkl',
  'client_encode.pkl',
  'net_connection_type_encode.pkl',
  'page_urlhost_encode.pkl',
  'net_connection_tech_encode.pkl',
  'page_urlpath_full_encode.pkl',
  'device_is_webview_encode.pkl',
  'event_name_encode.pkl',
  'event_category_encode.pkl'])