In [2]:
##
# Feature-engineering process that runs on top of the preprocessed raw data sets.
# 1. Add count features; the train data set is sampled at a rate of 0.1. Updated on 2018/4/2
##

import pandas as pd
import numpy as np
import os,sys,gc,time
from datetime import datetime
from contextlib import contextmanager
import psutil

# Directory layout: per-hour chunk files are read from below InputDir and the
# engineered data sets are written below OutputDir.
DataBaseDir = '../../data'
InputDir = f'{DataBaseDir}/raw'
OutputDir = f'{DataBaseDir}/feat/version1'

# Storage dtype that every known column is cast to after loading.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
    day='uint8',
    hour='uint8',
    click_id='uint32',
)

# Columns shared by the train and test frames, and the label column.
feat_columns = ['ip', 'app', 'device', 'os', 'channel', 'day', 'hour']
target = 'is_attributed'

# Hours that select the "public" part of a held-out day when the folds are
# saved; every other hour of the day goes to the "private" part.
local_public_hours = [4, 5, 9, 10, 13, 14]
local_private_hours = [h for h in range(24) if h not in local_public_hours]

# Hours read for day 10.
public_hours = [4, 5, 6, 9, 10, 11, 13, 14, 15]

# day -> hours whose chunk directories are read for that day.
days = {
    6: list(range(14, 24)),
    7: list(range(24)),
    8: list(range(24)),
    9: list(range(17)),
    10: public_hours,
}

# Handle on the current process, used to report resident memory.
process = psutil.Process(os.getpid())

@contextmanager
def timer(name):
    """
    Report the wall-clock duration of the enclosed ``with`` body.

    Adapted from Konstantin Lopuhin (https://www.kaggle.com/lopuhin), script
    "Mercari Golf: 0.3875 CV in 75 LOC, 1900 s":
    https://www.kaggle.com/lopuhin/mercari-golf-0-3875-cv-in-75-loc-1900-s

    :param name: label printed between square brackets in the report line.

    The report is printed only after the body finishes normally; if the body
    raises, the exception propagates and nothing is printed.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    print(f'[{name}] done in {elapsed:.0f} s')

train_columns = [target] + feat_columns
test_columns = ['click_id'] + feat_columns

# Fraction of every training chunk that is kept, and the seed that makes the
# draw repeatable after a kernel restart.
TRAIN_SAMPLE_FRAC = 0.1
SAMPLE_SEED = 2018

# Days below 10 are loaded with the train columns (and sampled); day 10 is
# loaded with the test columns and kept in full.
columns_by_day = {d: (train_columns if d < 10 else test_columns) for d in (6, 7, 8, 9, 10)}


def _chunk_number(file_name):
    """Return the integer that precedes the first '.' in a chunk file name."""
    return int(file_name.split('.')[0])


def _load_day(day, hours, columns, sample_frac=None, random_state=None):
    """
    Read all chunk files of one day and return them as a single DataFrame.

    :param day: day key; used as a directory name below InputDir.
    :param hours: hours to read; each one is a sub-directory of the day.
    :param columns: columns (and their order) of the returned frame.
    :param sample_frac: if not None, fraction of rows kept from every chunk.
    :param random_state: forwarded to DataFrame.sample when sampling.
    :return: concatenation of all non-empty chunks restricted to `columns`,
        or an empty frame with those columns when nothing was read.
    :raises KeyError: if the loaded chunks lack one of `columns`.
    """
    frames = []
    for h in hours:
        hour_dir = '%s/%s/%s' % (InputDir, day, h)
        # os.listdir() returns entries in arbitrary order; sort by chunk number
        # so that row order (and therefore the seeded sample) is stable.
        for f in sorted(os.listdir(hour_dir), key=_chunk_number):
            chunk_df = pd.read_hdf(path_or_buf='%s/%s' % (hour_dir, f), key='%s' % _chunk_number(f))
            if len(chunk_df) == 0:
                continue
            if sample_frac is not None:
                chunk_df = chunk_df.sample(frac=sample_frac, random_state=random_state)
            frames.append(chunk_df)
    if not frames:
        return pd.DataFrame(columns=columns)
    # One concat over the collected chunks instead of re-copying a growing
    # frame once per chunk.
    return pd.concat(frames, axis=0, ignore_index=True)[list(columns)]


## load data
DataSet = {}
with timer('Load data'):
    # A single generator shared by all chunks: reproducible as a whole, but
    # different chunks do not all draw the same row positions.
    sampler = np.random.RandomState(SAMPLE_SEED)
    for d, day_columns in columns_by_day.items():
        DataSet[d] = _load_day(
            d,
            days[d],
            day_columns,
            sample_frac=TRAIN_SAMPLE_FRAC if d < 10 else None,  # sample for train only
            random_state=sampler,
        )
        print('Day %s done, memory usage %sM '% (d, (int(process.memory_info().rss/1e6))))
# type conversion
# Done column by column so that at most one extra column is alive at a time;
# columns that already have the requested dtype are left untouched.
for d in DataSet.keys():
    for col in DataSet[d].columns:
        if DataSet[d][col].dtype != dtypes[col]:
            DataSet[d][col] = DataSet[d][col].astype(dtypes[col])

Day 6 done, memory usage 295M 
Day 7 done, memory usage 933M 
Day 8 done, memory usage 1657M 
Day 9 done, memory usage 2282M 
Day 10 done, memory usage 4171M 
[Load data] done in 66 s


In [5]:
# Rule for day 6, which has no earlier loaded day to compare against: rows
# whose ip is NOT above this value get new_ip = 1.
# NOTE(review): the origin of this threshold is not visible here — confirm.
FIRST_DAY_IP_THRESHOLD = 126420


def _group_counts(frame, cols):
    """
    For every row of `frame`, count the rows sharing its values in `cols`.

    :param frame: DataFrame whose `cols` hold non-negative integers.
    :param cols: list of column names that form the group key.
    :return: 1-D integer array aligned with the rows of `frame`.
    """
    if len(frame) == 0:
        # max() of an empty array is undefined; there is nothing to count.
        return np.zeros(0, dtype=np.int64)
    # Widen to int64 first: `max + 1` evaluated in a narrow unsigned dtype
    # would wrap around to 0 when a column reaches its dtype maximum.
    keys = frame[cols].values.astype(np.int64)
    # Collapse each key tuple into one integer so np.unique can group on it.
    flat_keys = np.ravel_multi_index(keys.T, keys.max(0) + 1)
    _, inverse, counts = np.unique(flat_keys, return_inverse=True, return_counts=True)
    return counts[inverse]


def _save_frame(frame, fold_dir, key):
    """Write `frame` to '<fold_dir>/<key>.hdf' under the HDF key `key`."""
    # NOTE(review): complib is given without complevel; on recent pandas this
    # may leave compression disabled — confirm against the to_hdf docs.
    frame.to_hdf(path_or_buf= '%s/%s.hdf' % (fold_dir, key), key= key, mode='w', complib='blosc')


## add new ip feature
with timer('Add new ip feature'):
    # IPs seen on each individual day ...
    uni_ip = {d: set(DataSet[d]['ip'].unique().astype(np.uint32)) for d in DataSet.keys()}
    # ... turned into "seen on this day or any earlier loaded day" for 7..9.
    for d in range(7, 10):
        uni_ip[d] = uni_ip[d].union(uni_ip[d - 1])
    for d in DataSet.keys():
        if d == 6:
            is_new = ~(DataSet[d]['ip'] > FIRST_DAY_IP_THRESHOLD)
        else:
            # new = not present in any earlier loaded day
            is_new = ~DataSet[d]['ip'].isin(uni_ip[d - 1])
        DataSet[d]['new_ip'] = is_new.astype(np.uint8)
    #checking
    for d in DataSet.keys():
        print('day %s, new ip rate %.4f' % (d, (DataSet[d]['new_ip'].sum())/len(DataSet[d])))
## add count features
# Counts are computed inside each day's frame separately.
with timer('Add count features'):
    column_combinations = [['ip', 'hour'], ['ip', 'app'], ['ip', 'app', 'os'], ['ip', 'device'], ['app', 'channel']]
    for d in range(6, 11):
        for cc in column_combinations:
            DataSet[d]['count_%s' % '_'.join(cc)] = _group_counts(DataSet[d], cc)
        print('Day %s done, memory usage %sM '% (d, (int(process.memory_info().rss/1e6))))
## split dataset into three folds
# folds 0 and 1: full earlier days for training, the following day split by
# hour into a public and a private hold-out file
for fold, (train_days, holdout_day) in enumerate([([6, 7], 8), ([6, 7, 8], 9)]):
    with timer('Saving for fold %s' % fold):
        FoldOutput = '%s/kfold/%s' % (OutputDir, fold)
        os.makedirs(FoldOutput, exist_ok=True)
        for d in train_days:
            _save_frame(DataSet[d], FoldOutput, 'train_%s' % d)
        holdout = DataSet[holdout_day]
        _save_frame(holdout[holdout['hour'].isin(local_public_hours)], FoldOutput, 'test_%s_public' % holdout_day)
        _save_frame(holdout[holdout['hour'].isin(local_private_hours)], FoldOutput, 'test_%s_private' % holdout_day)
# fold 2: days 6..9 for training, all of day 10 as the test file
with timer('Saving for fold %s' % 2):
    FoldOutput = '%s/kfold/%s' % (OutputDir, 2)
    os.makedirs(FoldOutput, exist_ok=True)
    for d in [6, 7, 8, 9]:
        _save_frame(DataSet[d], FoldOutput, 'train_%s' % d)
    _save_frame(DataSet[10], FoldOutput, 'test_10_public')

6 0.778740451004
7 0.0511838012708
8 0.0345817447787
9 0.0300358828842
10 0.0814636398911


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
