In [1]:
##
# Preprocessing for the raw data sets: drop the columns that are not needed and
# split each data set into pieces by day/hour.
##

import pandas as pd
import numpy as np
import os,sys,gc,time
from datetime import datetime
from contextlib import contextmanager

# Directory layout: the raw CSV files are read from, and the per-hour pieces
# are written under, the same "raw" folder.
DataBaseDir = '../../data'
InputDir = f'{DataBaseDir}/raw'
OutputDir = f'{DataBaseDir}/raw'

# Explicit column dtypes handed to pandas.read_csv.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
    click_id='uint32',
)

# Columns shared by the train and test files, the subset of them that is
# parsed as datetimes, and the name of the label column.
feat_columns = ['ip', 'app', 'device', 'os', 'channel', 'click_time']
time_columns = ['click_time']
target = 'is_attributed'

# Day-of-month -> hours of that day to extract from the training data.
days = {
    day: list(range(first_hour, end_hour))
    for day, first_hour, end_hour in (
        (6, 14, 24),
        (7, 0, 24),
        (8, 0, 24),
        (9, 0, 17),
    )
}

# Hour groups. Only public_hours is used by the loops in this cell; the two
# local_* lists are presumably consumed by later validation code -- confirm.
local_public_hours = [4, 5, 9, 10, 13, 14]
local_private_hours = [hour for hour in range(24) if hour not in local_public_hours]
public_hours = [4, 5, 6, 9, 10, 11, 13, 14, 15]
@contextmanager
def timer(name):
    """
    Print the wall-clock time spent inside the ``with`` block.

    ``name`` is the label shown between square brackets in the printed
    message. The message is printed only when the block finishes without
    raising, and the duration is rounded to whole seconds.

    Taken from Konstantin Lopuhin https://www.kaggle.com/lopuhin
    in script named : Mercari Golf: 0.3875 CV in 75 LOC, 1900 s
    https://www.kaggle.com/lopuhin/mercari-golf-0-3875-cv-in-75-loc-1900-s
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    print(f'[{name}] done in {elapsed:.0f} s')

# Columns loaded from train.csv: the label first, then the shared features.
train_columns = [target] + feat_columns
# Columns loaded from test.csv: click_id first, then the shared features.
test_columns = ['click_id'] + feat_columns
## for train
# Stream train.csv in chunks of 10,000,000 rows and, for every (day, hour)
# listed in `days`, write the matching rows of the chunk to
#   <OutputDir>/<day>/<hour>/<chunk_num>.hdf
chunk_num = 0
for chunk_df in pd.read_csv('%s/train.csv' % InputDir,
                            dtype=dtypes,
                            usecols=train_columns,
                            parse_dates=time_columns,
                            iterator=True,
                            chunksize=10000000):
    with timer('preprocessing for chunk %s' % chunk_num):
        # To experiment on a subsample: chunk_df = chunk_df.sample(frac=0.2)

        # Keep only the hour and day-of-month of the timestamp, stored compactly.
        chunk_df['hour'] = chunk_df['click_time'].dt.hour.astype(np.uint8)
        chunk_df['day'] = chunk_df['click_time'].dt.day.astype(np.uint8)
        chunk_df.drop('click_time', axis=1, inplace=True)

        for d, hours in days.items():
            # Select the day once so each hour mask only scans that day's rows.
            day_df = chunk_df[chunk_df['day'] == d]
            for h in hours:
                HourOutputDir = '%s/%s/%s' % (OutputDir, d, h)
                os.makedirs(HourOutputDir, exist_ok=True)
                # A file is written for every (day, hour) even when this chunk
                # has no matching rows, so each hour directory ends up with one
                # file per chunk.
                # complevel must be given explicitly: with the default
                # (None) pandas disables compression and complib is ignored.
                day_df[day_df['hour'] == h].to_hdf(
                    path_or_buf='%s/%s.hdf' % (HourOutputDir, chunk_num),
                    key='%s' % (chunk_num),
                    mode='w',
                    complevel=9,
                    complib='blosc')
        chunk_num += 1
## for test
# Stream test.csv in chunks of 10,000,000 rows and, for every hour in
# `public_hours` of day 10, write the matching rows of the chunk to
#   <OutputDir>/10/<hour>/<chunk_num>.hdf
chunk_num = 0
d = 10  # the only day-of-month extracted from the test file
for chunk_df in pd.read_csv('%s/test.csv' % InputDir,
                            dtype=dtypes,
                            usecols=test_columns,
                            parse_dates=time_columns,
                            iterator=True,
                            chunksize=10000000):
    with timer('preprocessing for chunk %s' % chunk_num):
        # Keep only the hour and day-of-month of the timestamp, stored compactly.
        chunk_df['hour'] = chunk_df['click_time'].dt.hour.astype(np.uint8)
        chunk_df['day'] = chunk_df['click_time'].dt.day.astype(np.uint8)
        chunk_df.drop('click_time', axis=1, inplace=True)

        # Select the day once so each hour mask only scans that day's rows.
        day_df = chunk_df[chunk_df['day'] == d]
        for h in public_hours:
            HourOutputDir = '%s/%s/%s' % (OutputDir, d, h)
            os.makedirs(HourOutputDir, exist_ok=True)
            # A file is written for every hour even when this chunk has no
            # matching rows, so each hour directory ends up with one file per
            # chunk.
            # complevel must be given explicitly: with the default (None)
            # pandas disables compression and complib is ignored.
            day_df[day_df['hour'] == h].to_hdf(
                path_or_buf='%s/%s.hdf' % (HourOutputDir, chunk_num),
                key='%s' % (chunk_num),
                mode='w',
                complevel=9,
                complib='blosc')
        chunk_num += 1



[preprocessing for chunk 0] done in 3 s




[preprocessing for chunk 1] done in 3 s




[preprocessing for chunk 2] done in 3 s




[preprocessing for chunk 3] done in 3 s




[preprocessing for chunk 4] done in 3 s




[preprocessing for chunk 5] done in 3 s




[preprocessing for chunk 6] done in 3 s




[preprocessing for chunk 7] done in 3 s




[preprocessing for chunk 8] done in 3 s




[preprocessing for chunk 9] done in 3 s




[preprocessing for chunk 10] done in 3 s




[preprocessing for chunk 11] done in 3 s




[preprocessing for chunk 12] done in 3 s




[preprocessing for chunk 13] done in 3 s




[preprocessing for chunk 14] done in 3 s




[preprocessing for chunk 15] done in 3 s




[preprocessing for chunk 16] done in 3 s




[preprocessing for chunk 17] done in 3 s




[preprocessing for chunk 18] done in 2 s
[preprocessing for chunk 0] done in 2 s
[preprocessing for chunk 1] done in 2 s
