In [3]:
import pandas as pd
import numpy as np
import os,sys,gc,time
from datetime import datetime
from contextlib import contextmanager

# --- Filesystem layout -------------------------------------------------------
# Everything is resolved relative to a single data root.
DataBaseDir = '../../data'
InputDir = f'{DataBaseDir}/raw'
OutputDir = f'{DataBaseDir}/raw/version1'

# --- CSV schema --------------------------------------------------------------
# Explicit narrow integer dtypes handed to pd.read_csv for the id-like columns.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
)
# Columns parsed as datetimes when reading the CSV.
time_columns = ['click_time']
# Feature columns, in the order they are requested from the CSV.
feat_columns = ['ip', 'app', 'device', 'os', 'channel', 'click_time']
# Name of the label column.
target = 'is_attributed'

# --- Export grid -------------------------------------------------------------
# Maps a day-of-month value to the list of hours for which a file is written.
days = {
    6: [*range(14, 24)],
    7: [*range(24)],
    8: [*range(24)],
    9: [*range(17)],
}
@contextmanager
def timer(name):
    """
    Context manager that reports the wall-clock duration of the wrapped block.

    When the block finishes normally, prints ``[<name>] done in <N> s`` where
    ``<N>`` is the elapsed time formatted with no decimal places. If the block
    raises, the exception propagates and nothing is printed.

    Taken from Konstantin Lopuhin https://www.kaggle.com/lopuhin
    in script named : Mercari Golf: 0.3875 CV in 75 LOC, 1900 s
    https://www.kaggle.com/lopuhin/mercari-golf-0-3875-cv-in-75-loc-1900-s
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    print(f'[{name}] done in {elapsed:.0f} s')

# Columns requested from the training CSV: the label followed by the features.
train_columns = [target]
train_columns.extend(feat_columns)
test_columns = feat_columns.copy()

# Create every per-day output directory once, up front, instead of re-checking
# on every chunk.
for d in days:
    os.makedirs('%s/%s' % (OutputDir, d), exist_ok=True)

# Sampled rows are collected per (day, hour) across ALL chunks and written once
# at the end. Writing inside the chunk loop with mode='w' would make every
# chunk overwrite the file produced by the previous one, leaving only the last
# chunk's rows (often none) on disk.
# Trade-off: the 10% sample is held in memory until the final write.
sampled_parts = {(d, h): [] for d, hours in days.items() for h in hours}
# Zero-row frame with the final columns/dtypes, used for (day, hour) slots that
# received no rows so that a file is still written for every configured slot.
empty_template = None

chunk_num = 0
for chunk_df in pd.read_csv('%s/train_sample.csv' % InputDir,
                            dtype=dtypes,
                            usecols=train_columns,
                            parse_dates=time_columns,
                            iterator=True,
                            chunksize=1000):
    with timer('Sampling for chunk %s' % chunk_num):
        # Keep a random 10% of the chunk. No random_state is set, so the
        # selection differs between runs.
        chunk_df = chunk_df.sample(frac=0.1)
        chunk_df['hour'] = chunk_df['click_time'].dt.hour.astype(np.uint8)
        chunk_df['day'] = chunk_df['click_time'].dt.day.astype(np.uint8)
        chunk_df.drop('click_time', axis=1, inplace=True)
        if empty_template is None:
            empty_template = chunk_df.iloc[0:0]
        # One pass over the chunk instead of one boolean mask per (day, hour).
        for (d, h), part in chunk_df.groupby(['day', 'hour'], sort=False):
            slot = (int(d), int(h))
            # Rows outside the configured day/hour grid are dropped, exactly
            # as the per-slot filtering did before.
            if slot in sampled_parts:
                sampled_parts[slot].append(part)
        chunk_num += 1

with timer('Writing hourly files'):
    for (d, h), parts in sampled_parts.items():
        if parts:
            hour_df = pd.concat(parts)
        elif empty_template is not None:
            hour_df = empty_template
        else:
            # The CSV yielded no chunks at all: nothing to write.
            continue
        # NOTE(review): pandas documents that complevel=None/0 disables
        # compression, so complib alone probably leaves these files
        # uncompressed - confirm and add complevel if compression is wanted.
        hour_df.to_hdf(path_or_buf='%s/%s/%s.hdf' % (OutputDir, d, h),
                       key='%s_%s' % (d, h),
                       mode='w',
                       complib='blosc')





chunk 0 done.
chunk 10 done.
chunk 20 done.
chunk 30 done.
chunk 40 done.
chunk 50 done.
chunk 60 done.
chunk 70 done.
chunk 80 done.
chunk 90 done.
[Sampling for train] done in 75 s
