In [None]:
import pandas as pd
import numpy as np
import os,sys,time
from contextlib import contextmanager
import psutil
import gc
import lightgbm

# psutil handle for the current Python process (looked up by its own PID).
# It is not referenced again in this section of the file.
process = psutil.Process(os.getpid())
@contextmanager
def timer(name):
    """
    Context manager that reports how long the enclosed block took.

    When the ``with`` body finishes normally, prints
    ``[<name>] done in <N> s`` where ``<N>`` is the elapsed wall-clock time
    rounded to whole seconds. Nothing is printed if the body raises.

    Taken from Konstantin Lopuhin https://www.kaggle.com/lopuhin
    in script named : Mercari Golf: 0.3875 CV in 75 LOC, 1900 s
    https://www.kaggle.com/lopuhin/mercari-golf-0-3875-cv-in-75-loc-1900-s
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    print(f'[{name}] done in {elapsed:.0f} s')

# ---- data locations ----
DataBaseDir = '../../data'
InputDir = '%s/version1/l0' % DataBaseDir    # level-0 directory the folds are read from
OutputDir = '%s/version1/l1' % DataBaseDir   # level-1 directory (not written to in this section)

# Compact dtype for every column, keyed by column name.
dtypes = {
    'ip': 'uint32',
    'app': 'uint16',
    'device': 'uint16',
    'os': 'uint16',
    'channel': 'uint16',
    'is_attributed': 'uint8',
    'day': 'uint8',
    'hour': 'uint8',
    'click_id': 'uint32',
}

# ---- feature lists ----
target = 'is_attributed'
raw_feats = ['ip', 'app', 'device', 'os', 'channel', 'day', 'hour']
# Every raw column is treated as categorical; kept as an independent list.
cate_feats = list(raw_feats)
# One 'count_<col>_<col>...' feature name per column combination.
column_combinations = [
    ['ip', 'hour'],
    ['ip', 'app'],
    ['ip', 'app', 'os'],
    ['ip', 'device'],
    ['app', 'channel'],
]
count_feats = ['count_%s' % '_'.join(combo) for combo in column_combinations]
# Model input columns: raw columns first, then the count features.
features = raw_feats + count_feats

# ---- hour splits ----
local_public_hours = [4, 5, 9, 10, 13, 14]
# Every hour of the day that is not a local-public hour.
local_private_hours = [hour for hour in range(24) if hour not in local_public_hours]
public_hours = [4, 5, 6, 9, 10, 11, 13, 14, 15]
# Hours listed per key 6..10.
# NOTE(review): keys presumably correspond to values of the 'day' column - confirm.
days = {
    6: list(range(14, 24)),
    7: list(range(24)),
    8: list(range(24)),
    9: list(range(17)),
    10: public_hours,
}

# ---- column layouts ----
train_columns = [target] + features            # label followed by the features
test_columns = ['click_id'] + features         # id followed by the features
local_test_columns = test_columns + [target]   # test layout plus the label

# LightGBM training parameters handed to lightgbm.train().
params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    learning_rate=0.01,
    num_leaves=31,
    max_depth=-1,             # -1 means no depth limit
    min_child_samples=20,     # minimum number of data points needed in a child (min_data_in_leaf)
    max_bin=255,              # number of bucketed bins for feature values
    subsample=0.7,            # subsample ratio of the training instances
    subsample_freq=0,         # subsampling frequency; <= 0 disables it, so 'subsample' is inert here
    colsample_bytree=0.9,     # subsample ratio of columns when constructing each tree
    min_child_weight=5,       # minimum sum of instance weight (hessian) needed in a child (leaf)
    subsample_for_bin=200000, # number of samples used for constructing bins
    min_split_gain=0,         # minimum gain required to make a split (min_gain_to_split)
    reg_alpha=0,              # L1 regularization term on weights
    reg_lambda=0,             # L2 regularization term on weights
    scale_pos_weight=99,      # up-weights the positive class
    nthread=4,
    verbose=0,
)
#     'learning_rate': 0.15,
#     'num_leaves': 7,  # 2^max_depth - 1
#     'max_depth': 3,  # -1 means no limit
#     'min_child_samples': 100,  # Minimum number of data points needed in a child (min_data_in_leaf)
#     'max_bin': 100,  # Number of bucketed bin for feature values
#     'subsample': 0.7,  # Subsample ratio of the training instance.
#     'subsample_freq': 1,  # frequency of subsampling; <= 0 disables it
#     'colsample_bytree': 0.9,  # Subsample ratio of columns when constructing each tree.
#     'min_child_weight': 0,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
#     'scale_pos_weight':99 # because training data is extremely unbalanced

## train/validation/test
def _read_fold_hdf(fold_dir, file_name):
    """Read one HDF file from fold_dir; the HDF key is the file name up to its first dot."""
    return pd.read_hdf(path_or_buf= '%s/%s' % (fold_dir, file_name), key= '%s' % file_name.split('.')[0])

kfold = 3
local_private_auc = .0  # running sum of the per-fold private validation AUC
local_public_auc = .0   # reserved for the public split (not updated in this section)
for fold in range(kfold):
    # NOTE(review): the last fold is deliberately skipped here; the reason is
    # not visible in this section - confirm against the fold layout.
    if(fold == (kfold - 1)): # train/validation
        continue
    with timer('Train/Valid for fold %s' % fold):
        # load train data set
        FoldInputDir = '%s/kfold/%s' % (InputDir, fold)
        # os.listdir() returns entries in arbitrary order; sort so that the row
        # order of the training frame is reproducible between runs.
        fold_files = sorted(os.listdir(FoldInputDir))
        train_files = [f for f in fold_files if(f.startswith('train_'))]
        if not train_files:
            raise FileNotFoundError('no train_* files found in %s' % FoldInputDir)
        # Stack the per-file frames row-wise (axis=0) in a single concat call.
        TrainData = pd.concat([_read_fold_hdf(FoldInputDir, f) for f in train_files], axis= 0, ignore_index= True)
        # load private test data set
        private_test_file = [f for f in fold_files if(f.endswith('_private.hdf'))][0]
        TestData = _read_fold_hdf(FoldInputDir, private_test_file)
        # type conversion
        TrainData[target] = TrainData[target].astype('uint8')
        for feat in raw_feats:
            TrainData[feat] = TrainData[feat].astype(dtypes[feat])
            TestData[feat] = TestData[feat].astype(dtypes[feat])
        # count features are log1p-compressed and stored as float32
        for feat in count_feats:
            TrainData[feat] = np.log1p(TrainData[feat].astype('uint32')).astype('float32')
            TestData[feat] = np.log1p(TestData[feat].astype('uint32')).astype('float32')
        xgtrain = lightgbm.Dataset(TrainData[features].values, label= TrainData[target].values,
                              feature_name= features,
                              categorical_feature= cate_feats
                              )
        # The validation set must use the same feature columns as the training set.
        xgvalid = lightgbm.Dataset(TestData[features].values, label= TestData[target].values,
                              feature_name= features,
                              categorical_feature= cate_feats
                              )
        # train
        # NOTE: evals_result / early_stopping_rounds / verbose_eval are the
        # keyword arguments of LightGBM releases before 4.0; newer releases
        # expect the equivalent callbacks instead.
        # No custom metric is supplied: params['metric'] ('auc') is evaluated.
        evals_results = {}
        model = lightgbm.train(params, 
                               xgtrain, 
                               valid_sets=[xgtrain, xgvalid], 
                               valid_names=['train','valid'],
                               evals_result= evals_results, 
                               num_boost_round= 500,
                               early_stopping_rounds= 100,
                               verbose_eval=10)
        valid_auc = evals_results['valid']['auc']
        n_estimators = model.best_iteration
        # best_iteration may be unset/non-positive when early stopping never
        # triggered (version-dependent); fall back to the last recorded round
        # so the index below cannot silently pick the wrong entry.
        if not n_estimators or n_estimators <= 0:
            n_estimators = len(valid_auc)
        private_auc = valid_auc[n_estimators-1]
        local_private_auc += private_auc
        print('epochs %s, auc %.6f' % (n_estimators, private_auc))
        # TODO, 2. prediction on train/valid data set
        # TODO, 3. saving predictions on train data set
        # free the large frames before the next fold is loaded
        del TrainData, TestData
        gc.collect()
#         # load public test data set
#         public_test_file = [f for f in os.listdir(FoldInputDir) if(f.endswith('_public.hdf'))][0]
#         TestData = pd.read_hdf(path_or_buf= '%s/%s' % (FoldInputDir, public_test_file), key= '%s' % public_test_file.split('.')[0])
#         # type conversion
#         for feat in raw_feats:
#             TestData[feat] = TestData[feat].astype(dtypes[feat])
#         for feat in count_feats:
#             TestData[feat] = np.log1p(TestData[feat].astype('uint32')).astype('float32')
#         model.predict(TestData[features])