# In [None]:
import pandas as pd
import numpy as np
import os,sys,time
from contextlib import contextmanager
import psutil
import gc
import lightgbm

# Handle on the current Python process; its resident set size (RSS) is printed
# at several points of the training loop below to track memory usage.
process = psutil.Process(os.getpid())
@contextmanager
def timer(name):
    """
    Context manager that prints how long the wrapped block took.

    On normal exit of the ``with`` body it prints ``[name] done in N s``
    (whole seconds of wall-clock time) followed by an empty line. Nothing is
    printed if the body raises.

    Taken from Konstantin Lopuhin https://www.kaggle.com/lopuhin
    in script named : Mercari Golf: 0.3875 CV in 75 LOC, 1900 s
    https://www.kaggle.com/lopuhin/mercari-golf-0-3875-cv-in-75-loc-1900-s
    """
    started = time.time()
    yield
    elapsed = time.time() - started
    print(f'[{name}] done in {elapsed:.0f} s\n')
def p_recall(truth, pred_proba):
    """
    Ratio of predicted positives to actual positives.

    A prediction counts as positive when its score is strictly greater than
    0.5. The returned value is ``(# predicted positives) / sum(truth)``.
    Note that predicted positives are NOT matched against ``truth``, so this
    is not the classical recall TP / (TP + FN) and it can exceed 1.

    truth      : array-like of labels; its sum is the denominator
    pred_proba : array-like of predicted scores, same length as ``truth``

    Returns a numpy float (nan/inf when ``truth`` sums to zero).
    """
    assert(len(truth) == len(pred_proba))
    # Vectorised threshold instead of a Python-level loop over every row.
    predicted_positive = np.asarray(pred_proba) > 0.5
    return np.sum(predicted_positive)/np.sum(truth)

# ---- run configuration -----------------------------------------------------
version_number = 0
strategy = 'meta_lgb'  # name of the prediction column / output file suffix
DataBaseDir = '../../data'
InputDir = f'{DataBaseDir}/version{version_number}/l0'
OutputDir = f'{DataBaseDir}/version{version_number}/l1'

# Storage dtype applied to each raw column after loading.
dtypes = {
    'ip': 'uint32',
    'app': 'uint16',
    'device': 'uint16',
    'os': 'uint16',
    'channel': 'uint16',
    'is_attributed': 'uint8',
    'day': 'uint8',
    'hour': 'uint8',
    'click_id': 'uint32',
}

# ---- feature lists ---------------------------------------------------------
target = 'is_attributed'
raw_feats = ['ip', 'app', 'device', 'os', 'channel', 'day', 'hour']
cate_feats = list(raw_feats)  # every raw column is handed to LightGBM as categorical
column_combinations = [
    ['ip', 'hour'],
    ['ip', 'app'],
    ['ip', 'app', 'os'],
    ['ip', 'device'],
    ['app', 'channel'],
]
# One count feature per column combination, e.g. 'count_ip_hour'.
count_feats = ['count_' + '_'.join(cols) for cols in column_combinations]
features = raw_feats + count_feats

# ---- hour / day bookkeeping ------------------------------------------------
local_public_hours = [4, 5, 9, 10, 13, 14]
local_private_hours = [h for h in range(24) if h not in local_public_hours]
public_hours = [4, 5, 6, 9, 10, 11, 13, 14, 15]
# Hours listed per day number.
days = {
    6: list(range(14, 24)),
    7: list(range(0, 24)),
    8: list(range(0, 24)),
    9: list(range(0, 17)),
    10: public_hours,
}

# Column layouts: labelled frames start with the target, the submission
# frame starts with click_id.
train_columns = [target] + features
test_columns = ['click_id'] + features

# ---- LightGBM parameters ---------------------------------------------------
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.04,
    'num_leaves': 31,
    'max_depth': -1,
    'min_child_samples': 20,  # min_data_in_leaf: minimum rows required in a leaf
    'max_bin': 255,
    # NOTE(review): per the LightGBM parameter docs, row subsampling is only
    # active when subsample_freq > 0, so 0.7 presumably has no effect here --
    # confirm whether bagging was meant to be enabled.
    'subsample': 0.7,
    'subsample_freq': 0,
    'colsample_bytree': 0.9,
    'min_child_weight': 5,  # minimum sum of hessians required in a leaf
    'subsample_for_bin': 200000,  # rows sampled to construct the histogram bins
    'min_split_gain': 0,  # min_gain_to_split; regularises together with the two terms below
    'reg_alpha': 0,  # L1 regularisation on leaf weights
    'reg_lambda': 0,  # L2 regularisation on leaf weights
    'scale_pos_weight': 99,
    'nthread': 4,
    'verbose': 0,
}
# Alternative parameter values, kept for reference only (not used):
#     'learning_rate': 0.15,
#     'num_leaves': 7,  # 2^max_depth - 1
#     'max_depth': 3,  # -1 means no limit
#     'min_child_samples': 100,  # minimum rows required in a leaf (min_data_in_leaf)
#     'max_bin': 100,  # number of histogram bins per feature
#     'subsample': 0.7,  # row subsample ratio
#     'subsample_freq': 1,  # subsample frequency, <= 0 disables it
#     'colsample_bytree': 0.9,  # column subsample ratio per tree
#     'min_child_weight': 0,  # minimum sum of hessians required in a leaf
#     'scale_pos_weight':99 # because training data is extremely unbalanced

## train/validation/test
# Folds 0 .. kfold-2 are validation folds: a model is trained with early
# stopping against the fold's local public test set and scored on both the
# local public and private sets. The last fold has no private set and its
# public set carries click_id (presumably the real, unlabelled test data); it
# is trained for the number of rounds found by the last validation fold.
kfold = 3
local_private_auc = .0  # never updated below; kept because later cells may read it
local_public_auc = .0
local_private_recall = .0
local_public_recall = .0
last_n_estimators = -1
for fold in range(kfold):
    is_validation_fold = fold < kfold - 1
    # The final fold has no private test split.
    if is_validation_fold:
        mods = ['train', 'public_test', 'private_test']
    else:
        mods = ['train', 'public_test']
    with timer('Train/Valid/Test for fold %s' % fold):
        DataSet = {}
        FoldInputDir = '%s/kfold/%s' % (InputDir, fold)
        # os.listdir order is arbitrary; sort so the row order of the training
        # frame and the [0] picks below are reproducible between runs.
        fold_files = sorted(os.listdir(FoldInputDir))
        # load train set: read every part, then concatenate once instead of
        # re-copying the growing frame for each file
        train_parts = [pd.read_hdf(path_or_buf= '%s/%s' % (FoldInputDir, f), key= '%s' % f.split('.')[0])
                       for f in fold_files if f.startswith('train')]
        if train_parts:
            DataSet['train'] = pd.concat(train_parts, axis= 0, ignore_index= True)
        else:
            DataSet['train'] = pd.DataFrame(columns= train_columns)
        del train_parts
        # load local private test set
        if is_validation_fold:
            private_test_file = [f for f in fold_files if f.endswith('private.hdf')][0]
            DataSet['private_test'] = pd.read_hdf(path_or_buf= '%s/%s' % (FoldInputDir, private_test_file), key= '%s' % private_test_file.split('.')[0])
        public_test_file = [f for f in fold_files if f.endswith('public.hdf')][0]
        DataSet['public_test'] = pd.read_hdf(path_or_buf= '%s/%s' % (FoldInputDir, public_test_file), key= '%s' % public_test_file.split('.')[0])
        # type conversion
        for mod in mods:
            # Every frame has the target except the final fold's public test.
            has_labels = is_validation_fold or mod == 'train'
            if has_labels:
                DataSet[mod][target] = DataSet[mod][target].astype('uint8')
            else:
                DataSet[mod]['click_id'] = DataSet[mod]['click_id'].astype('uint32')
            for feat in raw_feats:
                DataSet[mod][feat] = DataSet[mod][feat].astype(dtypes[feat])
            # count features are log1p-compressed and stored as float32
            for feat in count_feats:
                DataSet[mod][feat] = np.log1p(DataSet[mod][feat].astype('uint32')).astype('float32')
        if is_validation_fold:
            print('train size %sM, public test size %sM, private test size %sM' % (int(len(DataSet['train'])/1e6),
                                                                                   int(len(DataSet['public_test'])/1e6),
                                                                                   int(len(DataSet['private_test'])/1e6)))
        else:
            print('train size %sM, public test size %sM' % (int(len(DataSet['train'])/1e6), int(len(DataSet['public_test'])/1e6)))
        print('\n==== Memory usage %sM ====\n'% ((int(process.memory_info().rss/1e6))))
        # train
        xgtrain = lightgbm.Dataset(DataSet['train'][features].values,
                                   label= DataSet['train'][target].values,
                                   feature_name= features,
                                   categorical_feature= cate_feats
                                  )
        if is_validation_fold:
            evals_results = {}
            xgvalid = lightgbm.Dataset(DataSet['public_test'][features].values,
                                       label= DataSet['public_test'][target].values,
                                       feature_name= features,
                                       categorical_feature= cate_feats
                                      )
            # NOTE(review): evals_result / early_stopping_rounds / verbose_eval
            # are keyword arguments of older LightGBM releases; newer releases
            # (4.0+) expect callbacks instead -- confirm the pinned version.
            model = lightgbm.train(params,
                                   xgtrain,
                                   valid_sets=[xgtrain, xgvalid],
                                   valid_names=['train','valid'],
                                   evals_result= evals_results,
                                   num_boost_round= 500,
                                   early_stopping_rounds= 100,
                                   verbose_eval= 25,
                                   feval= None)
            n_estimators = model.best_iteration
            last_n_estimators = n_estimators
            # best_iteration is 1-based, the recorded metric list is 0-based
            public_auc = evals_results['valid']['auc'][n_estimators-1]
            local_public_auc += public_auc
        else:
            model = lightgbm.train(params, xgtrain, num_boost_round= last_n_estimators)
        print('\n==== Memory usage %sM ====\n'% ((int(process.memory_info().rss/1e6))))
        # prediction on train/test data set
        for mod in mods:
            DataSet[mod][strategy] = model.predict(DataSet[mod][features]).astype('float32')
        if is_validation_fold:
            # p_recall(truth, pred_proba): labels go first, predicted scores second
            private_recall = p_recall(DataSet['private_test'][target].values, DataSet['private_test'][strategy].values)
            local_private_recall += private_recall
            public_recall = p_recall(DataSet['public_test'][target].values, DataSet['public_test'][strategy].values)
            local_public_recall += public_recall
            print('train for fold %s done, epochs %s, auc %.6f, public recall %.6f(%s), private recall %.6f(%s)' % (fold,
                                             n_estimators,
                                             public_auc,
                                             public_recall,
                                             np.sum(DataSet['public_test'][target].values),
                                             private_recall,
                                             np.sum(DataSet['private_test'][target].values)
                                            )
                 )
        else:
            print('train for fold %s done, epochs %s, average auc %.6f, average public recall %.6f, average private recall %.6f' % (fold,
                                                                              last_n_estimators,
                                                                              local_public_auc/(kfold - 1),
                                                                              local_public_recall/(kfold - 1),
                                                                              local_private_recall/(kfold - 1)))
        # save predictions, one csv per (data set, day)
        FoldOutputDir = '%s/kfold/%s' % (OutputDir, fold)
        os.makedirs(FoldOutputDir, exist_ok= True)
        for mod in mods:
            if is_validation_fold or mod == 'train':
                DataSet[mod] = DataSet[mod][['day', strategy, target]]
            else:
                # submission layout: the prediction is written under the target name;
                # rename returns a new frame instead of mutating a column slice in place
                DataSet[mod] = DataSet[mod][['click_id', 'day', strategy]].rename(columns = {strategy: target})
            # local name: must not overwrite the module-level `days` dict
            fold_days = list(DataSet[mod]['day'].unique())
            for d in fold_days:
                k = '%s_%s_%s' % (mod, d, strategy)
                day_rows = DataSet[mod][DataSet[mod]['day'] == d]
                if not is_validation_fold:
                    # final-fold outputs are written without the day column
                    day_rows = day_rows.drop(['day'], axis=1)
                day_rows.to_csv('%s/%s.csv' % (FoldOutputDir, k), float_format='%.8f', index=False)
        del DataSet
        gc.collect()
        print('\n==== Memory usage %sM ====\n'% ((int(process.memory_info().rss/1e6))))

# In [None]:
## compress for submission
# Adds/updates the final fold's day-10 public test predictions in a zip
# archive under <OutputDir>/submit using the external `zip` tool.
import subprocess

SubmitDir = '%s/submit' % OutputDir
os.makedirs(SubmitDir, exist_ok=True)
filename = 'public_test_10_%s' % strategy
archive_path = '%s/%s.zip' % (SubmitDir, filename)
csv_path = '%s/kfold/%s/%s.csv' % (OutputDir, kfold - 1, filename)
print('zip -ur  %s %s' % (archive_path, csv_path))
# Pass an argument list (no shell) so paths containing spaces or shell
# metacharacters are handled correctly, and look at the exit status instead
# of silently discarding it.
try:
    zip_status = subprocess.run(['zip', '-ur', archive_path, csv_path]).returncode
except FileNotFoundError:
    zip_status = None
    print('zip executable not found, %s was not written' % archive_path)
else:
    # NOTE(review): zip is documented to exit with 12 when it has nothing to
    # do (archive already up to date with -u); treated as success -- confirm.
    if zip_status not in (0, 12):
        print('zip exited with status %s, %s may be missing or incomplete' % (zip_status, archive_path))