In [1]:
import warnings
warnings.filterwarnings("ignore")

import re
import gc
import datetime
import glob
import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import implicit
import lightgbm as lgb

import wordbatch
from wordbatch.extractors import WordBag, WordHash
from wordbatch.models import FM_FTRL, FTRL

In [3]:
def save_sparse_csr(filename, array):
    np.savez(filename, data=array.data, indices=array.indices,
             indptr=array.indptr, shape=array.shape)

def load_sparse_csr(filename):
    loader = np.load(filename)
    return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                      shape=loader['shape'])

In [4]:
dir_path = '/disk/Tbrain/'

train_set = pd.read_csv(dir_path+'training-set.csv', header=None, names=['FileID', 'Target'])
test_set = pd.read_csv(dir_path+'testing-set.csv', header=None, names=['FileID', 'Target'])
train_ex = pd.read_table(dir_path+'exception/exception_train.txt', header=None, names=['FileID'])
test_ex = pd.read_table(dir_path+'exception/exception_testing.txt', header=None, names=['FileID'])

train_set = train_set.loc[~train_set['FileID'].isin(train_ex)]
test_set = test_set.loc[~test_set['FileID'].isin(test_ex)]


log_data = pd.read_csv(dir_path+'log_data.csv')
log_data['QueryTS'] = pd.to_datetime(log_data['QueryTS'], format='%Y-%m-%d %H:%M:%S')
log_data.sort_values(['QueryTS'], ascending=True, na_position='first', inplace=True)
log_data.reset_index(drop=True, inplace=True)
log_data['ProductID'] = log_data['ProductID'].astype(str)
print('Load data complete.')

del train_ex, test_ex
gc.collect()

log_data.head()

Load data complete.


Unnamed: 0,FileID,CustomerID,QueryTS,ProductID
0,77043f410ba7dc8dced745db94a1fcba,02142c5dccec42262a1e88dc8416f5b7,2017-03-01,634e6b
1,c5ad385cba5ac43b0263f3d804c8f823,8d7a1059ef78672047e45cf1b84d6276,2017-03-01,7acab3
2,c5ad385cba5ac43b0263f3d804c8f823,8d7a1059ef78672047e45cf1b84d6276,2017-03-01,7acab3
3,75d7a7507ac89b33191b18d56af7544b,8d7a1059ef78672047e45cf1b84d6276,2017-03-01,7acab3
4,75d7a7507ac89b33191b18d56af7544b,8d7a1059ef78672047e45cf1b84d6276,2017-03-01,7acab3


In [5]:
data = pd.concat([train_set, test_set],axis=0)
data['Target'].replace(0.5, np.nan, inplace=True)

data.head()

Unnamed: 0,FileID,Target
0,0000e2398b12121a85166fed5fe2a3da,0.0
1,0001fe8dce14ce099aa6ca8ea5026ea7,0.0
2,00027f50019000accc492e5684efc818,0.0
3,00028c9da3573ec50db74b44310ae507,0.0
4,0003dc8130969abe688cadf5f14ea19f,0.0


### Magic Features
CAUTION!! Related with `Target`, take care with dealing overfitting

In [6]:
data = pd.concat([train_set, test_set],axis=0)
data['Target'].replace(0.5, np.nan, inplace=True)
nrow_train = len(train_set)

# Given fold number for each FileID (Need for feature engineering with y, prevent for overfitting)
n_folds = 10
skf = StratifiedKFold(n_splits=n_folds, random_state=5566)
data['Fold'] = np.nan
for f, (_, valid_idx) in enumerate(skf.split(data['FileID'].iloc[:nrow_train], data['Target'].iloc[:nrow_train])):
    data['Fold'].iloc[valid_idx] = f

log_data = pd.merge(log_data, data, on='FileID', how='left')    

data.head()

Unnamed: 0,FileID,Target,Fold
0,0000e2398b12121a85166fed5fe2a3da,0.0,0.0
1,0001fe8dce14ce099aa6ca8ea5026ea7,0.0,0.0
2,00027f50019000accc492e5684efc818,0.0,0.0
3,00028c9da3573ec50db74b44310ae507,0.0,0.0
4,0003dc8130969abe688cadf5f14ea19f,0.0,0.0


In [1]:
'CustomerID'[:4]

'Cust'

In [7]:
CustomerID_y = log_data[(log_data['Fold'] != 0) & (log_data['Fold'].notnull())].groupby('CustomerID')['Target'].agg({'TargetCust_mean': np.mean, 
                                                                                                                     'TargetCust_std': np.std, 
                                                                                                                     'TargetCust_sum': np.sum}).reset_index()
CustomerID_y = CustomerID_y[CustomerID_y['CustomerID'].isin(list(log_data[log_data['Fold'] == 0]['CustomerID']))]
CustomerID_y['Fold'] = 0

for fold in range(1, n_folds):
    tmp = log_data[(log_data['Fold'] != fold) & (log_data['Fold'].notnull())].groupby('CustomerID')['Target'].agg({'TargetCust_mean': np.mean, 
                                                                                                                   'TargetCust_std': np.std, 
                                                                                                                   'TargetCust_sum': np.sum}).reset_index()
    tmp = tmp[tmp['CustomerID'].isin(list(log_data[log_data['Fold'] == fold]['CustomerID']))]
    tmp['Fold'] = fold
    CustomerID_y = CustomerID_y.append(tmp)

tmp = log_data[log_data['Fold'].notnull()].groupby('CustomerID')['Target'].agg({'TargetCust_mean': np.mean, 
                                                                                'TargetCust_std': np.std, 
                                                                                'TargetCust_sum': np.sum}).reset_index()
tmp = tmp[tmp['CustomerID'].isin(list(log_data[log_data['Fold'].isnull()]['CustomerID']))]
tmp['Fold'] = np.nan
CustomerID_y = CustomerID_y.append(tmp)
print('Group by CustomerID completed.')



Group by CustomerID completed.


In [8]:
ProductID_y = log_data[(log_data['Fold'] != 0) & (log_data['Fold'].notnull())].groupby('ProductID')['Target'].agg({'TargetProd_mean': np.mean, 
                                                                                                                   'TargetProd_std': np.std, 
                                                                                                                   'TargetProd_sum': np.sum}).reset_index()
ProductID_y = ProductID_y[ProductID_y['ProductID'].isin(list(log_data[log_data['Fold'] == 0]['ProductID']))]
ProductID_y['Fold'] = 0

for fold in range(1, n_folds):
    tmp = log_data[(log_data['Fold'] != fold) & (log_data['Fold'].notnull())].groupby('ProductID')['Target'].agg({'TargetProd_mean': np.mean, 
                                                                                                                  'TargetProd_std': np.std, 
                                                                                                                  'TargetProd_sum': np.sum}).reset_index()
    tmp = tmp[tmp['ProductID'].isin(list(log_data[log_data['Fold'] == fold]['ProductID']))]
    tmp['Fold'] = fold
    ProductID_y = ProductID_y.append(tmp)

tmp = log_data[log_data['Fold'].notnull()].groupby('ProductID')['Target'].agg({'TargetProd_mean': np.mean, 
                                                                               'TargetProd_std': np.std, 
                                                                               'TargetProd_sum': np.sum}).reset_index()
tmp = tmp[tmp['ProductID'].isin(list(log_data[log_data['Fold'].isnull()]['ProductID']))]
tmp['Fold'] = np.nan
ProductID_y = ProductID_y.append(tmp)
print('Group by ProductID completed.')

Group by ProductID completed.


In [9]:
log_data = pd.merge(log_data, CustomerID_y, on=['CustomerID', 'Fold'], how='left')
log_data = pd.merge(log_data, ProductID_y, on=['ProductID', 'Fold'], how='left')

# for col in ['TargetCust_sum', 'TargetProd_sum']:
#     log_data[col] = (log_data[col]-np.mean(log_data[col])) / (np.std(log_data[col])+np.mean(log_data[col])+1)

# for col in ['TargetCust_mean', 'TargetCust_std', 'TargetCust_sum', 'TargetProd_mean', 'TargetProd_std', 'TargetProd_sum']:
#     log_data[col].fillna(-1, inplace=True)

del CustomerID_y, ProductID_y
gc.collect()

42

In [10]:
target_feature = pd.DataFrame()
cols_to_use = ['TargetCust_mean', 'TargetCust_std', 'TargetCust_sum', 'TargetProd_mean', 'TargetProd_std', 
               'TargetProd_sum']

for fold in range(n_folds):
    tmp_all = log_data.loc[log_data['Fold'] == fold].groupby('FileID')[cols_to_use].agg({'_mean': np.mean, '_std': np.std, 
                                                                                         '_min': np.min, '_max': np.max, 
                                                                                         '_median': np.median, 
                                                                                         '_10per': lambda x: np.percentile(x, q=10), 
                                                                                         '_25per': lambda x: np.percentile(x, q=25),
                                                                                         '_75per': lambda x: np.percentile(x, q=75),
                                                                                         '_90per': lambda x: np.percentile(x, q=90)})
    tmp_4 = pd.DataFrame()
    for col in ['_mean', '_std', '_min', '_median', '_max', '_10per', '_25per', '_75per', '_90per']:
        tmp = tmp_all[col]
        tmp.columns = [sub_col+col for sub_col in tmp.columns]
        tmp = tmp.reset_index()
        tmp_4 = pd.concat([tmp_4, tmp], axis=1)
        
    target_feature = pd.concat([target_feature, tmp_4], axis=0)

tmp_all = log_data.loc[log_data['Fold'].isnull()].groupby('FileID')[cols_to_use].agg({'_mean': np.mean, '_std': np.std, 
                                                                                      '_min': np.min, '_max': np.max,
                                                                                      '_median': np.median, 
                                                                                      '_10per': lambda x: np.percentile(x, q=10), 
                                                                                      '_25per': lambda x: np.percentile(x, q=25),
                                                                                      '_75per': lambda x: np.percentile(x, q=75),
                                                                                      '_90per': lambda x: np.percentile(x, q=90)})
tmp_4 = pd.DataFrame()
for col in ['_mean', '_std', '_min', '_median', '_max', '_10per', '_25per', '_75per', '_90per']:
    tmp = tmp_all[col]
    tmp.columns = [sub_col+col for sub_col in tmp.columns]
    tmp = tmp.reset_index()
    tmp_4 = pd.concat([tmp_4, tmp], axis=1)

target_feature = pd.concat([target_feature, tmp_4], axis=0)

tmp = pd.Series(target_feature.iloc[:,0], name='FileID').to_frame()
target_feature.drop(['FileID'], axis=True, inplace=True)
target_feature = pd.concat([tmp, target_feature], axis=1)
target_feature.fillna(-1, inplace=True)
print('Shape of features related with`Target`: {0}'.format(target_feature.shape))

data = pd.merge(data, target_feature, on='FileID', how='left')

# Drop columns only useful for magic features.
log_data.drop(log_data.columns.tolist()[5:], axis=1, inplace=True)
del target_feature
gc.collect()

Shape of features related with`Target`: (81894, 55)


1499

### User-Item Matrix

In [None]:
log_data['FileID'] = log_data['FileID'].astype('category', categories=data['FileID'].values)
log_data['CustomerID'] = log_data['CustomerID'].astype('category')

row = log_data['FileID'].cat.codes
col = log_data['CustomerID'].cat.codes

dtm = csr_matrix((np.ones(len(log_data)), (row, col)))
dtm = dtm[:, np.where(dtm.getnnz(axis=0) > 1)[0]]
print('Shape of DTM: {0}'.format(dtm.shape))

In [None]:
# Use Alternating Least Square to reduce dimension (latent factor)
als = implicit.als.AlternatingLeastSquares(factors=350)
als.fit(dtm)
X_als = csr_matrix(als.item_factors)

del dtm
gc.collect()

print('Extract ALS latent factor completed.')

### Document Term Matrix

In [None]:
# Get Time Series of CustomerID and ProductID
log_data['CustomerID_le'] = row
log_data['ProductID_le'] = col
# Sequence of CustomerID
tmp = log_data.groupby('FileID')['CustomerID_le'].apply(list)
data = pd.merge(data, tmp.to_frame().reset_index(), how='left', on='FileID')
# Sequence of ProductID
tmp = log_data.groupby('FileID')['ProductID_le'].apply(list)
data = pd.merge(data, tmp.to_frame().reset_index(), how='left', on='FileID')
log_data.drop(['CustomerID_le', 'ProductID_le'], axis=1, inplace=True)
print('Sequentail lebal encoding completed.')

In [None]:
def add_ngram(q, n_gram_max):
        ngrams = []
        for n in range(2, n_gram_max+1):
            for w_index in range(len(q)-n+1):
                ngrams.append(''.join(q[w_index:w_index+n]))
        return q + ngrams

def list2str(text):
    return u' '.join([str(x) for x in text])


wb = wordbatch.WordBatch(list2str, extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.5, 1.0], 
                                                        "hash_size": 2 ** 29, "norm": "l2", "tf": 'binary',
                                                        "idf": None}), procs=8)
wb.dictionary_freeze= True
X_cust = wb.fit_transform(data['CustomerID_le'])
X_cust = X_cust[:, np.where(X_cust.getnnz(axis=0) > 1)[0]]
print('Shape of X_cust: {0}'.format(X_cust.shape))
del wb


print('Vectorize `CustomerID` completed.')

wb = wordbatch.WordBatch(list2str, extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.0, 1.0], 
                                                        "hash_size": 2 ** 28, "norm": "l2", "tf": 1.0,
                                                        "idf": None}), procs=8)
wb.dictionary_freeze= True
X_prod = wb.fit_transform(data['ProductID_le'])
X_prod = X_prod[:, np.where(X_prod.getnnz(axis=0) > 10)[0]]
print('Shape of X_prod: {0}'.format(X_prod.shape))
# Fit ALS
als = implicit.als.AlternatingLeastSquares(factors=100)
als.fit(X_prod)
X_prod = csr_matrix(als.item_factors)
del wb, als


data.drop(['CustomerID_le', 'ProductID_le'], axis=1, inplace=True)

print('Vectorize `ProductID` completed.')

In [11]:
# Sequence of FileDiff
log_data['FileDiff'] = log_data['QueryTS'].diff().dt.total_seconds()
log_data['FileDiffgFile'] = log_data.groupby('FileID')['QueryTS'].diff().dt.total_seconds()
log_data['FileDiffgCust'] = log_data.groupby('CustomerID')['QueryTS'].diff().dt.total_seconds()
log_data['FileDiffgProd'] = log_data.groupby('ProductID')['QueryTS'].diff().dt.total_seconds()

# FileDiff
tmp = log_data.groupby('FileID')['FileDiff'].apply(list)
data = pd.merge(data, tmp.to_frame().reset_index(), how='left', on='FileID')
# FileDiffgFile
tmp = log_data.groupby('FileID')['FileDiffgFile'].apply(list)
data = pd.merge(data, tmp.to_frame().reset_index(), how='left', on='FileID')
# FileDiffgCust
tmp = log_data.groupby('FileID')['FileDiffgCust'].apply(list)
data = pd.merge(data, tmp.to_frame().reset_index(), how='left', on='FileID')
# FileDiffgProd
tmp = log_data.groupby('FileID')['FileDiffgProd'].apply(list)
data = pd.merge(data, tmp.to_frame().reset_index(), how='left', on='FileID')
print('Generate list of time delta sequence complete.')

Generate list of time delta sequence complete.


In [None]:
wb = wordbatch.WordBatch(list2str, extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.5, 1.0], 
                                                        "hash_size": 2 ** 28, "norm": "l2", "tf": 'binary',
                                                        "idf": None}), procs=24)
wb.dictionary_freeze= True
X_diff = wb.fit_transform(data['FileDiff'])
X_diff = X_diff[:, np.where(X_diff.getnnz(axis=0) > 1)[0]]
print('Shape of X_cust: {0}'.format(X_diff.shape))
del wb

wb = wordbatch.WordBatch(list2str, extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.5, 1.0], 
                                                        "hash_size": 2 ** 28, "norm": "l2", "tf": 'binary',
                                                        "idf": None}), procs=24)
wb.dictionary_freeze= True
tmp = wb.fit_transform(data['FileDiffgFile'])
tmp = tmp[:, np.where(tmp.getnnz(axis=0) > 5)[0]]
print('Shape of X_prod: {0}'.format(tmp.shape))
# Fit ALS
als = implicit.als.AlternatingLeastSquares(factors=300)
als.fit(tmp)
tmp = csr_matrix(als.item_factors)

del wb, als
X_diff = hstack((X_diff, tmp)).tocsr()


wb = wordbatch.WordBatch(list2str, extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.5, 1.0], 
                                                        "hash_size": 2 ** 28, "norm": "l2", "tf": 'binary',
                                                        "idf": None}), procs=24)
wb.dictionary_freeze= True
tmp = wb.fit_transform(data['FileDiffgCust'])
tmp = tmp[:, np.where(tmp.getnnz(axis=0) > 5)[0]]
print('Shape of X_prod: {0}'.format(tmp.shape))
# Fit ALS
als = implicit.als.AlternatingLeastSquares(factors=50)
als.fit(tmp)
tmp = csr_matrix(als.item_factors)

del wb, als
X_diff = hstack((X_diff, tmp)).tocsr()

wb = wordbatch.WordBatch(list2str, extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.5, 1.0], 
                                                        "hash_size": 2 ** 28, "norm": "l2", "tf": 'binary',
                                                        "idf": None}), procs=24)
wb.dictionary_freeze= True
tmp = wb.fit_transform(data['FileDiffgProd'])
tmp = tmp[:, np.where(tmp.getnnz(axis=0) > 1)[0]]
print('Shape of X_prod: {0}'.format(tmp.shape))
# Fit ALS
als = implicit.als.AlternatingLeastSquares(factors=50)
als.fit(tmp)
tmp = csr_matrix(als.item_factors)

del wb, als
X_diff = hstack((X_diff, tmp)).tocsr()

data.drop(['FileDiff', 'FileDiffgFile', 'FileDiffgCust', 'FileDiffgProd'], axis=1, inplace=True)
del tmp
gc.collect()
print('Vectorize `Time Difference` completed.')

### Counting features

In [12]:
# Numbers of FileID
tmp = log_data.groupby('FileID').apply(len)
tmp = tmp.to_frame().reset_index()
tmp.columns = ['FileID', 'FileCount']
data = pd.merge(data, tmp, how='left', on='FileID')
# Numbers of unique CustomerID
tmp = log_data.groupby('FileID')['CustomerID'].apply(lambda x: len(np.unique(x)))
tmp = tmp.to_frame().reset_index()
tmp.columns = ['FileID', 'CustCount']
data = pd.merge(data, tmp, how='left', on='FileID')
# Numbers of unique ProductID
tmp = log_data.groupby('FileID')['ProductID'].apply(lambda x: len(np.unique(x)))
tmp = tmp.to_frame().reset_index()
tmp.columns = ['FileID', 'ProdCount']
data = pd.merge(data, tmp, how='left', on='FileID')
print('Count FileID completed.')

# Numbers of idiot countings from FileId and ProductId
tmp = log_data.groupby('FileID')['CustomerID'].apply(lambda x: len(''.join(x)))
tmp = tmp.to_frame().reset_index()
tmp.columns = ['FileID', 'DigfCustLen']
data = pd.merge(data, tmp, how='left', on='FileID')
tmp = log_data.groupby('FileID')['CustomerID'].apply(lambda x: sum(c.isdigit() for c in ''.join(x))/len(''.join(x)))
tmp = tmp.to_frame().reset_index()
tmp.columns = ['FileID', 'DigfCustProp']
data = pd.merge(data, tmp, how='left', on='FileID')
tmp = log_data.groupby('FileID')['ProductID'].apply(lambda x: len(''.join(x)))
tmp = tmp.to_frame().reset_index()
tmp.columns = ['FileID', 'DigfProdLen']
data = pd.merge(data, tmp, how='left', on='FileID')
tmp = log_data.groupby('FileID')['ProductID'].apply(lambda x: sum(c.isdigit() for c in ''.join(x))/len(''.join(x)))
tmp = tmp.to_frame().reset_index()
tmp.columns = ['FileID', 'DigfProdProp']
data = pd.merge(data, tmp, how='left', on='FileID')

# Idiot counting
data['NumDig'] = data['FileID'].apply(lambda x: sum(c.isdigit() for c in x))
data['NumA'] = data['FileID'].apply(lambda x: x.count('a'))
data['NumB'] = data['FileID'].apply(lambda x: x.count('b'))
data['NumC'] = data['FileID'].apply(lambda x: x.count('c'))
data['NumD'] = data['FileID'].apply(lambda x: x.count('d'))
data['NumE'] = data['FileID'].apply(lambda x: x.count('e'))
print('Idiot count completed.')

Count FileID completed.
Idiot count completed.


In [13]:
# Date preprocessing
log_data['Month'] = log_data['QueryTS'].dt.month
log_data['Day'] = log_data['QueryTS'].dt.day
log_data['Hour'] = log_data['QueryTS'].dt.hour
log_data['Minute'] = log_data['QueryTS'].dt.minute
log_data['Second'] = log_data['QueryTS'].dt.second
log_data['WoY'] = log_data['QueryTS'].dt.weekofyear
log_data['DoW'] = log_data['QueryTS'].dt.dayofweek
log_data['DoY'] = log_data['QueryTS'].dt.dayofyear

In [17]:
# Dealing with datetime
cols_name = ['Month', 'Day', 'Hour', 'Minute', 'Second', 'WoY', 'DoW', 'DoY']
# Mean
tmp = log_data.groupby('FileID')[cols_name].agg(np.mean)
tmp.columns = [col+'_mean' for col in tmp.columns]
tmp.reset_index(inplace=True)
data = pd.merge(data, tmp, how='left', on='FileID')
# Ten percentile
tmp = log_data.groupby('FileID')[cols_name].agg(lambda x: np.percentile(x, q=10))
tmp.columns = [col+'_10per' for col in tmp.columns]
tmp.reset_index(inplace=True)
data = pd.merge(data, tmp, how='left', on='FileID')
# First Quartile
tmp = log_data.groupby('FileID')[cols_name].agg(lambda x: np.percentile(x, q=25))
tmp.columns = [col+'_25per' for col in tmp.columns]
tmp.reset_index(inplace=True)
data = pd.merge(data, tmp, how='left', on='FileID')
# Third Quartile
tmp = log_data.groupby('FileID')[cols_name].agg(lambda x: np.percentile(x, q=75))
tmp.columns = [col+'_75per' for col in tmp.columns]
tmp.reset_index(inplace=True)
data = pd.merge(data, tmp, how='left', on='FileID')
# Ninety percentile
tmp = log_data.groupby('FileID')[cols_name].agg(lambda x: np.percentile(x, q=90))
tmp.columns = [col+'_90per' for col in tmp.columns]
tmp.reset_index(inplace=True)
data = pd.merge(data, tmp, how='left', on='FileID')
# Median
tmp = log_data.groupby('FileID')[cols_name].agg(np.median)
tmp.columns = [col+'_median' for col in tmp.columns]
tmp.reset_index(inplace=True)
data = pd.merge(data, tmp, how='left', on='FileID')
# Minimum
tmp = log_data.groupby('FileID')[cols_name].agg(np.min)
tmp.columns = [col+'_min' for col in tmp.columns]
tmp.reset_index(inplace=True)
data = pd.merge(data, tmp, how='left', on='FileID')
# Maximum
tmp = log_data.groupby('FileID')[cols_name].agg(np.max)
tmp.columns = [col+'_max' for col in tmp.columns]
tmp.reset_index(inplace=True)
data = pd.merge(data, tmp, how='left', on='FileID')
# standard deviation
tmp = log_data.groupby('FileID')[cols_name].agg(np.std)
tmp.columns = [col+'_std' for col in tmp.columns]
tmp.reset_index(inplace=True)
data = pd.merge(data, tmp, how='left', on='FileID')
print('Generate datetime features completed.')

Generate datetime features completed.


In [19]:
def dt_percentile(x, p):
    index = range(len(x))
    return x.iloc[np.int((np.percentile(index, p)))]    

latest_date = log_data['QueryTS'].iloc[-1]

# Minimum
tmp = log_data.groupby('FileID')['QueryTS'].agg(lambda x: latest_date - np.min(x)).dt.total_seconds()
tmp = tmp.to_frame().reset_index()
tmp.columns = ['FileID', 'Duration_min']
data = pd.merge(data, tmp.reset_index(drop=True), how='left', on='FileID')
# Ten percentile
tmp = log_data.groupby('FileID')['QueryTS'].agg(lambda x: latest_date - dt_percentile(x, 10)).dt.total_seconds()
tmp = tmp.to_frame().reset_index()
tmp.columns = ['FileID', 'Duration_10per']
data = pd.merge(data, tmp.reset_index(drop=True), how='left', on='FileID')
# First Quantile
tmp = log_data.groupby('FileID')['QueryTS'].agg(lambda x: latest_date - dt_percentile(x, 25)).dt.total_seconds()
tmp = tmp.to_frame().reset_index()
tmp.columns = ['FileID', 'Duration_25per']
data = pd.merge(data, tmp.reset_index(drop=True), how='left', on='FileID')
# Median
tmp = log_data.groupby('FileID')['QueryTS'].agg(lambda x: latest_date - dt_percentile(x, 50)).dt.total_seconds()
tmp = tmp.to_frame().reset_index()
tmp.columns = ['FileID', 'Duration_median']
data = pd.merge(data, tmp.reset_index(drop=True), how='left', on='FileID')
# Third Quantile
tmp = log_data.groupby('FileID')['QueryTS'].agg(lambda x: latest_date - dt_percentile(x, 75)).dt.total_seconds()
tmp = tmp.to_frame().reset_index()
tmp.columns = ['FileID', 'Duration_75per']
data = pd.merge(data, tmp.reset_index(drop=True), how='left', on='FileID')
# Ninety percentile
tmp = log_data.groupby('FileID')['QueryTS'].agg(lambda x: latest_date - dt_percentile(x, 90)).dt.total_seconds()
tmp = tmp.to_frame().reset_index()
tmp.columns = ['FileID', 'Duration_90per']
data = pd.merge(data, tmp.reset_index(drop=True), how='left', on='FileID')
# Maximum
tmp = log_data.groupby('FileID')['QueryTS'].agg(lambda x: latest_date - np.max(x)).dt.total_seconds()
tmp = tmp.to_frame().reset_index()
tmp.columns = ['FileID', 'Duration_max']
data = pd.merge(data, tmp.reset_index(drop=True), how='left', on='FileID')
# Max - Min
data['Duration_range'] = data['Duration_min'] - data['Duration_max']

print('Generate difference datetime features completed.')

Generate difference datetime features completed.


In [21]:
# Count the difference time between each file was scaned
cols_name = ['FileDiff', 'FileDiffgFile', 'FileDiffgCust', 'FileDiffgProd']
log_data[cols_name] = log_data[cols_name].fillna(-1)
# Mean
tmp = log_data.groupby('FileID')[cols_name].agg(np.mean)
tmp.columns = [col+'_mean' for col in tmp.columns]
tmp.reset_index(inplace=True)
data = pd.merge(data, tmp.reset_index(), how='left', on='FileID')
# Ten percentile
tmp = log_data.groupby('FileID')[cols_name].agg(lambda x: np.percentile(x, q=10))
tmp.columns = [col+'_10per' for col in tmp.columns]
tmp.reset_index(inplace=True)
data = pd.merge(data, tmp.reset_index(), how='left', on='FileID')
# First Quartile
tmp = log_data.groupby('FileID')[cols_name].agg(lambda x: np.percentile(x, q=25))
tmp.columns = [col+'_25per' for col in tmp.columns]
tmp.reset_index(inplace=True)
data = pd.merge(data, tmp.reset_index(), how='left', on='FileID')
# Third Quartile
tmp = log_data.groupby('FileID')[cols_name].agg(lambda x: np.percentile(x, q=75))
tmp.columns = [col+'_75per' for col in tmp.columns]
tmp.reset_index(inplace=True)
data = pd.merge(data, tmp.reset_index(), how='left', on='FileID')
# Ninety percentile
tmp = log_data.groupby('FileID')[cols_name].agg(lambda x: np.percentile(x, q=90))
tmp.columns = [col+'_90per' for col in tmp.columns]
tmp.reset_index(inplace=True)
data = pd.merge(data, tmp.reset_index(), how='left', on='FileID')
# Median
tmp = log_data.groupby('FileID')[cols_name].agg(np.median)
tmp.columns = [col+'_median' for col in tmp.columns]
tmp.reset_index(inplace=True)
data = pd.merge(data, tmp.reset_index(), how='left', on='FileID')
# Minimum
tmp = log_data.groupby('FileID')[cols_name].agg(np.min)
tmp.columns = [col+'_min' for col in tmp.columns]
tmp.reset_index(inplace=True)
data = pd.merge(data, tmp.reset_index(), how='left', on='FileID')
# Maximum
tmp = log_data.groupby('FileID')[cols_name].agg(np.max)
tmp.columns = [col+'_max' for col in tmp.columns]
tmp.reset_index(inplace=True)
data = pd.merge(data, tmp.reset_index(), how='left', on='FileID')
# standard deviation
tmp = log_data.groupby('FileID')[cols_name].agg(np.std)
tmp.columns = [col+'_std' for col in tmp.columns]
tmp.reset_index(inplace=True)
data = pd.merge(data, tmp.reset_index(), how='left', on='FileID')
print('Datetime groupby completed.')

Datetime groupby completed.


In [7]:
# Save or Load?
if False:
    save_sparse_csr('/disk/albert/Top1/als_{0}.npz'.format('v2'), X_als)
    save_sparse_csr('/disk/albert/Top1/cust_{0}.npz'.format('v2'), X_cust)
    save_sparse_csr('/disk/albert/Top1/prod_{0}.npz'.format('v2'), X_prod)
    save_sparse_csr('/disk/albert/Top1/diff_{0}.npz'.format('v2'), X_diff)

if True:
    X_als = load_sparse_csr('/disk/albert/Top1/als_v2.npz')
    X_cust = load_sparse_csr('/disk/albert/Top1/cust_v2.npz')
    X_prod = load_sparse_csr('/disk/albert/Top1/prod_v2.npz')
    X_diff = load_sparse_csr('/disk/albert/Top1/diff_v2.npz')
    
y = data['Target']
cv_folds = data['Fold']
cv_folds.dropna(inplace=True)
y = y.dropna()

In [25]:
y = data['Target']
cv_folds = data['Fold']
cv_folds.dropna(inplace=True)
y = y.dropna()
data.drop(['FileID', 'Target', 'Fold', 'index_x', 'index_y', 'index'], axis=1, inplace=True)
tmp = data.std()
col_std0 = tmp[tmp.isnull()].index
data.drop(col_std0, axis=1, inplace=True)

In [8]:
data = pd.read_csv('./backup.csv')

In [9]:
data.head() 

Unnamed: 0,TargetCust_mean_mean,TargetCust_std_mean,TargetCust_sum_mean,TargetProd_mean_mean,TargetProd_std_mean,TargetProd_sum_mean,TargetCust_mean_std,TargetCust_std_std,TargetCust_sum_std,TargetProd_mean_std,...,FileDiffgCust_min,FileDiffgProd_min,FileDiff_max,FileDiffgFile_max,FileDiffgCust_max,FileDiffgProd_max,FileDiff_std,FileDiffgFile_std,FileDiffgCust_std,FileDiffgProd_std
0,0.0,0.0,0.0,0.036583,0.187681,297715.297872,0.0,0.0,0.0,0.001691,...,-1.0,0.0,1.0,287902.0,287902.0,3.0,0.397727,42758.474555,41977.666485,0.706125
1,0.000215,0.001605,0.017094,0.017495,0.127007,102680.264957,0.002068,0.014574,0.159554,0.009832,...,-1.0,0.0,1.0,22908.0,905563.0,26.0,0.325257,2883.826921,82068.297495,2.032187
2,0.084151,0.162664,84610.5,0.121472,0.298766,524881.81746,0.107129,0.202659,107714.216372,0.080458,...,-1.0,0.0,3.0,56130.0,2965080.0,24.0,0.450269,5353.024683,209513.933912,4.161094
3,0.03889,0.073787,38485.081522,0.066022,0.219548,893258.858696,0.083666,0.158205,83990.385228,0.071581,...,-1.0,0.0,1.0,80033.0,2791885.0,35.0,0.227011,6874.729023,369618.759118,4.147249
4,0.036833,0.072184,36560.821739,0.063297,0.21653,836606.008511,0.081888,0.156051,82360.790847,0.069086,...,-1.0,0.0,2.0,62842.0,2774995.0,21.0,0.472868,6694.117995,620568.905081,2.691969


In [10]:
tmp = data.std()
data.drop(tmp[tmp<=0].index, axis=1, inplace=True)

In [17]:
data.drop(['FileDiff', 'FileDiffgFile', 'FileDiffgCust', 'FileDiffgProd'], axis=1, inplace=True)

In [19]:
data = data.apply(lambda x: (x-np.nanmean(x))/(np.nanstd(x)+np.nanmean(x)+1))
data.fillna(-1, inplace=True)
X_data = csr_matrix(data)
X_data = hstack((X_data, X_als, X_cust, X_prod, X_diff)).tocsr()

In [None]:
y = data['Target']
cv_folds = data['Fold']
cv_folds.dropna(inplace=True)
y = y.dropna()
data.drop(['FileID', 'Target', 'Fold'], axis=1, inplace=True)
data.fillna(-1, inplace=True)
data = data.apply(lambda x: (x-np.mean(x))/(np.std(x)+np.mean(x)+1))
X_data = csr_matrix(data)
save_sparse_csr('/disk/albert/Top1/data_{0}.npz'.format('v3'), X_data)
X_data = hstack((X_data, X_als, X_cust, X_prod, X_diff)).tocsr()
print('Shape of X_data: {0}'.format(X_data.shape))

In [23]:
save_sparse_csr('/disk/albert/Top1/new_process_{0}.npz'.format('v3'), X_data)
del X_als, X_cust, X_prod, X_diff
gc.collect()

3001

### Submit Final bagging answer

In [20]:
params = {
    'learning_rate': 0.01,
    'application': 'binary',
    'metric': 'auc',
    'is_unbalance': True,
    
    'bagging_fraction': 0.75,
    'bagging_freq': 3,
    'feature_fraction': 0.66,
    'max_depth': 10,
    'min_data_in_leaf': 50,
    'num_leaves': 87,

    'verbosity': -1,
    'data_random_seed': 1,
    'max_bin': 64,
    'nthread': 18
}

In [21]:
# Reset data
train_X = X_data[:len(train_set)]
test_X = X_data[len(train_set):]
train_y = y[:len(train_set)]

n_bags = 5
predsL = 0

for fold in range(n_folds):
    train_idx, valid_idx = cv_folds.loc[cv_folds != fold].index, cv_folds.loc[cv_folds == fold].index
    
    d_train = lgb.Dataset(train_X[train_idx], label=train_y[train_idx])
    d_valid = lgb.Dataset(train_X[valid_idx], label=train_y[valid_idx])
    watchlist = [d_train, d_valid]
    
    for bgs in range(n_bags):
        params['feature_fraction_seed'] = np.random.randint(10000)
        params['bagging_seed'] = np.random.randint(10000)
    
        model = lgb.train(params, train_set=d_train, num_boost_round=2600, valid_sets=watchlist, 
                          early_stopping_rounds=200, verbose_eval=False)
        
        tmpL = model.predict(train_X[valid_idx])
        tmpL = (tmpL - min(tmpL))/(max(tmpL) - min(tmpL))
        print("LGBM validation AUC: {0:.6f}".format(roc_auc_score(train_y[valid_idx], tmpL)))
        predsL += model.predict(test_X)
    print('='*30)

predsL /= (n_folds*n_bags)
predsL = (predsL - min(predsL))/(max(predsL) - min(predsL))
print('Predict LGBM completed.')

LGBM validation AUC: 0.977536
LGBM validation AUC: 0.978124
LGBM validation AUC: 0.977730
LGBM validation AUC: 0.977607
LGBM validation AUC: 0.977632
LGBM validation AUC: 0.978451
LGBM validation AUC: 0.978161
LGBM validation AUC: 0.977670
LGBM validation AUC: 0.977853
LGBM validation AUC: 0.978295
LGBM validation AUC: 0.973577
LGBM validation AUC: 0.974007
LGBM validation AUC: 0.974841
LGBM validation AUC: 0.974382
LGBM validation AUC: 0.974169
LGBM validation AUC: 0.980948
LGBM validation AUC: 0.980867
LGBM validation AUC: 0.980600
LGBM validation AUC: 0.981171
LGBM validation AUC: 0.980600
LGBM validation AUC: 0.980504
LGBM validation AUC: 0.979798
LGBM validation AUC: 0.980010
LGBM validation AUC: 0.979869
LGBM validation AUC: 0.979767
LGBM validation AUC: 0.981790
LGBM validation AUC: 0.981440
LGBM validation AUC: 0.981314
LGBM validation AUC: 0.980964
LGBM validation AUC: 0.981631
LGBM validation AUC: 0.980891
LGBM validation AUC: 0.980374
LGBM validation AUC: 0.980431
LGBM valid

In [22]:
submit = pd.concat([test_set[['FileID']], pd.Series(predsL)], axis=1)
submit.columns = ['FileID', 'Probability']
submit.to_csv('./LGBM_{0}bag_{1}.csv'.format(n_bags*n_folds, re.sub('-', '', str(datetime.date.today())[5:])),
              index=False)