In [1]:
from __future__ import division
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.cross_validation import KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics
from scipy import sparse
import xgboost as xgb
import lightgbm as lgb
import cPickle
import time
import datetime
import math
from multiprocessing import cpu_count
import gc
import warnings
warnings.filterwarnings('ignore')



In [2]:
# Constants
# Project root as a relative path; it is prefixed to the data input/output paths used below.
ROOT_PATH = '../'
# Mode flag. NOTE(review): not referenced in any visible cell of this notebook -- confirm whether it is still needed.
ONLINE = 1

In [3]:
# Name of the label column in the combined train+test frame.
target = 'label'
# Number of leading rows of the combined frame that are treated as training rows when it is split.
# NOTE(review): presumably equals the row count of uid_train.txt -- confirm against len(train).
train_len = 4999
# Probability cut-off used to turn predicted scores into 0/1 labels.
threshold = 0.5

In [None]:
########################################### Helper functions ###########################################

In [4]:
def log(info):
    """Print `info` (converted with str) prefixed by the current local time.

    The output format is 'YYYY-MM-DD HH:MM:SS <info>'.
    """
    # strftime falls back to the current local time when no time tuple is given.
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
    print(timestamp + ' ' + str(info))

In [5]:
def merge_feat_count(df, df_feat, columns_groupby, new_column_name, type='int'):
    """Attach a per-group row count from `df_feat` to `df`.

    Parameters
    ----------
    df : DataFrame to extend (not modified; a merged copy is returned).
    df_feat : DataFrame whose rows are counted.
    columns_groupby : list of column names to group by and to merge on.
    new_column_name : name given to the count column.
    type : dtype the counts are cast to before merging.

    Returns
    -------
    (merged DataFrame, [new_column_name])

    Rows of `df` without a matching group get NaN in the new column,
    because the counts are joined with a left merge.
    """
    group_sizes = df_feat.groupby(columns_groupby).size()
    count_table = pd.DataFrame(group_sizes).fillna(0).astype(type).reset_index()
    count_table.columns = columns_groupby + [new_column_name]
    merged = df.merge(count_table, how='left', on=columns_groupby)
    return merged, [new_column_name]

def merge_feat_onehot_count(df, df_feat, columns_groupby, prefix, type='int'):
    """Attach one count column per category value to `df`.

    `df_feat` is grouped by `columns_groupby`, the group sizes are unstacked
    so that the last grouping level becomes columns, and the table is
    left-merged onto `df` using the first grouping column as the key.
    Intended for a two-element list such as ['uid', 'call_type'].

    Parameters
    ----------
    df : DataFrame to extend (a merged copy is returned).
    df_feat : DataFrame whose rows are counted.
    columns_groupby : list of grouping columns; element 0 is the merge key.
    prefix : new columns are named '<prefix>_<category value>'.
    type : dtype the counts are cast to before merging.

    Returns
    -------
    (merged DataFrame, list of the new column names)

    Key/category combinations that never occur are filled with 0; rows of
    `df` whose key is absent from `df_feat` get NaN (left merge).
    """
    key = columns_groupby[0]
    pivot = df_feat.groupby(columns_groupby).size().unstack()
    pivot = pivot.fillna(0).astype(type).reset_index()

    # Keep the key column's name, prefix every category column.
    renamed = []
    for col in pivot.columns:
        if col == key:
            renamed.append(col)
        else:
            renamed.append(prefix + '_' + str(col))
    pivot.columns = renamed

    merged = df.merge(pivot, how='left', on=key)
    return merged, list(pivot.columns.values[1:])

def _merge_feat_agg(df, df_feat, columns_groupby, column, new_column_name, agg, type):
    """Shared implementation of the merge_feat_<agg> helpers.

    Groups `df_feat` by `columns_groupby`, applies the group-by method named
    `agg` (e.g. 'min', 'nunique') to `column`, casts the result to `type`
    and left-merges it onto `df` under `new_column_name`.

    Returns (merged DataFrame, [new_column_name]). Rows of `df` without a
    matching group get NaN in the new column because of the left merge.
    """
    grouped = df_feat.groupby(columns_groupby)[column]
    df_agg = pd.DataFrame(getattr(grouped, agg)()).fillna(0).astype(type).reset_index()
    df_agg.columns = columns_groupby + [new_column_name]
    df = df.merge(df_agg, on=columns_groupby, how='left')
    return df, [new_column_name]


def merge_feat_nunique(df, df_feat, columns_groupby, column, new_column_name, type='int'):
    """Add the number of distinct `column` values per group of `df_feat` to `df`.

    Returns (merged DataFrame, [new_column_name]).
    """
    return _merge_feat_agg(df, df_feat, columns_groupby, column, new_column_name, 'nunique', type)


def merge_feat_min(df, df_feat, columns_groupby, column, new_column_name, type='float'):
    """Add the per-group minimum of `column` in `df_feat` to `df`.

    Returns (merged DataFrame, [new_column_name]).
    """
    return _merge_feat_agg(df, df_feat, columns_groupby, column, new_column_name, 'min', type)


def merge_feat_max(df, df_feat, columns_groupby, column, new_column_name, type='float'):
    """Add the per-group maximum of `column` in `df_feat` to `df`.

    Returns (merged DataFrame, [new_column_name]).
    """
    return _merge_feat_agg(df, df_feat, columns_groupby, column, new_column_name, 'max', type)


def merge_feat_mean(df, df_feat, columns_groupby, column, new_column_name, type='float'):
    """Add the per-group mean of `column` in `df_feat` to `df`.

    Returns (merged DataFrame, [new_column_name]).
    """
    return _merge_feat_agg(df, df_feat, columns_groupby, column, new_column_name, 'mean', type)

In [6]:
def eval_auc_f1(preds, dtrain):
    """Custom LightGBM evaluation metric: 0.6 * AUC + 0.4 * F1.

    Parameters
    ----------
    preds : predicted scores for the rows of `dtrain`.
    dtrain : dataset object exposing the true labels via get_label().

    Returns
    -------
    ('feval', score, True) -- metric name, metric value and a flag saying
    that larger values are better.

    F1 is computed on hard labels obtained by comparing the scores with the
    notebook-level `threshold` (score >= threshold -> 1, otherwise 0).
    """
    y_true = dtrain.get_label()
    y_pred = np.where(np.asarray(preds) >= threshold, 1, 0)
    auc = metrics.roc_auc_score(y_true, preds)
    f1 = metrics.f1_score(y_true, y_pred)
    blended = auc * 0.6 + f1 * 0.4
    return 'feval', blended, True

def lgb_cv(train_x, train_y, params, rounds, folds):
    """Run LightGBM cross-validation and report the final scores.

    Parameters
    ----------
    train_x : DataFrame of training features (its columns are logged).
    train_y : training labels.
    params : LightGBM parameter dict.
    rounds : maximum number of boosting rounds.
    folds : number of CV folds.

    Returns
    -------
    (n_rounds, feval_mean, auc_mean) -- the number of rounds recorded in the
    CV history and the last recorded mean of the custom metric and of AUC.
    """
    # Wall-clock timing: time.clock() measured CPU time on Unix and was
    # removed in Python 3.8.
    start = time.time()
    log(str(train_x.columns))
    dtrain = lgb.Dataset(train_x, label=train_y)
    log('run cv: ' + 'round: ' + str(rounds))
    # Only built-in metric names belong in `metrics`; the custom metric is
    # supplied through `feval` and is reported under the name 'feval'.
    res = lgb.cv(params, dtrain, rounds, nfold=folds,
                 metrics=['auc'], feval=eval_auc_f1,
                 early_stopping_rounds=200, verbose_eval=5)
    elapsed = time.time() - start
    log('Time used:' + str(elapsed) + 's')
    return len(res['feval-mean']), res['feval-mean'][-1], res['auc-mean'][-1]

def lgb_train_predict(train_x, train_y, test_x, params, rounds):
    """Train a LightGBM model on the training set and predict on `test_x`.

    Parameters
    ----------
    train_x : DataFrame of training features (its columns are logged).
    train_y : training labels.
    test_x : features to predict on.
    params : LightGBM parameter dict.
    rounds : number of boosting rounds.

    Returns
    -------
    (model, pred) -- the trained booster and its predictions for `test_x`.
    """
    # Wall-clock timing: time.clock() measured CPU time on Unix and was
    # removed in Python 3.8.
    start = time.time()
    log(str(train_x.columns))
    dtrain = lgb.Dataset(train_x, label=train_y)
    # The training data doubles as the validation set, so the logged scores
    # are training scores, not a generalisation estimate.
    valid_sets = [dtrain]
    model = lgb.train(params, dtrain, rounds, valid_sets, feval=eval_auc_f1, verbose_eval=5)
    pred = model.predict(test_x)
    elapsed = time.time() - start
    log('Time used:' + str(elapsed) + 's')
    return model, pred

def store_result(test_index, pred, threshold, name, output_dir='../data/output/sub/'):
    """Turn predicted probabilities into labels and write a submission file.

    Parameters
    ----------
    test_index : uid values, aligned with `pred`.
    pred : predicted probabilities.
    threshold : probabilities >= threshold are labelled 1, others 0. This is
        the same comparison eval_auc_f1 uses, so the written labels match
        the metric that was optimised.
    name : file name without extension; '<name>.csv' is written.
    output_dir : directory the file is written to (must already exist).

    Returns
    -------
    DataFrame with columns uid, prob and label, sorted by prob descending.
    The CSV contains only uid and label, without header or index.
    """
    import os  # local import: the notebook's import cell does not import os

    result = pd.DataFrame({'uid': test_index, 'prob': pred})
    result = result.sort_values('prob', ascending=False)
    result['label'] = (result['prob'] >= threshold).astype(int)
    result.to_csv(os.path.join(output_dir, name + '.csv'),
                  index=False, header=False, columns=['uid', 'label'])
    return result

In [None]:
########################################### Read data ###########################################

In [7]:
# Load the training tables. Every file is read as tab-separated text without a
# header row, and the column names are assigned explicitly afterwards.
# uid -> label table
train = pd.read_csv(ROOT_PATH + 'data/input/train/uid_train.txt', header=None, sep='\t')
train.columns = ['uid', 'label']
# voice records (8 columns)
train_voice = pd.read_csv(ROOT_PATH + 'data/input/train/voice_train.txt', header=None, sep='\t')
train_voice.columns = ['uid', 'opp_num', 'opp_head', 'opp_len', 'start_time', 'end_time', 'call_type', 'in_out']
# sms records (6 columns)
train_sms = pd.read_csv(ROOT_PATH + 'data/input/train/sms_train.txt', header=None, sep='\t')
train_sms.columns = ['uid', 'opp_num', 'opp_head', 'opp_len', 'start_time', 'in_out']
# wa records (8 columns)
train_wa = pd.read_csv(ROOT_PATH + 'data/input/train/wa_train.txt', header=None, sep='\t')
train_wa.columns = ['uid', 'wa_name', 'visit_cnt', 'visit_dura', 'up_flow', 'down_flow', 'wa_type', 'date']

In [8]:
# The test uid list is generated rather than read from a file: 'u5000' .. 'u6999'.
# NOTE(review): assumes the test_a files cover exactly this uid range -- confirm.
test = pd.DataFrame({'uid': ['u' + str(i) for i in range(5000, 7000)]})
# Test tables use the same column layout as the corresponding training tables.
test_voice = pd.read_csv(ROOT_PATH + 'data/input/test_a/voice_test_a.txt', header=None, sep='\t')
test_voice.columns = ['uid', 'opp_num', 'opp_head', 'opp_len', 'start_time', 'end_time', 'call_type', 'in_out']
test_sms = pd.read_csv(ROOT_PATH + 'data/input/test_a/sms_test_a.txt', header=None, sep='\t')
test_sms.columns = ['uid', 'opp_num', 'opp_head', 'opp_len', 'start_time', 'in_out']
test_wa = pd.read_csv(ROOT_PATH + 'data/input/test_a/wa_test_a.txt', header=None, sep='\t')
test_wa.columns = ['uid', 'wa_name', 'visit_cnt', 'visit_dura', 'up_flow', 'down_flow', 'wa_type', 'date']

In [9]:
# Stack train on top of test for each table, renumbering rows 0..n-1, so that
# features are computed once over both sets. `test` has no label column, so
# its rows carry NaN labels in `df`.
df = pd.concat([train, test], ignore_index=True)
df_voice = pd.concat([train_voice, test_voice], ignore_index=True)
df_sms = pd.concat([train_sms, test_sms], ignore_index=True)
df_wa = pd.concat([train_wa, test_wa], ignore_index=True)

In [None]:
########################################### Preprocess ###########################################

In [11]:
# Back up the combined train+test frame so the feature-engineering section can be restarted from it.
df_copy = df.copy()

In [None]:
########################################### Feature engineering ###########################################

In [25]:
# Reset: restore the untouched backup and empty the feature list, so the
# feature cells below can be re-run without duplicating columns.
df = df_copy.copy()
predictors = []

In [26]:
# Number of records per uid in each of the voice / sms / wa tables.
for feat_table, feat_name in [(df_voice, 'count_gb_uid_in_voice'),
                              (df_sms, 'count_gb_uid_in_sms'),
                              (df_wa, 'count_gb_uid_in_wa')]:
    df, predictors_tmp = merge_feat_count(df, feat_table, ['uid'], feat_name)
    predictors += predictors_tmp

In [27]:
# Per-uid record counts broken down by the values of a categorical column;
# each entry is (source table, categorical column, prefix of the new columns).
# The voice in/out prefix no longer carries a trailing underscore, so its
# columns are named 'voice_in_out_<value>' like every other feature group
# instead of 'voice_in_out__<value>'.
for feat_table, category_col, feat_prefix in [(df_voice, 'opp_len', 'voice_opp_len'),
                                              (df_voice, 'call_type', 'voice_call_type'),
                                              (df_voice, 'in_out', 'voice_in_out'),
                                              (df_sms, 'opp_len', 'sms_opp_len'),
                                              (df_sms, 'in_out', 'sms_in_out'),
                                              (df_wa, 'wa_type', 'wa_type'),
                                              (df_wa, 'date', 'wa_date')]:
    df, predictors_tmp = merge_feat_onehot_count(df, feat_table, ['uid', category_col], feat_prefix)
    predictors += predictors_tmp

In [28]:
# Number of distinct values per uid; each entry is
# (source table, column whose distinct values are counted, feature name).
for feat_table, value_col, feat_name in [(df_voice, 'opp_num', 'nunique_oppNum_gb_uid_in_voice'),
                                         (df_voice, 'opp_head', 'nunique_oppHead_gb_uid_in_voice'),
                                         (df_sms, 'opp_num', 'nunique_oppNum_gb_uid_in_sms'),
                                         (df_sms, 'opp_head', 'nunique_oppHead_gb_uid_in_sms'),
                                         (df_wa, 'wa_name', 'nunique_waName_gb_uid_in_wa')]:
    df, predictors_tmp = merge_feat_nunique(df, feat_table, ['uid'], value_col, feat_name)
    predictors += predictors_tmp

In [29]:
# Per-uid min / max / mean of the numeric wa columns. Features are appended
# in the order min, max, mean for each column in turn.
col_list = ['visit_cnt', 'visit_dura', 'up_flow', 'down_flow']
for i in col_list:
    for agg_name, merge_fn in [('min', merge_feat_min),
                               ('max', merge_feat_max),
                               ('mean', merge_feat_mean)]:
        df, predictors_tmp = merge_fn(df, df_wa, ['uid'], i, '%s_%s_gb_uid_in_wa' % (agg_name, i))
        predictors += predictors_tmp

In [30]:
# Split the combined frame back into train and test by position: the first
# `train_len` rows are training rows, the rest are test rows. The frame comes
# out of DataFrame.merge, so its index is 0..n-1 and positions equal labels.
train_x = df.iloc[:train_len][predictors]
train_y = df.iloc[:train_len][target]
test_x = df.iloc[train_len:][predictors]

In [None]:
########################################### LightGBM ###########################################

In [31]:
# Cross-validation settings: upper bound on boosting rounds and number of folds.
config_lgb = dict(rounds=10000, folds=5)

# LightGBM parameters for binary classification scored by AUC.
params_lgb = dict(
    boosting_type='gbdt',
    objective='binary',
    metric=set(['auc']),
    num_leaves=63,
    learning_rate=0.06,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    # min_sum_hessian_in_leaf=10,
    verbosity=5,
    # leave one core free for the rest of the machine
    num_threads=cpu_count() - 1,
    seed=7,
)

In [32]:
# Cross-validate; the cell output is (rounds recorded, final mean feval, final mean AUC).
lgb_cv(train_x, train_y, params_lgb, config_lgb['rounds'], config_lgb['folds'])

2018-05-23 14:27:06 Index([u'count_gb_uid_in_voice', u'count_gb_uid_in_sms', u'count_gb_uid_in_wa',
       u'voice_opp_len_3', u'voice_opp_len_5', u'voice_opp_len_6',
       u'voice_opp_len_7', u'voice_opp_len_8', u'voice_opp_len_9',
       u'voice_opp_len_10',
       ...
       u'mean_visit_cnt_gb_uid_in_wa', u'min_visit_dura_gb_uid_in_wa',
       u'max_visit_dura_gb_uid_in_wa', u'mean_visit_dura_gb_uid_in_wa',
       u'min_up_flow_gb_uid_in_wa', u'max_up_flow_gb_uid_in_wa',
       u'mean_up_flow_gb_uid_in_wa', u'min_down_flow_gb_uid_in_wa',
       u'max_down_flow_gb_uid_in_wa', u'mean_down_flow_gb_uid_in_wa'],
      dtype='object', length=114)
2018-05-23 14:27:06 run cv: round: 10000
[5]	cv_agg's feval: 0.742857 + 0.00647818	cv_agg's auc: 0.885253 + 0.0068806
[10]	cv_agg's feval: 0.751567 + 0.00811904	cv_agg's auc: 0.895535 + 0.00534308
[15]	cv_agg's feval: 0.754651 + 0.00758081	cv_agg's auc: 0.898279 + 0.00518307
[20]	cv_agg's feval: 0.756208 + 0.00968548	cv_agg's auc: 0.899436 + 0.

(88, 0.77428177607176119, 0.90982089004040212)

In [33]:
# Train on all training rows for a fixed 90 rounds and score the test rows.
# NOTE(review): 90 was presumably picked from the CV result above (88 rounds) -- confirm when re-running.
model_lgb, pred_lgb = lgb_train_predict(train_x, train_y, test_x, params_lgb, 90)

2018-05-23 14:27:29 Index([u'count_gb_uid_in_voice', u'count_gb_uid_in_sms', u'count_gb_uid_in_wa',
       u'voice_opp_len_3', u'voice_opp_len_5', u'voice_opp_len_6',
       u'voice_opp_len_7', u'voice_opp_len_8', u'voice_opp_len_9',
       u'voice_opp_len_10',
       ...
       u'mean_visit_cnt_gb_uid_in_wa', u'min_visit_dura_gb_uid_in_wa',
       u'max_visit_dura_gb_uid_in_wa', u'mean_visit_dura_gb_uid_in_wa',
       u'min_up_flow_gb_uid_in_wa', u'max_up_flow_gb_uid_in_wa',
       u'mean_up_flow_gb_uid_in_wa', u'min_down_flow_gb_uid_in_wa',
       u'max_down_flow_gb_uid_in_wa', u'mean_down_flow_gb_uid_in_wa'],
      dtype='object', length=114)
[5]	training's auc: 0.957986	training's feval: 0.865289
[10]	training's auc: 0.967644	training's feval: 0.88754
[15]	training's auc: 0.972187	training's feval: 0.902914
[20]	training's auc: 0.976475	training's feval: 0.914456
[25]	training's auc: 0.982036	training's feval: 0.92596
[30]	training's auc: 0.985603	training's feval: 0.936038
[35]	tr

In [34]:
# Write the predictions twice: once under a descriptive, dated file name and once as 'submission'.
# NOTE(review): 7742 / 9098 / 90 look like the CV feval (0.7742), CV AUC (0.9098) and the
# round count typed in by hand -- confirm and keep them in sync with the actual run.
result = store_result(test.uid, pred_lgb, threshold, '20180523-lgb-%d-%d(r%d)' % (7742, 9098, 90))
result = store_result(test.uid, pred_lgb, threshold, 'submission')

In [36]:
# Feature importances of the trained model, most important first, saved as CSV.
imp = pd.DataFrame({
    'feature': train_x.columns.values,
    'importance': list(model_lgb.feature_importance()),
}).sort_values(by='importance', ascending=False)
imp.to_csv(ROOT_PATH + 'data/output/feat_imp/imp-20180523-%d-%d(r%d).csv' % (7742, 9098, 90),
           index=False)