# Load Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the raw CSV tables.
poi_detail = pd.read_csv('ctr_data/table_poi_detail.csv')
deal_detail = pd.read_csv('ctr_data/table_deal_detail.csv')

# Requests and users are reduced to one row per key (the first occurrence is kept).
request_detail = (
    pd.read_csv('ctr_data/table_request_detail.csv')
    .drop_duplicates(subset='request_id', keep='first')
)
user_detail = (
    pd.read_csv('ctr_data/table_uuid_detail.csv')
    .drop_duplicates(subset='uuid', keep='first')
)

# Impression/click logs for the train and test periods.
train = pd.read_csv('ctr_data/table_impr_click_action_train.csv')
test = pd.read_csv('ctr_data/table_impr_click_action_test.csv')

In [3]:
# Tag every row with its origin (1 = train file, 0 = test file) and stack both
# sets so that merging and feature engineering are applied to them together.
train = train.assign(train_test=1)
test = test.assign(train_test=0)
train_test = pd.concat([train, test])

# Merge 

In [4]:
# User attributes are keyed by uuid, which reaches the impression rows only through
# the request table, so users are joined onto requests first.
request_user = pd.merge(request_detail, user_detail, how='left', on='uuid')

# Attach request (+ user) attributes to every impression row.
train_test = pd.merge(train_test, request_user, how='left', on='request_id')

# Attach POI attributes; column names that exist on both sides get the
# _req (left side) / _poi (POI table) suffix.
train_test = pd.merge(
    train_test,
    poi_detail,
    how='left',
    on='poi_id',
    suffixes=['_req', '_poi'],
)

In [5]:
# Attach deal attributes by POI.
# NOTE(review): if deal_detail holds several rows per poi_id, this left join repeats
# the impression row once per deal - confirm that is intended for the training rows.
train_test = pd.merge(train_test, deal_detail, how='left', on='poi_id')

In [6]:
# Impute missing values.
# The per-column fills go through one DataFrame-level fillna instead of
# `train_test[col].fillna(..., inplace=True)`: that form modifies a temporary
# Series and leaves the frame untouched when pandas Copy-on-Write is active.
train_test = train_test.fillna({
    'device_type': 'NA',                          # explicit "unknown device" category
    'avg_price': train_test['avg_price'].mean(),  # column mean
    'age': train_test['age'].mean(),              # column mean
    'request_time': '00',                         # parsed later as hour 0
})

# Everything that is still missing becomes -1.0. This also applies to `action`:
# the test-selection cell further down filters on action == -1.
train_test = train_test.fillna(-1.0)

In [13]:
# Inspect the column names of the merged and imputed frame.
train_test.columns

Index(['ID', 'action', 'poi_id', 'pos', 'request_id', 'time', 'train_test',
       'uuid', 'cate_id', 'request_time', 'latitude_req', 'longitude_req',
       'device_type', 'gender', 'age', 'job', 'cate_level1', 'cate_level2',
       'cate_level3', 'area_id', 'avg_price', 'poi_star', 'longitude_poi',
       'latitude_poi', 'deal_id', 'price', 'discount_price'],
      dtype='object')

# Feature Engineering

In [13]:
# Preprocess
from sklearn import preprocessing

encoder = preprocessing.LabelEncoder()
standard = preprocessing.StandardScaler()

# Integer-encode the device type. The encoded array is assigned straight to the
# column: this replaces the column (and its object dtype) with an integer column
# and does not rely on the frame index lining up with a freshly built Series.
train_test['device_type'] = encoder.fit_transform(train_test['device_type'])

# Standardise the numeric columns one at a time; fit_transform re-fits the scaler
# for each column, so `standard` ends up fitted on the last one ('poi_star').
for numeric_col in ['age', 'avg_price', 'poi_star']:
    column_values = train_test[numeric_col].values.reshape(-1, 1)
    train_test[numeric_col] = standard.fit_transform(column_values).ravel()

In [None]:
# Hour of the request: the first two characters of `request_time`
# (missing request times were filled with '00' during imputation).
train_test['hour'] = train_test['request_time'].map(lambda stamp: int(stamp[:2]))

# `week`: the number formed by the last two characters of `time`, modulo 7.
# NOTE(review): presumably those characters are the day of the month, which makes
# this a weekday proxy only within a single month - confirm against the data.
train_test['week'] = train_test['time'].map(lambda stamp: int(stamp[-2:]) % 7)

In [None]:
# Discretization by K-means (disabled).
# Everything below sits inside a triple-quoted string, so none of it executes;
# the cell only evaluates (and echoes) the string. The snippet would cluster
# `avg_price` into 20 groups and store the cluster id as `avg_price_type`.
# NOTE(review): recent scikit-learn releases no longer accept `n_jobs` in KMeans;
# drop that argument before re-enabling this code.
'''from sklearn.cluster import KMeans
 
kmodel = KMeans(n_clusters = 20, n_jobs = 4) 
kmodel.fit(np.array(train_test['avg_price']).reshape((len(train_test['avg_price']), 1)))
kmodel_pred = kmodel.predict(np.array(train_test['avg_price']).reshape((len(train_test['avg_price']), 1)))
train_test.loc[:, 'avg_price_type'] = kmodel_pred'''

In [None]:
# Inspect the column names again, now including the engineered features.
train_test.columns

In [None]:
# Columns handed to the model, in this exact order.
# Note that `cate_level1` exists in the merged frame but is not listed here.
needed_columns = [
    'poi_id',
    'uuid',
    'cate_id',
    'hour',
    'longitude_req',
    'latitude_req',
    'device_type',
    'gender',
    'age',
    'job',
    'longitude_poi',
    'latitude_poi',
    'cate_level2',
    'cate_level3',
    'area_id',
    'week',
    'deal_id',
    'price',
    'discount_price',
    'avg_price',
    'poi_star',
]

# Train

In [None]:
from sklearn import utils

def undersampling(train, undersampling_rate, random_state=None):
    """Down-sample the ``action == 0`` rows and keep every ``action >= 1`` row.

    Rows whose ``action`` is neither 0 nor >= 1 (for example negative placeholder
    values) are not part of the result.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with a numeric ``action`` column. Its index may contain duplicates;
        rows are selected with boolean masks, not by index label.
    undersampling_rate : float
        Fraction of the ``action == 0`` rows to keep, between 0 and 1.
    random_state : int, numpy.random.RandomState or None, optional
        Seed for the random draw. ``None`` (the default) yields a different
        sample on every call.

    Returns
    -------
    pandas.DataFrame
        The sampled ``action == 0`` rows (in random order) followed by all
        ``action >= 1`` rows in their original order, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``undersampling_rate`` lies outside [0, 1].
    """
    if not 0 <= undersampling_rate <= 1:
        raise ValueError(
            'undersampling_rate must be between 0 and 1, got {}'.format(undersampling_rate)
        )

    # Split by target value with boolean masks.
    negatives = train[train.action == 0]
    positives = train[train.action >= 1]

    # Number of action = 0 rows that survive the undersampling.
    undersampled_nb_0 = int(undersampling_rate * len(negatives))
    print('Rate to undersample records with action = 0: {}'.format(undersampling_rate))
    print('Number of records with action = 0 after undersampling: {}'.format(undersampled_nb_0))

    # Draw the negatives without replacement.
    sampled_negatives = negatives.sample(n=undersampled_nb_0, random_state=random_state)

    return pd.concat([sampled_negatives, positives]).reset_index(drop=True)

In [None]:
# Keep 5% of the action = 0 rows. The same 0.05 is hard-coded again in the
# prediction re-calibration cell; the two values have to stay in sync.
train_test_undersampling = undersampling(
    train=train_test,
    undersampling_rate=0.05,
)

In [None]:
# Features: the selected model columns.
x = train_test_undersampling.loc[:, needed_columns]
# Target: 1 when `action` is at least 1, otherwise 0.
y = train_test_undersampling['action'].ge(1).astype(int)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the undersampled data for validation (no explicit split size
# is given, so the library default applies); random_state pins the shuffle.
X_train, X_val, y_train, y_val = train_test_split(
    x,
    y,
    random_state=0,
)

## LightGBM

In [None]:
import lightgbm as lgb

# Wrap the training and validation splits as LightGBM datasets (features + label).
dtrain = lgb.Dataset(data=X_train, label=y_train)
dval = lgb.Dataset(data=X_val, label=y_val)

In [None]:
# LightGBM training parameters for the binary click model.
params = dict(
    metric='auc',                # evaluation metric reported on the validation set
    learning_rate=0.01,
    num_leaves=127,
    objective='binary',
    n_jobs=6,                    # worker threads
    early_stopping_round=100,    # early-stopping patience, in boosting rounds
    feature_fraction=0.8,        # fraction of features sampled per tree
    feature_fraction_seed=2019,  # seed for that feature sampling
)

In [None]:
# Train for at most 1000 boosting rounds, evaluating on the validation set and
# logging the metric every 100 rounds.
# The `verbose_eval` argument no longer exists in LightGBM 4.x, where the logging
# period is set through the log_evaluation callback; older releases without that
# callback still take `verbose_eval`.
if hasattr(lgb, 'log_evaluation'):
    clf = lgb.train(
        params,
        dtrain,
        num_boost_round=1000,
        valid_sets=[dval],
        callbacks=[lgb.log_evaluation(period=100)],
    )
else:
    clf = lgb.train(
        params,
        dtrain,
        num_boost_round=1000,
        valid_sets=[dval],
        verbose_eval=100,
    )

### LightGBM Parameter tuning

In [None]:
# Plot the 20 most important features of the trained booster.
lgb.plot_importance(booster=clf, max_num_features=20)

In [None]:
# Persist the trained booster as a text model file (the `model/` directory must exist).
clf.save_model(filename='model/model3.txt')

# Test (undersampling rate w = 0.05)

In [None]:
# Rows whose `action` was filled with the -1.0 placeholder during imputation,
# i.e. the rows to predict. The explicit copy makes `a` an independent frame, so
# adding the prediction column in the next cell is not an assignment on a slice
# of `train_test`.
a = train_test[train_test['action'] == -1].copy()

In [None]:
# Score the selected rows with the trained booster. Despite the `xgb` in the
# variable names, `clf` is the LightGBM model trained above.
preds_xgb_test = clf.predict(a[needed_columns])
# Undo the effect of negative undersampling on the predicted probability:
# with negatives kept at rate w, a raw score p is mapped to p / (p + (1 - p) / w).
# The 0.05 here must equal the rate passed to undersampling().
preds_xgb_test_re = preds_xgb_test/(preds_xgb_test+(1-preds_xgb_test)/0.05)

In [None]:
# Store the re-calibrated probability next to the rows it belongs to.
a = a.assign(preds_xgb_test_re=preds_xgb_test_re)

In [None]:
# Collapse rows that share (poi_id, request_id) by averaging their numeric columns,
# keeping the groups in order of first appearance (sort=False).
# numeric_only=True is required on pandas >= 2.0, where averaging the string
# columns (e.g. `time`, `request_time`) raises a TypeError; older versions
# dropped such columns silently, so the result is the same there.
a = (
    a.groupby(by=['poi_id', 'request_id'], sort=False)
    .mean(numeric_only=True)
    .reset_index()
)

In [None]:
# Submission frame: one `action` column holding the re-calibrated probability,
# indexed by the row position of `a`, with the index labelled `ID`.
# NOTE(review): this assumes the expected IDs equal the 0..n-1 row positions after
# the groupby - verify against the `ID` column of the test file.
testoutput = (
    a['preds_xgb_test_re']
    .rename('action')
    .to_frame()
    .rename_axis('ID')
)

In [None]:
# Write the submission file (index `ID` + `action`); the `output/` directory must exist.
testoutput.to_csv(path_or_buf='output/testoutputV2.11.csv')

In [None]:
# Sanity check: number of rows (and columns) written to the submission file.
testoutput.shape