In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from tqdm import tqdm, tqdm_notebook, tnrange
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression,HuberRegressor
from sklearn.linear_model import SGDClassifier,PassiveAggressiveClassifier,RidgeClassifier
from sklearn.naive_bayes import BernoulliNB,MultinomialNB
from sklearn.svm import LinearSVC
from scipy.sparse import hstack,vstack
import datetime
import warnings
warnings.filterwarnings("ignore")

# Data Loading

In [2]:
# Read the raw order-detail table, parsing the three timestamp columns into datetimes while loading.
datetime_columns = ['order_pay_time','goods_list_time','goods_delist_time']
train = pd.read_csv('../data/round1_diac2019_train.csv', parse_dates=datetime_columns)
# The last expression shows the column index as the cell output.
train.columns

Index(['order_detail_id', 'order_id', 'order_total_num', 'order_amount',
       'order_total_payment', 'order_total_discount', 'order_pay_time',
       'order_status', 'order_count', 'is_customer_rate',
       'order_detail_status', 'order_detail_goods_num', 'order_detail_amount',
       'order_detail_payment', 'order_detail_discount', 'customer_province',
       'customer_city', 'member_id', 'customer_id', 'customer_gender',
       'member_status', 'is_member_actived', 'goods_id', 'goods_price',
       'goods_status', 'goods_has_discount', 'goods_list_time',
       'goods_delist_time'],
      dtype='object')

In [3]:
# Time-based split on the order payment time:
#   train_last  - orders paid up to and including 2013-07-03 (feature window behind last_data)
#   train_label - orders paid from 2013-07-04 onwards (source of the target in generate_label)
#   train_all   - orders paid up to and including 2013-12-31 (feature window behind all_data)
# The timestamps are compared directly rather than converting every row to a date string first;
# the boundaries are the same calendar days. Rows with a missing pay time now fall into none of
# the splits (the string comparison used to push the text 'NaT' into train_label).
label_start = pd.Timestamp('2013-07-04')
all_end = pd.Timestamp('2014-01-01')
pay_time = train['order_pay_time']

# .copy() makes these two frames independent of `train`, because later cells add columns to them.
train_last = train[pay_time < label_start].copy()
train_label = train[pay_time >= label_start]

train_all = train[pay_time < all_end].copy()
print('train_last shape:',train_last.shape,'train_label shape:',train_label.shape,'train_all shape:',train_all.shape)

# One row per non-null customer_id in each feature window; feature cells merge their columns onto these.
last_data = pd.DataFrame(train_last[['customer_id']]).drop_duplicates(['customer_id']).dropna()
all_data = pd.DataFrame(train_all[['customer_id']]).drop_duplicates(['customer_id']).dropna()

train_last shape: (539577, 28) train_label shape: (861254, 28) train_all shape: (1400831, 28)


In [4]:
# Calendar parts of the payment timestamp, added to both feature windows.
# Each frame derives the columns from ITS OWN order_pay_time: previously train_all was filled from
# train_last, which (through index alignment) left NaN on every train_all row not present in train_last.
for frame in (train_last, train_all):
    frame['order_pay_month'] = frame['order_pay_time'].dt.month
    frame['order_pay_dayofweek'] = frame['order_pay_time'].dt.dayofweek
    frame['order_pay_day'] = frame['order_pay_time'].dt.day

# Feature Engineering 

In [5]:
%%time
# Per-customer profile attributes and rating/discount counts, built once per feature window.
# idx 0 -> train_last, merged into last_data; idx 1 -> train_all, merged into all_data.
for idx,data in enumerate([train_last,train_all]):
    customer_all = pd.DataFrame(data[['customer_id']]).drop_duplicates(['customer_id']).dropna()
    # Order rows chronologically within each customer so that .last() below returns the latest value.
    # sort_values returns a new frame, so the helper column added next does not touch the source frame.
    data = data.sort_values(by=['customer_id','order_pay_time'])
    data['count'] = 1
    by_customer = data.groupby(['customer_id'])

    # Number of rows per customer.
    # .count().rename(...) replaces SeriesGroupBy.agg({'new_name': 'count'}): that dict-renaming
    # form was deprecated and then removed in pandas 1.0, where it raises SpecificationError.
    tmp = by_customer['count'].count().rename('customer_counts').reset_index()
    customer_all = customer_all.merge(tmp,on=['customer_id'],how='left')

    # Latest recorded value of each profile attribute.
    for column in ['customer_province','customer_city','member_status','is_member_actived']:
        tmp = by_customer[column].last().reset_index()
        customer_all = customer_all.merge(tmp,on=['customer_id'],how='left')

    # Row counts under specific conditions; a customer with no matching row gets NaN from the left merge.
    conditional_counts = [
        ('is_customer_rate_0', data['is_customer_rate']==0),
        ('is_customer_rate_1', data['is_customer_rate']==1),
        ('is_customer_have_discount_count', (data['is_member_actived']==1) & (data['goods_has_discount']==1)),
    ]
    for name,mask in conditional_counts:
        tmp = data[mask].groupby(['customer_id'])['count'].count().rename(name).reset_index()
        customer_all = customer_all.merge(tmp,on=['customer_id'],how='left')

    # NOTE: re-running this cell merges the same columns again (pandas then adds _x/_y suffixes);
    # rebuild last_data/all_data first if the cell has to be executed a second time.
    if idx == 0:
        last_data = last_data.merge(customer_all, on='customer_id', how='left')
    else:
        all_data = all_data.merge(customer_all, on='customer_id', how='left')

CPU times: user 10.5 s, sys: 1.79 s, total: 12.3 s
Wall time: 12.3 s


In [6]:
%%time
# Per-customer goods statistics for both feature windows (idx 0 -> last_data, idx 1 -> all_data).
for idx,data in enumerate([train_last,train_all]):
    customer_all = pd.DataFrame(data[['customer_id']]).drop_duplicates(['customer_id']).dropna()

    # Max / min / mean goods price per customer.
    # A list of function names plus rename(columns=...) replaces agg({'new_name': 'func'}):
    # that dict-renaming form was removed in pandas 1.0 and raises SpecificationError there.
    price_names = {'max':'goods_price_max','min':'goods_price_min','mean':'goods_price_mean'}
    tmp = data.groupby(['customer_id'])['goods_price'].agg(list(price_names)).rename(columns=price_names).reset_index()
    customer_all = customer_all.merge(tmp,on=['customer_id'],how='left')

    # Helper column of ones; as before it is written onto the source frame itself.
    data['count'] = 1
    # Row counts under specific conditions; a customer with no matching row gets NaN from the left merge.
    conditional_counts = [
        ('goods_has_discount_counts', data['goods_has_discount']==1),
        ('goods_has_not_discount_counts', data['goods_has_discount']==0),
        ('goods_status_1', data['goods_status']==1),
    ]
    for name,mask in conditional_counts:
        tmp = data[mask].groupby(['customer_id'])['count'].count().rename(name).reset_index()
        customer_all = customer_all.merge(tmp,on=['customer_id'],how='left')

    if idx == 0:
        last_data = last_data.merge(customer_all, on='customer_id', how='left')
    else:
        all_data = all_data.merge(customer_all, on='customer_id', how='left')

CPU times: user 5.22 s, sys: 768 ms, total: 5.99 s
Wall time: 5.99 s


In [7]:
%%time
# Per-customer aggregates of the order-level columns (idx 0 -> last_data, idx 1 -> all_data).
for idx,data in enumerate([train_last,train_all]):
    customer_all = pd.DataFrame(data[['customer_id']]).drop_duplicates(['customer_id']).dropna()

    # source column -> {aggregation function: output column name}
    # NOTE(review): these sums run over every row of the frame; if an order spans several rows its
    # order-level amounts would be counted once per row - confirm that this is intended.
    aggregations = [
        ('order_amount', {'sum':'order_amount_sum'}),
        ('order_total_payment', {'sum':'order_total_payment_sum','count':'order_total_payment_count'}),
        ('order_total_discount', {'sum':'order_total_discount_sum'}),
        ('order_status', {'max':'order_status_max'}),
    ]
    for column,names in aggregations:
        # A list of function names plus rename(columns=...) replaces agg({'new_name': 'func'}),
        # which pandas 1.0 removed. reset_index() turns customer_id into a regular column so the
        # merge key is explicit instead of relying on an index-level name match.
        tmp = data.groupby(['customer_id'])[column].agg(list(names)).rename(columns=names).reset_index()
        customer_all = customer_all.merge(tmp,on=['customer_id'],how='left')

    # Number of rows with goods_status == 2 (NaN for customers with none).
    # The helper column of ones is written onto the source frame itself, as before.
    data['count'] = 1
    tmp = data[data['goods_status']==2].groupby(['customer_id'])['count'].count().rename('goods_status_2').reset_index()
    customer_all = customer_all.merge(tmp,on=['customer_id'],how='left')

    if idx == 0:
        last_data = last_data.merge(customer_all, on='customer_id', how='left')
    else:
        all_data = all_data.merge(customer_all, on='customer_id', how='left')

CPU times: user 7.11 s, sys: 536 ms, total: 7.65 s
Wall time: 8.87 s


In [8]:
%%time
# Per-customer aggregates of the order-detail columns (idx 0 -> last_data, idx 1 -> all_data).
for idx,data in enumerate([train_last,train_all]):
    customer_all = pd.DataFrame(data[['customer_id']]).drop_duplicates(['customer_id']).dropna()

    # source column -> {aggregation function: output column name}
    aggregations = [
        ('order_detail_amount', {'sum':'order_detail_amount_sum'}),
        ('order_detail_payment', {'sum':'order_detail_payment_sum','count':'order_detail_payment_count'}),
        ('order_detail_discount', {'sum':'order_detail_discount_sum'}),
        ('order_detail_status', {'max':'order_detail_status_max'}),
    ]
    for column,names in aggregations:
        # A list of function names plus rename(columns=...) replaces agg({'new_name': 'func'}),
        # which pandas 1.0 removed. reset_index() turns customer_id into a regular column so the
        # merge key is explicit instead of relying on an index-level name match.
        tmp = data.groupby(['customer_id'])[column].agg(list(names)).rename(columns=names).reset_index()
        customer_all = customer_all.merge(tmp,on=['customer_id'],how='left')

    if idx == 0:
        last_data = last_data.merge(customer_all, on='customer_id', how='left')
    else:
        all_data = all_data.merge(customer_all, on='customer_id', how='left')

CPU times: user 6 s, sys: 216 ms, total: 6.22 s
Wall time: 6.21 s


In [14]:
# %%time
# for idx,data in enumerate([train_last,train_all]):
#     customer_all = pd.DataFrame(data[['customer_id']]).drop_duplicates(['customer_id']).dropna()
    
#     tmp = data.groupby(['customer_id'])['goods_id'].apply(lambda x:','.join(x.astype(str))).reset_index()
#     tmp.columns = ['customer_id','customer_goods_ids']
#     customer_all = customer_all.merge(tmp, on='customer_id', how='left')
    
#     X_seller = TfidfVectorizer(token_pattern='[0-9]+',binary=True).fit_transform(customer_all['customer_goods_ids'].fillna('0'))
#     seller_svd = TruncatedSVD(n_components=30,n_iter=30,random_state=2019).fit_transform(X_seller)
#     seller_svd_df = pd.DataFrame(seller_svd, columns=['customer_goods_svd_{}'.format(i) for i in range(1,31)])
#     customer_all = pd.concat([customer_all,seller_svd_df], axis=1)

#     if idx == 0:
#         last_data = last_data.merge(customer_all, on='customer_id', how='left')
#     else:
#         all_data = all_data.merge(customer_all, on='customer_id', how='left')

CPU times: user 17min 53s, sys: 40 s, total: 18min 33s
Wall time: 5min 22s


In [9]:
%%time
# First and last day-of-year on which each customer paid (idx 0 -> last_data, idx 1 -> all_data).
for idx,data in enumerate([train_last,train_all]):
    customer_all = pd.DataFrame(data[['customer_id']]).drop_duplicates(['customer_id']).dropna()
    # NOTE(review): day-of-year restarts at 1 every January. If the orders span more than one
    # calendar year, max/min (and the gap derived from them later) no longer reflect elapsed
    # time - confirm the date range of the data.
    data['order_pay_dayofyear'] = data['order_pay_time'].dt.dayofyear

    # A list of function names plus rename(columns=...) replaces agg({'new_name': 'func'}),
    # which pandas 1.0 removed; reset_index() makes customer_id an explicit merge column.
    names = {'max':'order_pay_dayofyear_max','min':'order_pay_dayofyear_min'}
    tmp = data.groupby(['customer_id'])['order_pay_dayofyear'].agg(list(names)).rename(columns=names).reset_index()
    customer_all = customer_all.merge(tmp,on=['customer_id'],how='left')

    if idx == 0:
        last_data = last_data.merge(customer_all, on='customer_id', how='left')
    else:
        all_data = all_data.merge(customer_all, on='customer_id', how='left')

CPU times: user 2.75 s, sys: 352 ms, total: 3.1 s
Wall time: 3.1 s


In [16]:
# Display the current column index of train_last (this includes helper columns added by earlier cells).
train_last.columns

Index(['order_detail_id', 'order_id', 'order_total_num', 'order_amount',
       'order_total_payment', 'order_total_discount', 'order_pay_time',
       'order_status', 'order_count', 'is_customer_rate',
       'order_detail_status', 'order_detail_goods_num', 'order_detail_amount',
       'order_detail_payment', 'order_detail_discount', 'customer_province',
       'customer_city', 'member_id', 'customer_id', 'customer_gender',
       'member_status', 'is_member_actived', 'goods_id', 'goods_price',
       'goods_status', 'goods_has_discount', 'goods_list_time',
       'goods_delist_time', 'order_pay_month', 'order_pay_dayofweek',
       'order_pay_day', 'count'],
      dtype='object')

In [10]:
# Integer-encode the two categorical location columns (missing values become the category 'None').
# One encoder per column is fitted on the values of BOTH frames, so a given city/province receives
# the same integer code in last_data (used for training) and all_data (used for prediction).
# Fitting a separate encoder on each frame numbers the categories by that frame's own sorted value
# set, so the same place could end up with different codes at train and predict time.
for column in ['customer_city', 'customer_province']:
    encoder = LabelEncoder().fit(pd.concat([last_data[column], all_data[column]]).fillna('None'))
    for data in [last_data, all_data]:
        data[column] = encoder.transform(data[column].fillna('None'))

In [11]:
def generate_label(data,label):
    """Add a binary ``label`` column to ``data`` and return it.

    A row gets label 1 when its ``customer_id`` occurs anywhere in
    ``label['customer_id']`` and 0 otherwise.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``customer_id`` column. It is modified in place
        (the ``label`` column is added or overwritten).
    label : pandas.DataFrame
        Frame whose ``customer_id`` column lists the positive customers.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, now carrying an int64 ``label`` column.
    """
    # Assign the whole column in one step. The previous chained form data['label'][mask] = 1
    # writes through an intermediate Series, which is not guaranteed to update `data` and never
    # does when pandas copy-on-write is enabled.
    positive_ids = label['customer_id'].unique()
    data['label'] = data['customer_id'].isin(positive_ids).astype('int64')

    return data

# Label every customer of the training feature window by whether they also appear in train_label.
last_data = generate_label(last_data,train_label)

In [12]:
# Span between each customer's first and last payment day-of-year, added to both feature tables.
for frame in (last_data, all_data):
    latest_day = frame['order_pay_dayofyear_max']
    earliest_day = frame['order_pay_dayofyear_min']
    frame['order_pay_dayofyear_gap'] = latest_day - earliest_day

# Model

In [58]:
# The feature list assembled in the following model cell still selects 'order_pay_dayofyear_max'
# and 'order_pay_dayofyear_min' (the recorded run reports 28 features, which includes both).
# Dropping them here makes `last_data[feature]` raise a KeyError when the notebook is executed
# top to bottom, so the in-place drop is replaced by a check that both columns are present.
dayofyear_columns = ['order_pay_dayofyear_max','order_pay_dayofyear_min']
for frame_name, frame in [('last_data', last_data), ('all_data', all_data)]:
    missing = [column for column in dayofyear_columns if column not in frame.columns]
    assert not missing, '{} is missing columns required by the feature list: {}'.format(frame_name, missing)

In [13]:
# Feature names grouped by the table level they were aggregated from.
origin_feat = [
    'customer_counts',
    'goods_price_max', 'goods_price_min', 'goods_price_mean',
    'member_status', 'is_member_actived',
    'customer_city', 'customer_province',
    'goods_has_discount_counts', 'goods_has_not_discount_counts',
    'goods_status_1', 'goods_status_2',
    'is_customer_rate_0', 'is_customer_rate_1',
    'is_customer_have_discount_count',
]

main_order_feat = [
    'order_total_payment_sum', 'order_total_payment_count',
    'order_total_discount_sum',
    'order_amount_sum',
    'order_status_max',
    'order_pay_dayofyear_max', 'order_pay_dayofyear_min',
    'order_pay_dayofyear_gap',
]

detail_order_feat = [
    'order_detail_payment_sum', 'order_detail_payment_count',
    'order_detail_discount_sum',
    'order_detail_amount_sum',
    'order_detail_status_max',
]

# Full model input: the three groups concatenated in this order.
feature = [*origin_feat, *main_order_feat, *detail_order_feat]

# Training matrix and target come from last_data; X_all is the matrix that gets scored later.
X, y = last_data[feature], last_data['label']
X_all = all_data[feature]

# Hold out 10% of the labelled customers for validation, keeping the class ratio in both parts.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.1,
    random_state=42,
    stratify=y,
)
print('Feature Length:',len(feature),' Data prepared......')

Feature Length: 28  Data prepared......


In [17]:
def re_logloss(labels,preds):
    """Binary log loss in which the positive-class term is weighted by 3.4.

    ``preds`` is clipped to [1e-10, 1 - 1e-10] before taking logarithms.
    Both arguments must support element-wise arithmetic (e.g. numpy arrays).

    Returns the tuple ``('re_logloss', loss, False)``. That shape looks like it
    is meant for a custom LightGBM eval metric (name, value, is_higher_better);
    the function is not passed to the fit call in this notebook.
    """
    positive_weight = 3.4
    clipped = np.clip(preds, 1e-10, 1-1e-10)
    positive_term = labels * np.log(clipped) * positive_weight
    negative_term = (1 - labels) * np.log(1-clipped)
    loss = -1/len(labels) * np.sum(positive_term + negative_term)
    return 're_logloss',loss,False

# Gradient-boosted binary classifier; hyper-parameters are unchanged.
lgb_model = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=64, reg_alpha=0.1, reg_lambda=1.0,
                                max_depth=-1, n_estimators=10000, objective='binary', metrics='None', 
                                bagging_fraction=0.8, is_unbalance=False, bagging_freq=5, min_child_samples=80, 
                                feature_fraction=0.8, learning_rate=0.1, random_state=42, n_jobs=8,
                                )

eval_set = [(X_valid, y_valid)]
# Early stopping (100 rounds without improvement) and logging (every 50 rounds) are set up as
# callbacks: the `early_stopping_rounds=` and `verbose=` keyword arguments of fit() were removed
# in LightGBM 4.0 and raise a TypeError there.
# The logging callback was called `print_evaluation` in older LightGBM releases, hence the fallback.
log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
lgb_model.fit(X_train, y_train, eval_set=eval_set, eval_metric='logloss',
              callbacks=[lgb.early_stopping(stopping_rounds=100), log_evaluation(period=50)])
# Class probabilities for every customer in all_data; column 1 is the positive class.
pred = lgb_model.predict_proba(X_all) #0.3958

Training until validation scores don't improve for 100 rounds.
[50]	valid_0's binary_logloss: 0.395892
[100]	valid_0's binary_logloss: 0.39609
[150]	valid_0's binary_logloss: 0.396465
Early stopping, best iteration is:
[64]	valid_0's binary_logloss: 0.395797


In [16]:
# Attach the positive-class probability to each scored customer.
res = all_data[['customer_id']].assign(result=pred[:,1])

# One row per non-null customer_id seen anywhere in the raw data, joined to its score
# and ordered by customer_id.
data = (
    pd.DataFrame(train[['customer_id']])
    .drop_duplicates(['customer_id'])
    .dropna()
    .merge(res,on=['customer_id'],how='left')
    .sort_values(['customer_id'])
)
data['customer_id'] = data['customer_id'].astype('int64')
# Customers without a score default to 0.
data['result'] = data['result'].fillna(0)

# Write the two-column submission file.
result = data[['customer_id','result']]
result.to_csv('../out/round1_diac2019_test.csv', index=False)