In [1]:
import numpy as np
import pandas as pd
import warnings
import os
from sklearn import preprocessing, metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
import lightgbm as lgb
from sklearn.utils import shuffle
import gc
import copy
import matplotlib.pyplot as plt


%matplotlib inline

pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

warnings.filterwarnings('ignore')

In [2]:
df_history_action = pd.read_pickle('./temp/action_history_f.plk')
df_feature = pd.read_pickle('./temp/base_feature_f.plk')
df_courier = pd.read_pickle('./temp/courier.plk')
df_order = pd.read_pickle('./temp/order.plk')
df_distance = pd.read_pickle('./temp/distance.plk')

In [3]:
df_temp = df_history_action.groupby(['group'])['expect_time'].apply(
    lambda x: x.values.tolist()[-1]).reset_index()
df_temp.columns = ['group', 'current_time']
df_feature = df_feature.merge(df_temp, how='left')

df_temp = df_history_action.groupby(['group'])['tracking_id'].apply(
    lambda x: x.values.tolist()[-1]).reset_index()
df_temp.columns = ['group', 'last_tracking_id']
df_feature = df_feature.merge(df_temp, how='left')

df_temp = df_history_action.groupby(['group'])['action_type'].apply(
    lambda x: x.values.tolist()[-1]).reset_index()
df_temp.columns = ['group', 'last_action_type']
df_feature = df_feature.merge(df_temp, how='left')

In [4]:
df_distance = df_distance.rename(columns={'tracking_id': 'last_tracking_id',
                                          'source_type': 'last_action_type', 
                                          'target_tracking_id': 'tracking_id',
                                          'target_type': 'action_type'})
df_feature = df_feature.merge(df_distance.drop(
    ['courier_id', 'wave_index', 'date'], axis=1), how='left')

In [5]:
df_feature = df_feature.merge(
    df_order[['tracking_id', 'weather_grade', 'aoi_id', 'shop_id', 'promise_deliver_time',
              'estimate_pick_time']], how='left')

In [6]:
df_feature = df_feature.merge(df_courier, how='left')

In [7]:
df_sum=df_feature.groupby(['group']).size().reset_index()
df_sum.columns = ['group','unfinished_num_sum']
df_feature = df_feature.merge(df_sum, how='left')

df_order['group']=df_order['date'].astype(
    'str') + df_order['courier_id'].astype('str') + df_order['wave_index'].astype('str')
df_sum=df_order.groupby(['group']).size().reset_index()
df_sum.columns = ['group','all_order_num']
df_feature = df_feature.merge(df_sum, how='left')
df_feature['finish_freq']=df_feature['unfinished_num_sum']/df_feature['all_order_num']/2

In [8]:
df_feature['delta_lng']=df_feature['source_lng']-df_feature['target_lng']
df_feature['delta_lat']=df_feature['source_lat']-df_feature['target_lat']
df_feature['delta_abs_lng']=abs(df_feature['source_lng']-df_feature['target_lng'])
df_feature['delta_abs_lat']=abs(df_feature['source_lat']-df_feature['target_lat'])
df_feature['delta_dis']=(df_feature['delta_lng']**2+df_feature['delta_lat']**2)**0.5

In [9]:
df_feature['lng_max']=0
id_index=df_feature.groupby('group')['delta_lng'].idxmax()
df_feature.loc[id_index,'lng_max']=1

df_feature['lng_min']=0
id_index=df_feature.groupby('group')['delta_lng'].idxmin()
df_feature.loc[id_index,'lng_min']=1

df_feature['lat_max']=0
id_index=df_feature.groupby('group')['delta_lat'].idxmax()
df_feature.loc[id_index,'lat_max']=1

df_feature['lat_min']=0
id_index=df_feature.groupby('group')['delta_lat'].idxmin()
df_feature.loc[id_index,'lat_min']=1

In [10]:
df_feature['latest_grid']=0
id_index=df_feature.groupby('group')['grid_distance'].idxmin()
df_feature.loc[id_index,'latest_grid']=1

In [11]:
df_feature['delta_pick_time']=df_feature['estimate_pick_time']-df_feature['current_time']
df_feature['delta_deliver_time']=df_feature['promise_deliver_time']-df_feature['current_time']
df_feature['current_hour']=pd.to_datetime(df_feature['current_time'].values, utc=True, unit='s').tz_convert(
            "Asia/Shanghai").hour
df_feature['current_hour_bin']='other'
df_feature.loc[df_feature['current_hour'].isin([11,12,13]),'current_hour_bin']='lunch'
df_feature.loc[df_feature['current_hour'].isin([17,18,19]),'current_hour_bin']='dinner'

In [12]:
df_feature['latest_deliver']=0
id_index=df_feature.groupby('group')['delta_deliver_time'].idxmin()
df_feature.loc[id_index,'latest_deliver']=1
del id_index

In [13]:
df_feature.to_pickle('./temp/part1_feature_f.plk')

In [None]:
一些基本特征: 波次,行动种类,待送单数,总订单数,前一行动种类,天气,
距离相关特征: #波次开始位置,当前位置,目标取货位置,目标送达位置; 
            相对位置,相对距离,高德距离;
距离相关特征: 当前时间,取单时间,承诺送达时间,当前小时;
            取单时间-当前时间,承诺送达时间-当前时间;
骑手信息特征: 骑手id,level,speed,max_load

y:expect_time target

无用特征: #骑士id,#目标订单id,#前一订单id,aoi_id,shop_id,date

In [104]:
df_feature_base_name=['wave_index','action_type','unfinished_num_sum','last_action_type','weather_grade']
df_feature_distance_name=['delta_lng','delta_lat','delta_abs_lng','delta_abs_lat','delta_dis','grid_distance',
                         'latest_grid','lat_max','lat_min','lng_min','lng_max']
df_feature_time_name=['delta_pick_time','delta_deliver_time','latest_deliver','current_hour','current_hour_bin']
df_feature_courier_name=['level','speed','max_load']
df_feature_name=df_feature_base_name+df_feature_distance_name+df_feature_time_name+df_feature_courier_name

In [15]:
for f in df_feature.select_dtypes('object'):
    if f not in ['date', 'type','group']:
        print(f)
        lbl = LabelEncoder()
        df_feature[f] = lbl.fit_transform(df_feature[f].astype(str))

action_type
last_action_type
weather_grade
aoi_id
shop_id
current_hour_bin


In [67]:
df_testA = df_feature[df_feature['type'] == 'testA'].copy()
df_test = df_feature[df_feature['type'] == 'testB'].copy()
df_train = df_feature[df_feature['type'] == 'train'].copy()
df_train = shuffle(df_train, random_state=513)

In [68]:
ycol = 'target'
feature_names = df_feature_name
params = {
    'num_leaves':31,
    'n_estimators': 10000,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'is_unbalance':'true',
    'metrics':'group_acc',
    'early_stopping_rounds': 50,
    'num_threads':20,
    'seed':513
}

oof = []
prediction = df_test[['id', 'group']]
prediction['target'] = 0
df_importance_list = []

In [None]:
def wave_group_func(group):
    target_list = group['label'].values.tolist()
    pred_list = group['pred'].values.tolist()
    max_index = pred_list.index(max(pred_list))
    if target_list[max_index] == 1:
        return 1
    else:
        return 0
def group_acc(preds,dtrain):
    label=dtrain.get_label()
    pred_list=pd.DataFrame({'group':val_group,
                            'pred':preds,
                            'label':label})
    df_temp =pred_list.groupby(['group']).apply(wave_group_func).reset_index()
    df_temp.columns = ['group', 'label']
    acc = df_temp[df_temp['label'] == 1].shape[0] / df_temp.shape[0]
    return 'group_acc',float(acc),True

kfold = GroupKFold(n_splits=5)
kfold_in = GroupKFold(n_splits=5)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(df_train[feature_names], df_train[ycol], df_train['group'])):
    X_train = df_train.iloc[trn_idx][feature_names+['group']]
    Y_train = df_train.iloc[trn_idx][ycol]
    X_val = df_train.iloc[val_idx][feature_names+['group']]
    Y_val = df_train.iloc[val_idx][ycol]
    groupp=df_train.iloc[val_idx]['group'].values
    pred_val=np.zeros(X_val.shape[0])
if 1:
    for fold_in_id, (train_in_idx, val_in_idx) in enumerate(kfold_in.split(X_train, Y_train, X_train['group'])):
        val_group=X_train['group'].iloc[val_in_idx]
        train_set = lgb.Dataset(X_train.drop(columns='group').iloc[train_in_idx], Y_train.iloc[train_in_idx])
        val_set = lgb.Dataset(X_train.drop(columns='group').iloc[val_in_idx], Y_train.iloc[val_in_idx])
        model = lgb.train(params, train_set,valid_sets=val_set,
                          categorical_feature=['weather_grade','current_hour_bin'],
                          feval=group_acc,verbose_eval=10)
        pred_test = model.predict(X_val.drop(columns='group'))
        
        df_importance = pd.DataFrame({
        'column': model.feature_name(),
        'importance': model.feature_importance('gain')})
        df_importance_list.append(df_importance)
        pred_val += pred_test / 5
        print('========================================')
    pred_df = pd.DataFrame({'group':groupp,
                                'pred':pred_val,
                                'label':Y_val.values})
    df_temp = pred_df.groupby(['group']).apply(wave_group_func).reset_index()
    df_temp.columns = ['group', 'label']
    acc = df_temp[df_temp['label'] == 1].shape[0] / df_temp.shape[0]
    print('acc:', acc)
    print('=======================================')

In [69]:
params = {
    'num_leaves':31,
    'n_estimators': 200,
    'boosting_type': 'gbdt',
    'is_unbalance':'true',
    'objective': 'binary',
    'num_threads':20
}
train_set = lgb.Dataset(df_train[df_feature_name], df_train['target'],free_raw_data=False)
model_final = lgb.train(params,train_set,
                       categorical_feature=['weather_grade','current_hour_bin'],)
pred=model_final.predict(df_test[df_feature_name])
df_test['pred']=pred

In [70]:
time_train=copy.deepcopy(df_train[df_train['target']==1])
time_train['detla_time']=time_train['expect_time']-time_train['current_time']
param = {
    'num_leaves':31,
    'n_estimators': 10000,
    'boosting_type': 'gbdt',
    'objective': 'mae',
    'metrics':'mae',
    'early_stopping_rounds': 50,
    'num_threads':20
}
train_time_set = lgb.Dataset(time_train[df_feature_name],time_train['detla_time'])
model_time_cv=lgb.cv(param,train_time_set,nfold=5,metrics='mae',
                     categorical_feature=['current_hour_bin','weather_grade'])
n_east=len(model_time_cv['l1-mean'])

param = {
    'num_leaves':31,
    'n_estimators': n_east,
    'boosting_type': 'gbdt',
    'objective': 'mae',
    'num_threads':20
}
model_time=lgb.train(param,train_time_set,
                     categorical_feature=['current_hour_bin','weather_grade'])
pred_time=model_time.predict(df_test[df_feature_name])
df_test['expect_time']=df_test['current_time']+pred_time

In [86]:
# pred=model_final.predict(df_test[df_feature_name])
# df_test['pred']=pred
same_track_id=df_test.groupby('tracking_id').size()[df_test.groupby('tracking_id').size()==2].reset_index()['tracking_id'].values.tolist()
df_test_afterchoice=df_test[~((df_test['tracking_id'].isin(same_track_id)) & (df_test['action_type']==0))]
maxid=df_test_afterchoice.groupby('group')['pred'].idxmax()
df_sub=copy.deepcopy(df_test_afterchoice.loc[maxid])
df_sub.loc[df_sub['action_type']==0,'action_type']='DELIVERY'
df_sub.loc[df_sub['action_type']==1,'action_type']='PICKUP'
prediction=df_sub[['courier_id','wave_index','tracking_id','courier_wave_start_lng','courier_wave_start_lat',
                   'action_type','expect_time','date']]

In [74]:
import zipfile
mae=200
os.makedirs('./sub/{}'.format(int(mae)), exist_ok=True)
f = zipfile.ZipFile('./sub/{}.zip'.format(int(mae)), 'w', zipfile.ZIP_DEFLATED)
for date in prediction['date'].unique():
    df_temp = prediction[prediction['date'] == date]
    del df_temp['date']
    df_temp.to_csv('./sub/{}/action_{}.txt'.format(int(mae), date), index=False)
    f.write('./sub/{}/action_{}.txt'.format(int(mae), date), 'action_{}.txt'.format(date))
f.close()

In [98]:
df_testA['pred']=model_final.predict(df_testA[df_feature_name])
same_track_id=df_testA.groupby('tracking_id').size()[df_testA.groupby('tracking_id').size()==2].reset_index()['tracking_id'].values.tolist()
df_testA_afterchoice=df_testA[~((df_testA['tracking_id'].isin(same_track_id)) & (df_testA['action_type']==0))]
maxid=df_testA_afterchoice.groupby('group')['pred'].idxmax()
df_testA['target']=0
df_testA.loc[maxid,'target']=1

In [103]:
df_train_testA=pd.concat([df_train,df_testA])

In [105]:
ycol = 'target'
feature_names = df_feature_name
params = {
    'num_leaves':31,
    'n_estimators': 10000,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'is_unbalance':'true',
    'metrics':'group_acc',
    'early_stopping_rounds': 50,
    'num_threads':20,
    'seed':513
}

oof = []
prediction = df_test[['id', 'group']]
prediction['target'] = 0
df_importance_list = []

In [106]:
def wave_group_func(group):
    target_list = group['label'].values.tolist()
    pred_list = group['pred'].values.tolist()
    max_index = pred_list.index(max(pred_list))
    if target_list[max_index] == 1:
        return 1
    else:
        return 0
def group_acc(preds,dtrain):
    label=dtrain.get_label()
    pred_list=pd.DataFrame({'group':val_group,
                            'pred':preds,
                            'label':label})
    df_temp =pred_list.groupby(['group']).apply(wave_group_func).reset_index()
    df_temp.columns = ['group', 'label']
    acc = df_temp[df_temp['label'] == 1].shape[0] / df_temp.shape[0]
    return 'group_acc',float(acc),True

kfold = GroupKFold(n_splits=5)
kfold_in = GroupKFold(n_splits=5)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(df_train_testA[feature_names], df_train_testA[ycol], df_train_testA['group'])):
    X_train = df_train_testA.iloc[trn_idx][feature_names+['group']]
    Y_train = df_train_testA.iloc[trn_idx][ycol]
    X_val = df_train_testA.iloc[val_idx][feature_names+['group']]
    Y_val = df_train_testA.iloc[val_idx][ycol]
    groupp=df_train_testA.iloc[val_idx]['group'].values
    pred_val=np.zeros(X_val.shape[0])
if 1:
    for fold_in_id, (train_in_idx, val_in_idx) in enumerate(kfold_in.split(X_train, Y_train, X_train['group'])):
        val_group=X_train['group'].iloc[val_in_idx]
        train_set = lgb.Dataset(X_train.drop(columns='group').iloc[train_in_idx], Y_train.iloc[train_in_idx])
        val_set = lgb.Dataset(X_train.drop(columns='group').iloc[val_in_idx], Y_train.iloc[val_in_idx])
        model = lgb.train(params, train_set,valid_sets=val_set,
                          categorical_feature=['weather_grade','current_hour_bin'],
                          feval=group_acc,verbose_eval=10)
        pred_test = model.predict(X_val.drop(columns='group'))
        
        df_importance = pd.DataFrame({
        'column': model.feature_name(),
        'importance': model.feature_importance('gain')})
        df_importance_list.append(df_importance)
        pred_val += pred_test / 5
        print('========================================')
    pred_df = pd.DataFrame({'group':groupp,
                                'pred':pred_val,
                                'label':Y_val.values})
    df_temp = pred_df.groupby(['group']).apply(wave_group_func).reset_index()
    df_temp.columns = ['group', 'label']
    acc = df_temp[df_temp['label'] == 1].shape[0] / df_temp.shape[0]
    print('acc:', acc)
    print('=======================================')

Training until validation scores don't improve for 50 rounds.
[10]	valid_0's group_acc: 0.821843
[20]	valid_0's group_acc: 0.823788
[30]	valid_0's group_acc: 0.828543
[40]	valid_0's group_acc: 0.83092
[50]	valid_0's group_acc: 0.832721
[60]	valid_0's group_acc: 0.833225
[70]	valid_0's group_acc: 0.835314
[80]	valid_0's group_acc: 0.835963
[90]	valid_0's group_acc: 0.836467
[100]	valid_0's group_acc: 0.83726
[110]	valid_0's group_acc: 0.837476
[120]	valid_0's group_acc: 0.836755
[130]	valid_0's group_acc: 0.837043
[140]	valid_0's group_acc: 0.837115
Early stopping, best iteration is:
[98]	valid_0's group_acc: 0.837548
Training until validation scores don't improve for 50 rounds.
[10]	valid_0's group_acc: 0.817592
[20]	valid_0's group_acc: 0.823716
[30]	valid_0's group_acc: 0.827462
[40]	valid_0's group_acc: 0.830272
[50]	valid_0's group_acc: 0.83409
[60]	valid_0's group_acc: 0.836971
[70]	valid_0's group_acc: 0.838052
[80]	valid_0's group_acc: 0.8387
[90]	valid_0's group_acc: 0.839349
[

In [107]:
params = {
    'num_leaves':31,
    'n_estimators': 200,
    'boosting_type': 'gbdt',
    'is_unbalance':'true',
    'objective': 'binary',
    'num_threads':20
}
train_set = lgb.Dataset(df_train_testA[df_feature_name], df_train_testA['target'])
model_final = lgb.train(params,train_set,
                        categorical_feature=['weather_grade','current_hour_bin'],)
pred=model_final.predict(df_test[df_feature_name])
df_test['pred']=pred

In [108]:
same_track_id=df_test.groupby('tracking_id').size()[df_test.groupby('tracking_id').size()==2].reset_index()['tracking_id'].values.tolist()
df_test_afterchoice=df_test[~((df_test['tracking_id'].isin(same_track_id)) & (df_test['action_type']==0))]
maxid=df_test_afterchoice.groupby('group')['pred'].idxmax()
df_sub=copy.deepcopy(df_test_afterchoice.loc[maxid])
df_sub.loc[df_sub['action_type']==0,'action_type']='DELIVERY'
df_sub.loc[df_sub['action_type']==1,'action_type']='PICKUP'
prediction=df_sub[['courier_id','wave_index','tracking_id','courier_wave_start_lng','courier_wave_start_lat',
                   'action_type','expect_time','date']]

In [112]:
import zipfile
mae=415
os.makedirs('./sub/{}'.format(int(mae)), exist_ok=True)
f = zipfile.ZipFile('./sub/{}.zip'.format(int(mae)), 'w', zipfile.ZIP_DEFLATED)
for date in prediction['date'].unique():
    df_temp = prediction[prediction['date'] == date]
    del df_temp['date']
    df_temp.to_csv('./sub/{}/action_{}.txt'.format(int(mae), date), index=False)
    f.write('./sub/{}/action_{}.txt'.format(int(mae), date), 'action_{}.txt'.format(date))
f.close()