# 特征工程

In [None]:
import os
from tqdm import tqdm 
import time

import numpy as np 
import pandas as pd 
import scipy
import matplotlib 
import matplotlib.pyplot as plt 
import seaborn as sns

import sklearn
from sklearn import preprocessing, metrics 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
from sklearn.utils import shuffle# 随机排列
import gc
import lightgbm as lgb 
import xgboost as xgb 
import joblib
from joblib import Parallel, delayed

In [None]:
# Root directories of the training and test data. read_datafile() appends a
# table name ('action', 'order', 'courier', 'distance') to these paths.
train_path = './data/eleme_round1_train_20200313/'
test_path = './data/eleme_round1_testA_20200313/'

In [None]:
# 读取数据并加入date列
def read_datafile(rootpath, section):
    """Load every file of one table and tag each row with the file's date.

    :param rootpath: data root directory, including the trailing separator
    :param section: table name, one of 'action', 'order', 'courier', 'distance';
        it is also the name of the sub-directory that is read
    :return: one DataFrame with the rows of all files plus a ``date`` column
        holding the text between the first '_' and the first '.' of the file
        name. The per-file row index is kept, so index labels repeat.
    :raises ValueError: if ``section`` is not a known table name
    """
    # ID columns are read as strings so they are not parsed as numbers.
    if section in ('action', 'order'):
        converters = {'tracking_id': str}
    elif section == 'distance':
        converters = {'tracking_id': str, 'target_tracking_id': str}
    elif section == 'courier':
        converters = None
    else:
        raise ValueError('unknown section: {!r}'.format(section))

    file_path = rootpath + section + '/'
    data_list = []

    # os.listdir() returns names in arbitrary order; sort them so the row
    # order of the result is reproducible across machines and runs.
    for f in sorted(os.listdir(file_path)):
        date = f.split('.')[0].split('_')[1]
        df = pd.read_csv(file_path + f, converters=converters)
        df['date'] = date
        data_list.append(df)

    return pd.concat(data_list)

In [None]:
def majorid(df):
    """Add the per-wave key column ``majorid`` in place and return ``df``.

    The key is ``<date><courier_id>_<wave_index>``, each part converted with
    ``str``.
    """
    wave_key = df['courier_id'].map(str) + '_' + df['wave_index'].map(str)
    df['majorid'] = df['date'].map(str) + wave_key
    return df

In [None]:
def dropdate(df):
    """Remove the ``date``, ``courier_id`` and ``wave_index`` columns in place.

    Returns the same (mutated) DataFrame for convenience.
    """
    key_parts = ['date', 'courier_id', 'wave_index']
    df.drop(columns=key_parts, inplace=True)
    return df

In [None]:
# Action tables: load the raw action data of the training and test sets.
action_train = read_datafile(train_path, 'action')
action_test = read_datafile(test_path, 'action')

In [None]:
# Add the per-wave key 'majorid' (date + courier_id + '_' + wave_index).
action_train = majorid(action_train)
action_test = majorid(action_test)

In [None]:
def action_train_group(df):
    """Split every training wave into its last known action and its future candidates.

    For each ``majorid`` group, in the group's existing row order, the trailing
    ``int(n * 0.55)`` rows are the "future" candidates and the row just before
    them is the "last" action. The first future row is labelled 1 (the action
    that actually came next); all other future rows are labelled 0.

    Groups too small to yield any future row contribute only their last row.
    The earlier version wrote ``label = 1`` at index 0 of the empty future
    frame, which created a phantom row of NaNs.

    :param df: action table containing a ``majorid`` column
    :return: ``(last, future)``. ``last`` has ``expect_time`` renamed to
        ``last_time``; ``future`` carries ``label`` and is shuffled.
    """
    df_future = []
    df_last = []
    for name, group in tqdm(df.groupby(['majorid'])):
        n_future = int(group.shape[0] * 0.55)
        split_at = group.shape[0] - n_future

        # Slice by position: the index produced by concatenating several files
        # is not unique, so label-based drop() is best avoided.
        last_data = group.iloc[:split_at].tail(1).reset_index(drop=True)
        df_last.append(last_data)

        if n_future == 0:
            continue

        # Explicit copy so the label column is written to an independent frame.
        future_data = group.iloc[split_at:].copy()
        future_data.reset_index(drop=True, inplace=True)
        future_data['label'] = 0
        future_data.loc[0, 'label'] = 1  # mark the positive sample
        df_future.append(future_data)

    return_last = pd.concat(df_last)
    return_future = pd.concat(df_future)
    return_last.rename({'expect_time': 'last_time'}, axis=1, inplace=True)
    return_future = shuffle(return_future)  # random row order
    return return_last, return_future

def action_test_group(df):
    """Split every test wave into its last known action and its future candidates.

    Within each ``majorid`` group, rows with ``expect_time == 0`` are the
    candidates to rank; the last of the remaining rows is the "last" action.

    :param df: action table containing ``majorid`` and ``expect_time``
    :return: ``(last, future)``. ``last`` has ``expect_time`` renamed to
        ``last_time``; ``future`` gets a ``label`` column filled with None.
    """
    df_future = []
    df_last = []
    for name, group in tqdm(df.groupby(['majorid'])):
        is_future = group['expect_time'] == 0

        # Boolean masks select by position, so a non-unique index (as produced
        # by concatenating several files) cannot remove the wrong rows.
        # copy() makes the label assignment target an independent frame.
        future_data = group[is_future].copy()
        future_data['label'] = None

        last_data = group[~is_future].tail(1).reset_index(drop=True)

        df_future.append(future_data)
        df_last.append(last_data)

    return_last = pd.concat(df_last)
    return_future = pd.concat(df_future)
    return_last.rename({'expect_time': 'last_time'}, axis=1, inplace=True)
    return return_last, return_future

In [None]:
# Split each wave into its last known action row and its candidate future rows.
action_train_thelast, action_train_future = action_train_group(action_train)
action_test_thelast, action_test_future = action_test_group(action_test)

In [None]:
# Load the raw distance tables.
distance_train = read_datafile(train_path, 'distance')
distance_test = read_datafile(test_path, 'distance')

In [None]:
# Add the per-wave key 'majorid' to the distance tables.
distance_train = majorid(distance_train)
distance_test = majorid(distance_test)

In [None]:
# 'majorid' now carries the wave identity, so drop date/courier_id/wave_index.
distance_train = dropdate(distance_train)
distance_test = dropdate(distance_test)

In [None]:
def tanlism_distance(df):
    """Replace the source/target coordinates with direction and distance features.

    Adds, in place:
      * ``target_tan`` - ``arctan(dlat / dlng)`` in degrees; a zero longitude
        difference gives +/-inf from the pandas division and hence +/-90.
      * ``target_MHD`` - Manhattan distance ``|dlat| + |dlng|``.

    The four coordinate columns are dropped. Returns the mutated ``df``.
    """
    lat_diff = df['source_lat'] - df['target_lat']
    lng_diff = df['source_lng'] - df['target_lng']

    df['target_tan'] = np.degrees(np.arctan(lat_diff / lng_diff))
    df['target_MHD'] = lat_diff.abs() + lng_diff.abs()

    df.drop(columns=['source_lat', 'target_lat', 'source_lng', 'target_lng'], inplace=True)
    return df

In [None]:
# Turn the raw coordinates into angle and Manhattan-distance features.
distance_train = tanlism_distance(distance_train)
distance_test = tanlism_distance(distance_test)

In [None]:
# Rename source_type to action_type so the distance rows can be joined to the
# last-action rows on (majorid, tracking_id, action_type).
rename_rule = {'source_type' : 'action_type'}

distance_test.rename(rename_rule, axis=1, inplace=True)
distance_train.rename(rename_rule, axis=1, inplace=True)

In [None]:
# For each wave's last action, attach every distance row that starts from it
# (left join, so last actions without distance rows are kept).
feature_train = pd.merge(left=action_train_thelast,right=distance_train, on=['majorid', 'tracking_id', 'action_type'], how='left')
feature_test = pd.merge(left=action_test_thelast,right=distance_test, on=['majorid', 'tracking_id', 'action_type'], how='left')

In [None]:
# The joined keys describe the last action, so prefix them with 'last_'. The
# target_* columns take over as tracking_id/action_type, which is what the
# join with the future rows uses.
rename_rule = {'tracking_id':'last_tracking_id', 'action_type':'last_action_type', 'target_tracking_id':'tracking_id', 'target_type':'action_type'}

feature_test.rename(rename_rule, axis=1, inplace=True)
feature_train.rename(rename_rule, axis=1, inplace=True)

In [None]:
# Drop the wave-level columns from the last-action features.
# NOTE(review): presumably because the future rows joined next carry the same
# columns and duplicates would get _x/_y suffixes - confirm.
feature_test.drop(['courier_wave_start_lng', 'courier_wave_start_lat'],axis=1,inplace=True)
feature_train.drop(['courier_wave_start_lng', 'courier_wave_start_lat'],axis=1,inplace=True)
feature_train = dropdate(feature_train)
feature_test = dropdate(feature_test)

In [None]:
# One row per candidate future action, enriched with the features measured
# from the wave's last action to that candidate.
feature_train = pd.merge(left=action_train_future,right=feature_train, on=['majorid', 'tracking_id', 'action_type'], how='left')
feature_test = pd.merge(left=action_test_future,right=feature_test, on=['majorid', 'tracking_id', 'action_type'], how='left')

In [None]:
# Remove the names of the intermediate tables that are no longer needed.
del action_test, action_test_future, action_test_thelast
del action_train, action_train_future, action_train_thelast
del distance_test, distance_train

In [None]:
# Load the raw order tables.
order_train = read_datafile(train_path, 'order')
order_test = read_datafile(test_path, 'order')

In [None]:
# Add the per-wave key 'majorid' to the order tables.
order_test = majorid(order_test)
order_train = majorid(order_train)

In [None]:
# Drop date/courier_id/wave_index; orders are joined on majorid + tracking_id.
order_test = dropdate(order_test)
order_train = dropdate(order_train)

In [None]:
def tanlism_order(df):
    """Replace the pick-up/delivery coordinates with direction and distance features.

    Adds, in place:
      * ``delivery_tan`` - ``arctan(dlat / dlng)`` in degrees, where the
        differences are delivery minus pick-up.
      * ``delivery_MHD`` - Manhattan distance ``|dlat| + |dlng|``.

    The four coordinate columns are dropped. Returns the mutated ``df``.
    """
    lat_diff = df['deliver_lat'] - df['pick_lat']
    lng_diff = df['deliver_lng'] - df['pick_lng']

    df['delivery_tan'] = np.degrees(np.arctan(lat_diff / lng_diff))
    df['delivery_MHD'] = lat_diff.abs() + lng_diff.abs()

    df.drop(columns=['deliver_lat', 'pick_lat', 'deliver_lng', 'pick_lng'], inplace=True)
    return df

In [None]:
# Turn the pick-up/delivery coordinates into angle and distance features.
order_test = tanlism_order(order_test)
order_train = tanlism_order(order_train)

In [None]:
# Attach the order attributes of each candidate action's tracking_id.
feature_test = pd.merge(left=feature_test, right=order_test, on=['majorid', 'tracking_id'], how='left')
feature_train = pd.merge(left=feature_train, right=order_train, on=['majorid', 'tracking_id'], how='left')

In [None]:
# Courier tables: load the raw courier data.
courier_train = read_datafile(train_path, 'courier')
courier_test = read_datafile(test_path, 'courier')

In [None]:
# Attach the courier attributes, matched by courier and date.
feature_test = pd.merge(left=feature_test, right=courier_test, on=['courier_id', 'date'], how='left')
feature_train = pd.merge(left=feature_train, right=courier_train, on=['courier_id', 'date'], how='left')

In [None]:
# Add new features: id, rush and road. 'id' is a running row number.
feature_train['id'] = range(feature_train.shape[0])
feature_test['id'] = range(feature_test.shape[0])

In [None]:
def add_rush(df):
    """Add ``rush``: the fraction of the promised delivery window already used.

    ``rush = (last_time - create_time) / (promise_deliver_time - create_time)``.
    The column is added in place and the same DataFrame is returned.
    """
    elapsed = df['last_time'] - df['create_time']
    promised_window = df['promise_deliver_time'] - df['create_time']
    df['rush'] = elapsed / promised_window
    return df

In [None]:
# Add the 'rush' feature to both feature tables.
feature_train = add_rush(feature_train)
feature_test = add_rush(feature_test)

In [None]:
def add_road(df):
    """Add weekend and traffic-level features derived from ``last_time``.

    ``last_time`` is converted with ``time.localtime``, so the result depends
    on the time zone of the machine running the code.
    NOTE(review): presumably the data is recorded in China Standard Time -
    confirm the process runs with a matching TZ.

    Columns added in place:
      * ``is_holiday`` - 1 on Saturday or Sunday, otherwise 0. Only weekends
        are detected; public holidays are not considered.
      * ``road`` - 1 for busy hours (7, 8, 11, 12, 17, 18), 2 for normal hours
        (5, 6, 9, 10, 13-16, 19-22), 3 for every other hour.

    Hours are compared as integers. The earlier version compared the
    zero-padded ``'%H'`` text ('07') with unpadded strings ('7'), so hours 5-9
    never matched and always got level 3. Weekday detection uses ``tm_wday``
    instead of the locale-dependent ``'%a'`` abbreviation.

    :return: the same (mutated) DataFrame
    """
    busy_hours = {7, 8, 11, 12, 17, 18}
    normal_hours = {5, 6, 9, 10, 13, 14, 15, 16, 19, 20, 21, 22}

    def road_level(hour):
        if hour in busy_hours:
            return 1
        if hour in normal_hours:
            return 2
        return 3

    local_times = [time.localtime(ts) for ts in df['last_time']]
    # tm_wday counts from Monday == 0, so 5 and 6 are Saturday and Sunday.
    df['is_holiday'] = [1 if t.tm_wday >= 5 else 0 for t in local_times]
    df['road'] = [road_level(t.tm_hour) for t in local_times]
    return df

In [None]:
# Add the weekend flag and the traffic level to both feature tables.
feature_test = add_road(feature_test)
feature_train = add_road(feature_train)

In [None]:
# Convert weather_grade into a numeric feature.
def weather(x):
    """Map a weather-grade label to an integer score.

    Normal weather is 4, slightly bad 3, bad 2, extremely bad 1; any other
    value (including missing values) is 0.
    """
    grade_scores = {
        '正常天气': 4,
        '轻微恶劣天气': 3,
        '恶劣天气': 2,
        '极恶劣天气': 1,
    }
    return grade_scores.get(x, 0)

In [None]:
# Replace the textual weather grade with its integer score.
feature_train['weather_grade'] = feature_train['weather_grade'].apply(lambda x: weather(x))
feature_test['weather_grade'] = feature_test['weather_grade'].apply(lambda x: weather(x))

In [None]:
# Reload previously saved feature tables from ./temp.
# NOTE(review): run top to bottom, this replaces the tables built above with
# whatever was last saved, and fails if the pickles do not exist yet (the
# matching to_pickle calls come later in the file). Confirm this cell is only
# meant for resuming a session.
feature_test = pd.read_pickle('./temp/feature_test.pkl')
feature_train = pd.read_pickle('./temp/feature_train.pkl')

In [None]:
# Add expect_time - create_time and promise_deliver_time - expect_time as the
# regression targets. The test set gets zero placeholders.
feature_train['expect_used_time'] = feature_train['expect_time'] - feature_train['create_time']
feature_train['will_residue_time'] = feature_train['promise_deliver_time'] - feature_train['expect_time']
feature_test['expect_used_time'] = 0
feature_test['will_residue_time'] = 0

In [None]:
# Checkpoint the feature tables for the following notebook sections.
feature_test.to_pickle('./temp/feature_test.pkl')
feature_train.to_pickle('./temp/feature_train.pkl')

In [None]:
# Print column names, dtypes and non-null counts as a sanity check.
feature_train.info()

# 异常值处理

In [None]:
import os
import time 
import datetime
from tqdm import tqdm 

import numpy as np 
import pandas as pd 
import scipy
import matplotlib 
import matplotlib.pyplot as plt 
import seaborn as sns

import sklearn
from sklearn import preprocessing, metrics 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle# 随机排列
import gc
import lightgbm as lgb 
import xgboost as xgb 
import joblib
from joblib import Parallel, delayed

In [None]:
# Show every column and row when displaying DataFrames (no truncation).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
# Load the feature tables saved by the feature-engineering section.
feature_test = pd.read_pickle('./temp/feature_test.pkl')
feature_train = pd.read_pickle('./temp/feature_train.pkl')

In [None]:
# Outlier-removal helper; can be called for any numeric column.
def outliers_proc(data, col_name, scale=3):
    """Remove rows whose ``col_name`` value lies outside the box-plot fences.

    The fences are ``Q1 - scale * IQR`` and ``Q3 + scale * IQR``. Summary
    statistics of the removed values are printed, and a before/after box plot
    is drawn.

    Rows are selected with a boolean mask, so the result is correct for any
    index. The earlier version passed positional numbers to label-based
    ``DataFrame.drop``, which is only right on a default 0..n-1 index.

    :param data: pandas DataFrame (not modified)
    :param col_name: name of the column to clean
    :param scale: multiplier applied to the inter-quartile range
    :return: a new DataFrame without the outlier rows, index reset to 0..m-1
    """

    def box_plot_outliers(data_ser, box_scale):
        """Return ((below_mask, above_mask), (lower_fence, upper_fence)).

        :param data_ser: pandas Series to examine
        :param box_scale: multiplier applied to the inter-quartile range
        """
        q1 = data_ser.quantile(0.25)
        q3 = data_ser.quantile(0.75)
        iqr = box_scale * (q3 - q1)
        val_low = q1 - iqr
        val_up = q3 + iqr
        rule_low = (data_ser < val_low)
        rule_up = (data_ser > val_up)
        return (rule_low, rule_up), (val_low, val_up)

    data_series = data[col_name]
    (rule_low, rule_up), _ = box_plot_outliers(data_series, box_scale=scale)
    is_outlier = rule_low | rule_up

    print("Delete number is: {}".format(int(is_outlier.sum())))
    data_n = data.loc[~is_outlier].reset_index(drop=True)
    # The message says "column" but the value is the remaining row count;
    # the text is kept unchanged so existing output stays comparable.
    print("Now column number is: {}".format(data_n.shape[0]))

    print("Description of data less than the lower bound is:")
    print(data_series[rule_low].describe())
    print("Description of data larger than the upper bound is:")
    print(data_series[rule_up].describe())

    # Left: original distribution; right: distribution after removal.
    fig, ax = plt.subplots(1, 2, figsize=(10, 7))
    sns.boxplot(y=data[col_name], data=data, palette="Set1", ax=ax[0])
    sns.boxplot(y=data_n[col_name], data=data_n, palette="Set1", ax=ax[1])
    return data_n

In [None]:
# The calls that actually need to run: remove training rows whose regression
# targets are outliers.
feature_train = outliers_proc(feature_train, 'expect_used_time')
feature_train = outliers_proc(feature_train, 'will_residue_time')

In [None]:
# Checkpoint the cleaned tables for the regression section.
feature_test.to_pickle('./temp/feature_test_reg.pkl')
feature_train.to_pickle('./temp/feature_train_reg.pkl')

# 回归任务

In [None]:
import os
import time 
import datetime
from tqdm import tqdm 

import numpy as np 
import pandas as pd 
import scipy
import matplotlib 
import matplotlib.pyplot as plt 
import seaborn as sns

import sklearn
from sklearn import preprocessing, metrics 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle# 随机排列
import gc
import lightgbm as lgb 
import xgboost as xgb 
import joblib
from joblib import Parallel, delayed

In [None]:
# Load the tables saved by the outlier-handling section.
feature_test = pd.read_pickle('./temp/feature_test_reg.pkl')
feature_train = pd.read_pickle('./temp/feature_train_reg.pkl')

In [None]:
# reg_prediction is another name for feature_test (no copy is made), so the
# assignments below also reset these columns on feature_test.
reg_prediction = feature_test
reg_prediction['expect_time'] = 0
reg_prediction['expect_used_time'] = 0
reg_prediction['will_residue_time'] = 0

In [None]:
# # 建立时间预测的回归任务1
# y_col = 'expect_used_time'
# x_col = ['weather_grade', 'level', 'speed', 'max_load', 'is_holiday', 'rush', 'road', 'grid_distance', 'target_tan', 'delivery_tan', 'delivery_MHD', 'target_MHD']# 加入expect_used_time和will_residue_time之后的x_col

# t0 = time.time()
# model = lgb.LGBMRegressor(
#     metric = 'mae',
#     num_leaves = 50,
#     max_depth = 7,
#     n_estimators = 10000000,
#     learning_rate = 0.1,
#     bagging_fraction = 1,
#     feature_fraction = 1,
#     reg_alpha = 0,
#     reg_lambda = 1
# )

# valueK = 10
# oof = []
# df_importance_list = []

# kfold = KFold(n_splits=valueK, shuffle=True, random_state=2020)
# for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(feature_train[x_col], feature_train[y_col])):
#     X_train = feature_train.iloc[trn_idx][x_col]
#     Y_train = feature_train.iloc[trn_idx][y_col]

#     X_val = feature_train.iloc[val_idx][x_col]
#     Y_val = feature_train.iloc[val_idx][y_col]

#     print('\nFold{} Training ======================================\n'.format(fold_id+1))

#     lgb_model = model.fit(
#         X_train,
#         Y_train,
#         eval_names=['train', 'valid'],
#         eval_set=[(X_train, Y_train), (X_val, Y_val)],
#         verbose=500,
#         eval_metric='mae',
#         early_stopping_rounds=100
#     )

#     pred_val = lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration_)
#     df_oof = feature_train.iloc[val_idx][['id', y_col]].copy()
#     df_oof['pred'] = pred_val
#     oof.append(df_oof)

#     pred_test = lgb_model.predict(feature_test[x_col], num_iteration=lgb_model.best_iteration_)
#     reg_prediction['expect_used_time'] += (pred_test/valueK)

#     df_importance = pd.DataFrame({
#         'column': x_col,
#         'importance': lgb_model.feature_importances_
#     })
#     df_importance_list.append(df_importance)
#     # break

#     # del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
#     # gc.collect()
# t1 = time.time()
# print('end train, use{} second'.format(t1-t0))


In [None]:
# df_oof = pd.concat(oof)
# mae = metrics.mean_absolute_error(df_oof[y_col], df_oof['pred'])
# print('mae:', mae)

In [None]:
# df_importance = pd.concat(df_importance_list)
# df_importance = df_importance.groupby(['column'])['importance'].agg('mean').sort_values(ascending=False).reset_index()
# df_importance

In [None]:
# Regression task 2: predict will_residue_time with 10-fold cross-validation.
y_col = 'will_residue_time'
# x_col = ['weather_grade', 'level', 'speed', 'max_load', 'is_holiday', 'rush', 'road', 'grid_distance', 'target_tan', 'delivery_tan', 'delivery_MHD', 'target_MHD']# x_col after adding expect_used_time and will_residue_time
# x_col = ['weather_grade', 'level', 'speed', 'max_load', 'rush', 'road', 'grid_distance', 'delivery_MHD', 'target_MHD']
x_col = ['speed', 'rush', 'grid_distance', 'delivery_MHD', 'target_MHD']


t0 = time.time()
model = lgb.LGBMRegressor(
    metric = 'mae',
    num_leaves = 64,
    max_depth = 7,
    n_estimators = 300,
    learning_rate = 0.1,
    bagging_fraction = 1,
    feature_fraction = 0.8,
    reg_alpha = 0,
    reg_lambda = 0
)

valueK = 10
# Out-of-fold predictions, one frame per fold.
oof = []
# Per-fold feature importances; collected here but not aggregated by any
# active cell in this section.
df_importance_list = []

kfold = KFold(n_splits=valueK, shuffle=True, random_state=2020)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(feature_train[x_col], feature_train[y_col])):
    X_train = feature_train.iloc[trn_idx][x_col]
    Y_train = feature_train.iloc[trn_idx][y_col]

    X_val = feature_train.iloc[val_idx][x_col]
    Y_val = feature_train.iloc[val_idx][y_col]

    print('\nFold{} Training ======================================\n'.format(fold_id+1))

    # The same estimator object is refitted on every fold.
    # NOTE(review): the verbose / early_stopping_rounds keywords of fit() are
    # only accepted by older LightGBM releases (newer ones use callbacks) -
    # confirm against the installed version.
    lgb_model = model.fit(
        X_train,
        Y_train,
        eval_names=['train', 'valid'],
        eval_set=[(X_train, Y_train), (X_val, Y_val)],
        verbose=500,
        eval_metric='mae',
        early_stopping_rounds=100
    )

    pred_val = lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration_)
    df_oof = feature_train.iloc[val_idx][['id', y_col]].copy()
    df_oof['pred'] = pred_val
    oof.append(df_oof)

    # Average the test predictions over the folds: each fold adds 1/valueK.
    pred_test = lgb_model.predict(feature_test[x_col], num_iteration=lgb_model.best_iteration_)
    reg_prediction['will_residue_time'] += (pred_test/valueK)

    df_importance = pd.DataFrame({
        'column': x_col,
        'importance': lgb_model.feature_importances_
    })
    df_importance_list.append(df_importance)
    # break

    # del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    # gc.collect()
t1 = time.time()
print('end train, use{} second'.format(t1-t0))

In [None]:
# Out-of-fold mean absolute error over all training rows.
df_oof = pd.concat(oof)
mae = metrics.mean_absolute_error(df_oof[y_col], df_oof['pred'])
print('mae:', mae)

In [None]:
# Reconstruct expect_time from the predictions.
# reg_prediction['expect_time'] = ((reg_prediction['create_time']+reg_prediction['expect_used_time']) + (reg_prediction['promise_deliver_time']-reg_prediction['will_residue_time'])) / 2
reg_prediction['expect_time'] = reg_prediction['promise_deliver_time'] - reg_prediction['will_residue_time']# only the second estimate is used

In [None]:
# Preview the first rows of the test predictions.
reg_prediction.head()

In [None]:
# Checkpoint the training table and the test table with predicted times.
feature_train.to_pickle('./temp/regfuture_train_regend.pkl')
reg_prediction.to_pickle('./temp/regfuture_test_regend.pkl')

# 分类特征

In [None]:
import os
import time 
import datetime
from tqdm import tqdm 

import numpy as np 
import pandas as pd 
import scipy
import matplotlib 
import matplotlib.pyplot as plt 
import seaborn as sns

import sklearn
from sklearn import preprocessing, metrics 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle# 随机排列
import gc
import lightgbm as lgb 
import xgboost as xgb 
import joblib
from joblib import Parallel, delayed

In [None]:
# Load the tables saved at the end of the regression section.
feature_train = pd.read_pickle('./temp/regfuture_train_regend.pkl')
feature_test = pd.read_pickle('./temp/regfuture_test_regend.pkl')

In [None]:
# Feature: deadline
def deadLine(df):
    """Add ``deadline`` and ``need_speed`` in place and return ``df``.

    ``deadline`` is ``promise_deliver_time - expect_time`` and ``need_speed``
    is ``grid_distance / deadline``.
    """
    remaining = df['promise_deliver_time'] - df['expect_time']
    df['deadline'] = remaining
    df['need_speed'] = df['grid_distance'] / remaining
    return df

In [None]:
# Add the deadline and need_speed features to both tables.
feature_train = deadLine(feature_train)
feature_test = deadLine(feature_test)

In [None]:
# Outlier-removal helper; can be called for any numeric column.
def outliers_proc(data, col_name, scale=3):
    """Remove rows whose ``col_name`` value lies outside the box-plot fences.

    The fences are ``Q1 - scale * IQR`` and ``Q3 + scale * IQR``. Summary
    statistics of the removed values are printed, and a before/after box plot
    is drawn.

    Rows are selected with a boolean mask, so the result is correct for any
    index. The earlier version passed positional numbers to label-based
    ``DataFrame.drop``, which is only right on a default 0..n-1 index.

    :param data: pandas DataFrame (not modified)
    :param col_name: name of the column to clean
    :param scale: multiplier applied to the inter-quartile range
    :return: a new DataFrame without the outlier rows, index reset to 0..m-1
    """

    def box_plot_outliers(data_ser, box_scale):
        """Return ((below_mask, above_mask), (lower_fence, upper_fence)).

        :param data_ser: pandas Series to examine
        :param box_scale: multiplier applied to the inter-quartile range
        """
        q1 = data_ser.quantile(0.25)
        q3 = data_ser.quantile(0.75)
        iqr = box_scale * (q3 - q1)
        val_low = q1 - iqr
        val_up = q3 + iqr
        rule_low = (data_ser < val_low)
        rule_up = (data_ser > val_up)
        return (rule_low, rule_up), (val_low, val_up)

    data_series = data[col_name]
    (rule_low, rule_up), _ = box_plot_outliers(data_series, box_scale=scale)
    is_outlier = rule_low | rule_up

    print("Delete number is: {}".format(int(is_outlier.sum())))
    data_n = data.loc[~is_outlier].reset_index(drop=True)
    # The message says "column" but the value is the remaining row count;
    # the text is kept unchanged so existing output stays comparable.
    print("Now column number is: {}".format(data_n.shape[0]))

    print("Description of data less than the lower bound is:")
    print(data_series[rule_low].describe())
    print("Description of data larger than the upper bound is:")
    print(data_series[rule_up].describe())

    # Left: original distribution; right: distribution after removal.
    fig, ax = plt.subplots(1, 2, figsize=(10, 7))
    sns.boxplot(y=data[col_name], data=data, palette="Set1", ax=ax[0])
    sns.boxplot(y=data_n[col_name], data=data_n, palette="Set1", ax=ax[1])
    return data_n

In [None]:
# Remove training rows whose need_speed is an outlier.
feature_train = outliers_proc(feature_train, 'need_speed')

In [None]:
# Histogram of need_speed after outlier removal.
feature_train['need_speed'].plot.hist()

In [None]:
def is_Picked(df):
    """Add the binary column ``is_picked`` in place and return ``df``.

    The value is 0 when ``last_time`` is earlier than ``estimate_pick_time``
    and 1 otherwise. A missing difference also gives 1, because only a
    strictly negative difference maps to 0.
    """
    before_pick = (df['last_time'] - df['estimate_pick_time']) < 0
    df['is_picked'] = (~before_pick).astype('int64')
    return df

In [None]:
# Add the is_picked flag to both tables.
feature_test = is_Picked(feature_test)
feature_train = is_Picked(feature_train)

In [None]:
# Checkpoint the tables for the classification-model section.
feature_train.to_pickle('./temp/future_train_clf.pkl')
feature_test.to_pickle('./temp/future_test_clf.pkl')

# 分类模型

In [1]:
import os
import time 
import datetime
from tqdm import tqdm 

import numpy as np 
import pandas as pd 
import scipy
import matplotlib 
import matplotlib.pyplot as plt 
import seaborn as sns

import sklearn
from sklearn import preprocessing, metrics 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle# 随机排列
import gc
import lightgbm as lgb 
import xgboost as xgb 
import joblib
from joblib import Parallel, delayed

In [2]:
# Load the tables saved by the classification-feature section.
feature_train = pd.read_pickle('./temp/future_train_clf.pkl')
feature_test = pd.read_pickle('./temp/future_test_clf.pkl')

In [3]:
feature_train.info()# Open question: would the ratio of need_speed to speed be a better feature?

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243158 entries, 0 to 243157
Data columns (total 38 columns):
courier_id                243158 non-null int64
wave_index                243158 non-null int64
tracking_id               243158 non-null object
courier_wave_start_lng    243158 non-null float64
courier_wave_start_lat    243158 non-null float64
action_type               243158 non-null object
expect_time               243158 non-null int64
date                      243158 non-null object
majorid                   243158 non-null object
label                     243158 non-null int64
last_tracking_id          243158 non-null object
last_action_type          243158 non-null object
last_time                 243158 non-null int64
grid_distance             243158 non-null float64
target_tan                237516 non-null float64
target_MHD                243158 non-null float64
weather_grade             243158 non-null int64
create_time               243158 non-null int64
confirm_t

In [None]:
def speed_Rush(df):
    """Add ``speed_rush`` (``speed`` minus ``need_speed``) in place; return ``df``."""
    speed_margin = df['speed'].sub(df['need_speed'])
    df['speed_rush'] = speed_margin
    return df
# Add the speed_rush feature to both tables.
feature_test = speed_Rush(feature_test)
feature_train = speed_Rush(feature_train)

In [None]:
# Output frame with one row per test candidate. It is an explicit copy, so
# writing 'label' never targets a slice of feature_test (this avoids
# SettingWithCopyWarning and ambiguous writes).
prediction = feature_test[['courier_id', 'wave_index', 'tracking_id', 'courier_wave_start_lng', 'courier_wave_start_lat', 'action_type', 'expect_time', 'date', 'id', 'majorid', 'label']].copy()
# Float accumulator for the fold-averaged predicted probability.
prediction['label'] = 0.0

In [None]:
# Train a binary classifier that scores each candidate action ('label' is 1
# for the action that came next), using cross-validation grouped by wave.
y_col = 'label'
# x_col = ['grid_distance', 'target_tan', 'target_MHD', 'weather_grade', 'aoi_id', 'shop_id', 'delivery_tan', 'delivery_MHD', 'level', 'speed', 'max_load', 'rush', 'road', 'expect_used_time', 'will_residue_time', 'deadline', 'need_speed', 'is_picked']
# NOTE(review): expect_used_time, will_residue_time and deadline are computed
# from the true expect_time for training rows but from regression predictions
# for test rows (see the earlier sections) - confirm this mismatch is intended.
x_col = ['grid_distance', 'target_tan', 'target_MHD', 'weather_grade', 'delivery_tan', 'delivery_MHD', 'level', 'speed', 'max_load', 'rush', 'road', 'expect_used_time', 'will_residue_time', 'deadline', 'need_speed', 'is_picked', 'speed_rush']

# NOTE(review): LightGBM only applies subsample when a bagging frequency is
# set as well; none is set here - confirm whether row subsampling is wanted.
model = lgb.LGBMClassifier(num_leaves=50,
                           max_depth=7,
                           learning_rate=0.03,
                           n_estimators=100000,
                           subsample=0.8,
                           feature_fraction=0.8,
                           reg_alpha=0.8,
                           reg_lambda=0.8,
                           random_state=2020,
                           metric=None
                           )


# Out-of-fold predictions and per-fold feature importances.
oof = []
df_importance_list = []

# Group by majorid so all candidates of a wave fall into the same fold.
kfold = GroupKFold(n_splits=10)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(feature_train[x_col], feature_train[y_col], feature_train['majorid'])):
    X_train = feature_train.iloc[trn_idx][x_col]
    Y_train = feature_train.iloc[trn_idx][y_col]

    X_val = feature_train.iloc[val_idx][x_col]
    Y_val = feature_train.iloc[val_idx][y_col]

    print('\nFold_{} Training ================================\n'.format(fold_id+1))

    # The same estimator object is refitted on every fold.
    # NOTE(review): the verbose / early_stopping_rounds keywords of fit() are
    # only accepted by older LightGBM releases (newer ones use callbacks) -
    # confirm against the installed version.
    lgb_model = model.fit(
        X_train,
        Y_train,
        eval_names=['train', 'valid'],
        eval_set=[(X_train, Y_train), (X_val, Y_val)],
        verbose=200,
        eval_metric='auc',
        early_stopping_rounds=100
    )

    # Column 1 of predict_proba is the probability of the positive class.
    pred_val = lgb_model.predict_proba(X_val, num_iteration=lgb_model.best_iteration_)[:, 1]
    df_oof = feature_train.iloc[val_idx][['id', 'majorid', y_col]].copy()
    df_oof['pred'] = pred_val
    oof.append(df_oof)

    # Average the test probabilities over the folds; the divisor 10 must
    # match n_splits above.
    pred_test = lgb_model.predict_proba(feature_test[x_col], num_iteration=lgb_model.best_iteration_)[:, 1]
    prediction['label'] += pred_test / 10

    df_importance = pd.DataFrame({
        'column': x_col,
        'importance': lgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

In [None]:
# Mean feature importance across folds, most important first.
df_importance = pd.concat(df_importance_list)
df_importance = df_importance.groupby(['column'])['importance'].agg('mean').sort_values(ascending=False).reset_index()
df_importance

In [None]:
def wave_label_func(group):
    """Return 1 if the highest-scored row of ``group`` has ``label == 1``, else 0.

    ``group`` must have ``label`` and ``pred`` columns. On ties the first row
    with the maximum ``pred`` is used.
    """
    labels = group['label'].values.tolist()
    scores = group['pred'].values.tolist()
    top_position = scores.index(max(scores))
    return int(labels[top_position] == 1)

In [None]:
# Wave-level accuracy on the out-of-fold predictions: the share of waves whose
# highest-scored candidate is the true next action.
df_oof = pd.concat(oof)
df_temp = df_oof.groupby(['majorid']).apply(wave_label_func).reset_index()
df_temp.columns = ['majorid', 'label']
acc = df_temp[df_temp['label'] == 1].shape[0] / df_temp.shape[0]
print('acc:', acc)

In [None]:
def result_func(majorid):
    """Return a 0/1 float array marking the position of the maximum value.

    Despite the parameter name, the argument is a Series of scores; in this
    file it receives the ``label`` values of one ``majorid`` group via
    ``groupby(...).transform``. On ties the first maximum is marked.
    """
    scores = majorid.values.tolist()
    one_hot = np.zeros(len(scores))
    one_hot[scores.index(max(scores))] = 1
    return one_hot

In [None]:
# Flag the highest-scored candidate of each wave and keep only those rows.
# The column name 'rusult' (sic) is used consistently in both lines.
prediction['rusult'] = prediction.groupby(['majorid'])['label'].transform(result_func)
subfile = prediction[prediction['rusult'] == 1]

In [None]:
# Columns written to the submission files ('date' is only used to split them).
result = subfile[['courier_id', 'wave_index', 'tracking_id', 'courier_wave_start_lng', 'courier_wave_start_lat', 'action_type', 'expect_time', 'date']]

In [None]:
import zipfile

# Write one file per date to ./sub/result/ and bundle them into
# ./sub/result.zip.
os.makedirs('./sub/{}'.format('result'), exist_ok=True)
# The context manager finalises the archive even if a write fails part-way.
with zipfile.ZipFile('./sub/{}.zip'.format('result'), 'w', zipfile.ZIP_DEFLATED) as f:
    for date in result['date'].unique():
        # Filter on result's own 'date' column. The earlier version used a mask
        # built from the larger `prediction` frame and relied on index
        # re-alignment. drop() returns a new frame, so nothing is deleted from
        # a slice of `result`.
        df_temp = result[result['date'] == date].drop(columns=['date'])
        df_temp.to_csv('./sub/{}/action_{}.txt'.format('result', date), index=False)
        f.write('./sub/{}/action_{}.txt'.format('result', date), 'action_{}.txt'.format(date))