In [1]:
import datetime
import pickle
import time
from pathlib import Path

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# --- Notebook-wide configuration ---
# Folder prefix that the data-loading cell prepends to the pickle file names.
path = "../data/"
# Single seed shared by the day-level split and the LightGBM parameters.
seed = 416

# Let pandas show up to 200 rows before truncating a displayed frame.
pd.options.display.max_rows = 200

# Show the kernel's current working directory; the relative paths used in
# this notebook (e.g. "../data/") are resolved against it.
%pwd

'/home/cseadmin/undergrad1/dz/GISCUP2021/model'

In [2]:
# Load the pre-built train/test tables and report how long the read took.
# NOTE(review): unpickling can execute arbitrary code, so these files must
# come from a trusted source (presumably this project's own preprocessing
# step — confirm).
start = time.perf_counter()  # monotonic clock, appropriate for elapsed time

train = pd.read_pickle(path+'train/train.pkl')
test = pd.read_pickle(path+'test/test.pkl')

end = time.perf_counter()
print("Read data time = {:.2f}s".format(end-start))

# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line after each summary.
train.info()
print()
test.info()

Read data time = 1.35s
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8651005 entries, 0 to 8651004
Columns: 151 entries, slice_id to type_time_ratio_3
dtypes: bool(1), datetime64[ns](1), float16(119), float32(1), int16(19), int32(4), int8(6)
memory usage: 2.5 GB
None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 288076 entries, 0 to 288075
Columns: 151 entries, slice_id to type_time_ratio_3
dtypes: bool(1), datetime64[ns](1), float16(119), float32(1), int16(17), int32(3), int8(9)
memory usage: 84.1 MB
None


In [3]:
# Columns kept out of the model inputs; 'ata' is the regression target.
non_feature_cols = {
    'order_id', 'ata', 'date', 'driver_id',
    'arrival_0', 'arrival_1', 'arrival_2', 'arrival_3', 'arrival_4',
}

# Every remaining column is used as a numeric feature, in frame order.
num_cols = [col for col in train.columns if col not in non_feature_cols]
cat_cols = []  # no column is declared categorical
features = num_cols + cat_cols

# Same feature columns for both frames; the model is fitted on log(ata).
train_x, test_x = train[features], test[features]
train_y = np.log(train['ata'])

print(len(features), features, sep="\n")

147
['slice_id', 'distance', 'simple_eta', 'speed', 'speed_avg', 'speed_std', 'speed_min', 'speed_max', 'volume', 'old_driver', 'link_no', 'link_time_sum', 'link_time_avg', 'link_time_std', 'link_time_max', 'link_time_min', 'time_delay_max', 'time_delay_avg', 'time_delay_std', 'current_no_0', 'current_no_1', 'current_no_2', 'current_no_3', 'current_no_4', 'current_time_sum_0', 'current_time_sum_1', 'current_time_sum_2', 'current_time_sum_3', 'current_time_sum_4', 'current_time_avg_0', 'current_time_avg_1', 'current_time_avg_2', 'current_time_avg_3', 'current_time_avg_4', 'current_time_std_0', 'current_time_std_1', 'current_time_std_2', 'current_time_std_3', 'current_time_std_4', 'current_time_max_0', 'current_time_max_1', 'current_time_max_2', 'current_time_max_3', 'current_time_max_4', 'current_time_min_0', 'current_time_min_1', 'current_time_min_2', 'current_time_min_3', 'current_time_min_4', 'next_no_0', 'next_no_1', 'next_no_2', 'next_no_3', 'next_no_4', 'next_time_sum_0', 'next_ti

In [4]:
%%time

days = train['date'].unique()
tr_days, val_days = train_test_split(days, test_size=0.2, random_state=seed)

tr_x = train_x[train['date'].isin(tr_days)]
tr_y = train_y[train['date'].isin(tr_days)]
val_x = train_x[train['date'].isin(val_days)]
val_y = train_y[train['date'].isin(val_days)]

# tr_x, val_x, tr_y, val_y = train_test_split(train_x,train_y,test_size=0.2,random_state=seed)

train_set = lgb.Dataset(tr_x, tr_y, categorical_feature=cat_cols)
val_set = lgb.Dataset(val_x, val_y, categorical_feature=cat_cols)

In [5]:
# LightGBM parameters handed to lgb.train() in the training cell.
lgb_paras = dict(
    # objective, metric and global settings
    objective='regression',
    metrics='rmse',
    learning_rate=0.1,
    seed=seed,
    feature_fraction=1,
    boost_from_average='false',

    # tree size, regularisation and row sub-sampling
    num_leaves=30,
    max_depth=5,
    lambda_l1=0.01,
    lambda_l2=0.02,
    bagging_fraction=0.8,
    bagging_freq=3,
    min_data_in_leaf=20,

    # execution
    num_threads=24,
    verbose=-1,
    device='gpu',
)

In [6]:
%%time

lgb_model = lgb.train(lgb_paras, train_set, categorical_feature=cat_cols,
                      valid_sets=[train_set, val_set], early_stopping_rounds=500, num_boost_round=20000, verbose_eval=100)

Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.184821	valid_1's rmse: 0.183039
[200]	training's rmse: 0.183232	valid_1's rmse: 0.181637
[300]	training's rmse: 0.182433	valid_1's rmse: 0.180992
[400]	training's rmse: 0.181864	valid_1's rmse: 0.180567
[500]	training's rmse: 0.181418	valid_1's rmse: 0.180264
[600]	training's rmse: 0.18102	valid_1's rmse: 0.179984
[700]	training's rmse: 0.180658	valid_1's rmse: 0.179752
[800]	training's rmse: 0.180359	valid_1's rmse: 0.179584
[900]	training's rmse: 0.180059	valid_1's rmse: 0.179411
[1000]	training's rmse: 0.17978	valid_1's rmse: 0.179258
[1100]	training's rmse: 0.179513	valid_1's rmse: 0.179104
[1200]	training's rmse: 0.179284	valid_1's rmse: 0.178995
[1300]	training's rmse: 0.179056	valid_1's rmse: 0.17889
[1400]	training's rmse: 0.178844	valid_1's rmse: 0.1788
[1500]	training's rmse: 0.178638	valid_1's rmse: 0.178711
[1600]	training's rmse: 0.178447	valid_1's rmse: 0.178626
[1700]	training's rmse: 

[14200]	training's rmse: 0.166046	valid_1's rmse: 0.176236
[14300]	training's rmse: 0.165977	valid_1's rmse: 0.17623
[14400]	training's rmse: 0.16591	valid_1's rmse: 0.176225
[14500]	training's rmse: 0.165841	valid_1's rmse: 0.176219
[14600]	training's rmse: 0.165773	valid_1's rmse: 0.176213
[14700]	training's rmse: 0.165703	valid_1's rmse: 0.176206
[14800]	training's rmse: 0.165635	valid_1's rmse: 0.176202
[14900]	training's rmse: 0.165568	valid_1's rmse: 0.176198
[15000]	training's rmse: 0.165501	valid_1's rmse: 0.176193
[15100]	training's rmse: 0.165432	valid_1's rmse: 0.176192
[15200]	training's rmse: 0.165363	valid_1's rmse: 0.176187
[15300]	training's rmse: 0.165298	valid_1's rmse: 0.176185
[15400]	training's rmse: 0.165231	valid_1's rmse: 0.17618
[15500]	training's rmse: 0.165165	valid_1's rmse: 0.17617
[15600]	training's rmse: 0.165099	valid_1's rmse: 0.176167
[15700]	training's rmse: 0.165031	valid_1's rmse: 0.176163
[15800]	training's rmse: 0.164967	valid_1's rmse: 0.176158
[

In [7]:
# Pair each input column with the importance value reported by the trained
# booster, then display the table from highest to lowest importance.
feature_importance_df = pd.DataFrame({
    "feature": features,
    "importance": lgb_model.feature_importance(),
})
feature_importance_df.sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
0,slice_id,12851
116,link_length,11008
3,speed,8754
75,next_time_min_1,8331
129,next_no_ratio_2,7984
...,...,...
64,next_time_std_0,552
69,next_time_max_0,503
54,next_time_sum_0,461
49,next_no_0,329


In [8]:
# Predict the test set using the booster's recorded best iteration.
pred_y = lgb_model.predict(test_x, num_iteration=lgb_model.best_iteration)

# The model was fitted on log(ata), so exp() maps predictions back to the
# original scale.
# NOTE(review): astype('int') truncates toward zero rather than rounding —
# confirm that is the intended submission format.
submit = pd.DataFrame({
    'id': test['order_id'],
    'result': np.exp(pred_y).astype('int'),
})

# Timestamp without spaces or colons, so the file name is valid on every
# platform and needs no shell quoting.
run_stamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')

# Create the output folder when it is missing, so the write cannot fail on a
# missing directory after training has already finished.
results_dir = Path('../results')
results_dir.mkdir(parents=True, exist_ok=True)
submit.to_csv(results_dir / f'lgb_submission_{run_stamp}.csv', index=False)

# import joblib
# joblib.dump(lgb_model, f"../saved_model/lgb_model_{run_stamp}")

['../saved_model/lgb_model_2021-07-17 12:35:56.902414']