In [1]:
import datetime
import os
import pickle
import time

import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

path="../data/"
seed=416

pd.set_option('display.max_rows', 200)

%pwd

'/home/cseadmin/undergrad1/dz/GISCUP2021/model'

In [2]:
# Load the preprocessed train/test tables and report how long the read took.
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
start = time.time()

train = pd.read_pickle(path+'train/train.pkl')
test = pd.read_pickle(path+'test/test.pkl')

end = time.time()
print("Read data time = {:.2f}s".format(end-start))

# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
train.info()
print()
test.info()

Read data time = 2.92s
<class 'pandas.core.frame.DataFrame'>
Int64Index: 8651005 entries, 0 to 291609
Columns: 328 entries, slice_id to type_time_ratio_3
dtypes: bool(1), datetime64[ns](1), float16(252), float32(7), int16(36), int32(20), int8(11)
memory usage: 5.7 GB
None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 288076 entries, 0 to 288075
Columns: 328 entries, slice_id to type_time_ratio_3
dtypes: bool(1), datetime64[ns](1), float16(273), float32(15), int16(19), int32(3), int8(16)
memory usage: 187.1 MB
None


In [3]:
# Columns kept out of the model inputs: 'ata' is the regression target (see
# train_y below); the remaining names are simply excluded from the feature list.
excluded_cols = {
    'order_id', 'ata', 'date', 'driver_id',
    'arrival_0', 'arrival_1', 'arrival_2', 'arrival_3', 'arrival_4',
}

# Every other column of `train` is used as a numeric feature, in the order
# the columns appear in the frame. No column is declared categorical.
num_cols = [col for col in train.columns if col not in excluded_cols]
cat_cols = []
features = num_cols + cat_cols

# Same feature columns for both frames; the model is fitted on log(ata).
train_x, test_x = train[features], test[features]
train_y = np.log(train['ata'])

print(len(features), features, sep="\n")

324
['slice_id', 'distance', 'simple_eta', 'speed', 'speed_avg', 'speed_std', 'speed_min', 'speed_max', 'volume', 'old_driver', 'driver_count', 'driver_ata_minus_simple_eta_std', 'driver_ata_minus_simple_eta_avg', 'driver_ata_minus_simple_eta_min', 'driver_ata_minus_simple_eta_max', 'driver_ata_minus_simple_eta_skew', 'driver_ata_minus_simple_eta_kurt', 'driver_ata_std', 'driver_ata_avg', 'driver_ata_min', 'driver_ata_max', 'driver_ata_skew', 'driver_ata_kurt', 'driver_distance_std', 'driver_distance_avg', 'driver_distance_min', 'driver_distance_max', 'driver_distance_skew', 'driver_distance_kurt', 'driver_simple_eta_std', 'driver_simple_eta_avg', 'driver_simple_eta_min', 'driver_simple_eta_max', 'driver_simple_eta_skew', 'driver_simple_eta_kurt', 'driver_slice_id_std', 'driver_slice_id_avg', 'driver_slice_id_min', 'driver_slice_id_max', 'driver_slice_id_skew', 'driver_slice_id_kurt', 'slice_avg', 'slice_std', 'slice_min', 'slice_max', 'slice_count', 'slice_skew', 'slice_kurt', 'link_n

In [4]:
%%time

days = train['date'].unique()
tr_days, val_days = train_test_split(days, test_size=0.2, random_state=seed)

tr_x = train_x[train['date'].isin(tr_days)]
tr_y = train_y[train['date'].isin(tr_days)]
val_x = train_x[train['date'].isin(val_days)]
val_y = train_y[train['date'].isin(val_days)]

# tr_x, val_x, tr_y, val_y = train_test_split(train_x,train_y,test_size=0.2,random_state=seed)

train_set = lgb.Dataset(tr_x, tr_y, categorical_feature=cat_cols)
val_set = lgb.Dataset(val_x, val_y, categorical_feature=cat_cols)

CPU times: user 6.65 s, sys: 1.32 s, total: 7.97 s
Wall time: 7.97 s


In [5]:
# LightGBM training parameters, passed unchanged to lgb.train().
lgb_paras = dict(
    # Learning task: regression evaluated with RMSE.
    objective='regression',
    metrics='rmse',
    learning_rate=0.1,
    seed=seed,
    feature_fraction=1,
    boost_from_average='false',

    # Tree shape, regularisation and row subsampling.
    num_leaves=30,
    max_depth=5,
    lambda_l1=0.01,
    lambda_l2=0.02,
    bagging_fraction=0.8,
    bagging_freq=3,
    min_data_in_leaf=20,

    # Runtime resources.
    num_threads=24,

    verbose=-1,
    # NOTE(review): 'gpu' presumably needs a GPU-enabled LightGBM build — confirm on the target machine.
    device='gpu',
)

In [None]:
%%time

lgb_model = lgb.train(lgb_paras, train_set, categorical_feature=cat_cols,
                      valid_sets=[train_set, val_set], early_stopping_rounds=500, num_boost_round=80000, verbose_eval=100)

In [None]:
# Pair every input feature with the importance value reported by the trained
# booster (default importance type), then display the table sorted from the
# most to the least important feature.
feature_importance_df = pd.DataFrame({
    "feature": features,
    "importance": lgb_model.feature_importance(),
})
feature_importance_df.sort_values(by='importance', ascending=False)

In [None]:
# Predict on the test features with the best iteration recorded during training.
pred_y = lgb_model.predict(test_x, num_iteration=lgb_model.best_iteration)

# The model was fitted on log(ata), so exponentiate to get back to the
# original scale before writing the submission.
# NOTE(review): astype('int') truncates toward zero rather than rounding —
# confirm this is the intended conversion for the submission format.
submit = pd.DataFrame({
    'id': test['order_id'],
    'result': np.exp(pred_y).astype('int'),
})

# Use one timestamp for both artefacts so a submission can be matched to the
# model that produced it; the format avoids spaces and colons, which are
# awkward or invalid in file names on some systems.
run_stamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')

# Make sure the output folders exist so the results of a long training run
# are not lost to a missing-directory error.
os.makedirs('../results', exist_ok=True)
os.makedirs('../saved_model', exist_ok=True)

submit.to_csv(f'../results/lgb_submission_{run_stamp}.csv', index=False)
joblib.dump(lgb_model, f"../saved_model/lgb_model_{run_stamp}")