In [1]:
import numpy as np
import pandas as pd
import json
import pickle
from sklearn.metrics import f1_score
import lightgbm as lgb
from sklearn.model_selection import KFold,train_test_split
from sklearn.metrics import roc_curve
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

# Let pandas render up to 200 rows before truncating a displayed frame.
pd.options.display.max_rows = 200

In [2]:
# Directory that holds the input CSVs. Kept in one place so the notebook can be
# pointed at another location by editing a single line.
DATA_DIR = '/media/fan/hdd/giscup'

# Read the training and test tables.
train = pd.read_csv(DATA_DIR + '/train.csv')
test = pd.read_csv(DATA_DIR + '/test.csv')

In [3]:
# Every test row is assigned the same date string before conversion.
test['date'] = '20200901'

# Convert the date column of both frames to datetime64 (via its string form).
for frame in (train, test):
    frame['date'] = pd.to_datetime(frame['date'].astype('str'))

In [4]:
# Derive the same extra feature columns on both frames, in place.
# NOTE(review): the feature list recorded further down in this notebook shows
# column names such as 'next_no_1.0' and 'link_no_0'; confirm that the
# 'current_*', 'next_*' and 'type_*' columns read below exist in the CSVs.
for df in (train, test):
    dates = df['date']
    # Days elapsed since 2020-08-01.
    df['day_bias'] = (dates - pd.Timestamp(2020, 8, 1)).dt.days
    # True for Monday..Friday (pandas weekday: Monday == 0).
    df['weekday'] = dates.dt.weekday < 5
    # Zero out cross_no on rows where cross_sum is not positive.
    df['cross_no'] = df['cross_no'] * (df['cross_sum'] > 0)
    df['link_length'] = df['distance'] / df['link_no']
    df['log_distance'] = np.log(df['distance'])
    df['log_simple_eta'] = np.log(df['simple_eta'])

    # Denominators shared by every ratio feature below.
    link_count = df['link_no']
    eta_minus_cross = df['simple_eta'] - df['cross_sum']

    # Column creation order is kept identical to the original cell because the
    # feature list is later derived from the frame's column order.
    for idx in range(5):
        for group in ('current', 'next'):
            df['{}_no_ratio_{}'.format(group, idx)] = (
                df['{}_no_{}'.format(group, idx)] / link_count
            )
            df['{}_time_ratio_{}'.format(group, idx)] = (
                df['{}_time_sum_{}'.format(group, idx)] / eta_minus_cross
            )
    for idx in range(4):
        df['type_no_ratio_{}'.format(idx)] = df['type_no_{}'.format(idx)] / link_count
        df['type_time_ratio_{}'.format(idx)] = (
            df['type_time_sum_{}'.format(idx)] / eta_minus_cross
        )

In [5]:
def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Downcast numeric columns of ``df`` in place to narrower dtypes.

    For each column whose dtype is one of int16/int32/int64/float16/float32/
    float64, the column minimum and maximum are compared (strictly) against
    the representable range of progressively wider dtypes, and the column is
    cast to the first dtype whose range contains both.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned, so the caller's object
        is modified.
    verbose : bool, default True
        When true, print the memory footprint after downcasting and the
        percentage saved. When false, nothing is printed.
    use_float16 : bool, default True
        When true (the historical behaviour), float columns whose values fit
        the float16 range are stored as float16. The range check does not
        protect precision: float16 has an 11-bit significand, so values are
        rounded. Pass False to use float32 as the narrowest float dtype.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            # Non-numeric (and already-int8) columns are left untouched.
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # If no candidate matched, the column keeps its current dtype.
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (this includes NaN or infinite
                # extrema, for which every comparison is false).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df
# Downcast the numeric columns of `train` to cut memory use.
# NOTE(review): only `train` is downcast; `test` keeps the dtypes it was read
# with. Float columns that fit the float16 range are stored as float16, so
# train and test features may differ in precision -- confirm this is acceptable.
train = reduce_mem_usage(train)

Memory usage after optimization is: 1856.30 MB
Decreased by 73.7%


In [6]:
# Columns that are never fed to the model: 'ata' becomes the target below, the
# rest are excluded by name.
non_feature_cols = (
    'order_id', 'ata', 'date', 'driver_id',
    'arrival_0', 'arrival_1', 'arrival_2', 'arrival_3', 'arrival_4',
)
num_cols = [col for col in train.columns if col not in non_feature_cols]
cat_cols = []  # no column is treated as categorical
features = num_cols + cat_cols

train_x, test_x = train[features], test[features]
# The model is fit on the natural log of 'ata'.
train_y = np.log(train['ata'])
print(len(features), features)

104 ['slice_id', 'distance', 'simple_eta', 'speed', 'speed_avg', 'speed_std', 'speed_min', 'speed_max', 'volume', 'old_driver', 'link_no', 'link_time_sum', 'link_time_avg', 'link_time_std', 'link_time_max', 'link_time_min', 'link_no_0', 'link_no_1', 'link_no_2', 'link_no_3', 'link_no_4', 'link_time_sum_0', 'link_time_sum_1', 'link_time_sum_2', 'link_time_sum_3', 'link_time_sum_4', 'link_time_avg_0', 'link_time_avg_1', 'link_time_avg_2', 'link_time_avg_3', 'link_time_avg_4', 'link_time_std_0', 'link_time_std_1', 'link_time_std_2', 'link_time_std_3', 'link_time_std_4', 'link_time_max_0', 'link_time_max_1', 'link_time_max_2', 'link_time_max_3', 'link_time_max_4', 'link_time_min_0', 'link_time_min_1', 'link_time_min_2', 'link_time_min_3', 'link_time_min_4', 'next_no_1.0', 'next_no_2.0', 'next_no_3.0', 'next_no_4.0', 'next_time_sum_1.0', 'next_time_sum_2.0', 'next_time_sum_3.0', 'next_time_sum_4.0', 'next_time_avg_1.0', 'next_time_avg_2.0', 'next_time_avg_3.0', 'next_time_avg_4.0', 'next_ti

In [7]:
     
# Parameter dictionary handed to lgb.train.
# 'num_leaves' is deliberately not set here (a value of 100 was tried and
# commented out in an earlier revision of this cell).
lgb_paras = dict(
    objective='regression',
    metrics='rmse',
    learning_rate=0.1,
    max_depth=-1,
    lambda_l1=0.01,
    lambda_l2=0.02,
    seed=416,
    feature_fraction=1,
    bagging_fraction=0.8,
    boost_from_average='false',
    bagging_freq=3,
    num_threads=6,
    verbose=-1,
    # NOTE(review): presumably requires a GPU-enabled LightGBM build -- confirm.
    device='gpu',
)

In [8]:
# Hold out whole days rather than random rows: the distinct dates are split
# 80/20 and every row follows its date into the training or validation part.
days = train['date'].unique()
tr_days, val_days = train_test_split(days, test_size=0.2, random_state=416)

in_training_days = train['date'].isin(tr_days)
in_validation_days = train['date'].isin(val_days)
tr_x, tr_y = train_x[in_training_days], train_y[in_training_days]
val_x, val_y = train_x[in_validation_days], train_y[in_validation_days]

train_set = lgb.Dataset(tr_x, tr_y, categorical_feature=cat_cols)
val_set = lgb.Dataset(val_x, val_y, categorical_feature=cat_cols)

# NOTE(review): newer LightGBM releases dropped the `early_stopping_rounds` and
# `verbose_eval` keyword arguments of train() in favour of callbacks -- confirm
# the installed version before upgrading.
lgb_model = lgb.train(
    lgb_paras,
    train_set,
    num_boost_round=1000,
    valid_sets=[train_set, val_set],
    early_stopping_rounds=100,
    verbose_eval=100,
    categorical_feature=cat_cols,
)

Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.18499	valid_1's rmse: 0.18308
[200]	training's rmse: 0.183645	valid_1's rmse: 0.181972
[300]	training's rmse: 0.182984	valid_1's rmse: 0.181553
[400]	training's rmse: 0.182523	valid_1's rmse: 0.181268
[500]	training's rmse: 0.182152	valid_1's rmse: 0.181094
[600]	training's rmse: 0.181816	valid_1's rmse: 0.180936
[700]	training's rmse: 0.181519	valid_1's rmse: 0.180819
[800]	training's rmse: 0.181239	valid_1's rmse: 0.180719
[900]	training's rmse: 0.180972	valid_1's rmse: 0.180623
[1000]	training's rmse: 0.180701	valid_1's rmse: 0.180501
[1100]	training's rmse: 0.180455	valid_1's rmse: 0.180431
[1200]	training's rmse: 0.180232	valid_1's rmse: 0.180361
[1300]	training's rmse: 0.180006	valid_1's rmse: 0.180295
[1400]	training's rmse: 0.179789	valid_1's rmse: 0.180236
[1500]	training's rmse: 0.179582	valid_1's rmse: 0.180187
[1600]	training's rmse: 0.179394	valid_1's rmse: 0.180144
[1700]	training's rms

In [9]:
# Pair each feature name with the importance value reported by the trained
# model, then display the table sorted in ascending order of importance.
feature_importance_df = pd.DataFrame({
    'feature': features,
    'importance': lgb_model.feature_importance(),
})
feature_importance_df.sort_values(by='importance')

Unnamed: 0,feature,importance
49,next_no_4.0,429
19,link_no_3,544
18,link_no_2,544
20,link_no_4,561
82,weekday,739
16,link_no_0,913
30,link_time_avg_4,933
25,link_time_sum_4,986
24,link_time_sum_3,987
35,link_time_std_4,1044


In [10]:
# Predict on the test features using the iteration the model recorded as best.
# The predictions are on the log scale used for train_y.
best_round = lgb_model.best_iteration
pred_y = lgb_model.predict(test_x, num_iteration=best_round)

In [11]:
# Build the two-column submission table: order id and predicted value.
# np.exp undoes the log applied to the target before training.
# NOTE(review): astype('int') truncates toward zero instead of rounding --
# confirm that truncation is the intended conversion.
submit = pd.DataFrame({
    'id': test['order_id'],
    'result': np.exp(pred_y).astype('int'),
})

In [12]:
# Write the submission to the working directory without the index column.
submit.to_csv(path_or_buf='submission.csv', index=False)

In [None]:
# Plot the importances as horizontal bars, sorted, so that each feature name
# gets its own readable row; a vertical bar chart with one x tick label per
# feature overlaps badly once there are many features.
importance_sorted = feature_importance_df.sort_values('importance')
fig, ax = plt.subplots(figsize=(8, max(4, 0.2 * len(importance_sorted))))
ax.barh(importance_sorted['feature'], importance_sorted['importance'])
ax.set_xlabel('importance')
ax.set_ylabel('feature')
ax.set_title('LightGBM feature importance')
fig.tight_layout()
plt.show()