In [1]:
import pandas as pd
import numpy as np 
from pycaret.regression import *

import warnings
warnings.filterwarnings("ignore")

In [2]:
# 원본 데이터셋 로드
dir_dataset = "C:/Users/0stix/Datasets/"
name_project = '2203-kaggle-tps2203'
df_train = pd.read_csv(dir_dataset+name_project+'/train.csv')
df_test = pd.read_csv(dir_dataset+name_project+'/test.csv')
df_sub = pd.read_csv(dir_dataset+name_project+'/sample_submission.csv')

len_train = len(df_train)
df_all = pd.concat([df_train, df_test], axis=0)
target = 'congestion'

In [3]:
train = pd.read_csv(dir_dataset+name_project+'/train.csv' ,index_col='row_id')
test = pd.read_csv(dir_dataset+name_project+'/test.csv' ,index_col='row_id')

In [4]:
train['time'] = pd.to_datetime(train.time)
# delete official holiday                                                 
train['offical_holiday'] = train.time.dt.date.astype(str).str.contains('1991-05-27|1991-07-04|1991-09-02').astype('int')
train=train[train['offical_holiday']==0]
train=train.drop('offical_holiday',axis=1)
# train on data for Mondays to Thursdays and complete months only
train=train[(train.time.dt.weekday< 4) & (train.time.dt.month > 4)] 

In [5]:
def pre_process(df): 
    df['time'] = pd.to_datetime(df['time'])
    df['month']= df.time.dt.month
    df['day']= df.time.dt.dayofyear
    df['am'] = (df.time.dt.hour < 12) & (df.time.dt.hour >6)
    df['wkday'] = df.time.dt.weekday
    df['time']= (df.time.dt.hour-12)*3+df.time.dt.minute/20
    df['xydirday']= df.x.astype(str)+df.y.astype(str)+df.direction+df.day.astype(str)
    df['xydir'] = df.x.astype(str)+df.y.astype(str)+df.direction
    df['all']= df['xydir']+df.time.astype(str)

In [6]:
pre_process(train)
pre_process(test)

In [42]:
train.head()

Unnamed: 0_level_0,time,x,y,direction,congestion,month,day,am,wkday,xydirday,xydir,all,avg
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
142220,0.0,0,0,EB,27,5,121,False,2,00EB121,00EB,00EB0.0,50.0
142221,0.0,0,0,NB,24,5,121,False,2,00NB121,00NB,00NB0.0,35.0
142222,0.0,0,0,SB,52,5,121,False,2,00SB121,00SB,00SB0.0,55.0
142223,0.0,0,1,EB,27,5,121,False,2,01EB121,01EB,01EB0.0,26.0
142224,0.0,0,1,NB,72,5,121,False,2,01NB121,01NB,01NB0.0,72.0


In [7]:
# calculate average traffic flow for each time of day 
mapper_avg = train[['all','congestion']].groupby(['all']).median().to_dict()['congestion']

In [8]:
train['avg']= train['all'].map(mapper_avg)
test['avg']= test['all'].map(mapper_avg)
train= train[train.time >=0]

In [43]:
train.shape, test.shape

((193700, 13), (2340, 13))

In [60]:
reg = setup(data = train,
            target = 'congestion',
            session_id=999,
            data_split_shuffle = True, 
            create_clusters = False,
            fold_strategy = 'groupkfold',
            fold_groups = 'wkday',
            use_gpu = False,
            silent = True,
            fold=4,
            ignore_features = ['all','day','xydirday'],
            n_jobs = -1)

Unnamed: 0,Description,Value
0,session_id,999
1,Target,congestion
2,Original Data,"(193700, 13)"
3,Missing Values,0
4,Numeric Features,2
5,Categorical Features,7
6,Ordinal Features,0
7,High Cardinality Features,0
8,High Cardinality Method,
9,Transformed Train Set,"(135589, 92)"


In [61]:
top4 = compare_models(sort = 'MAE', n_select=4, exclude = [
    # 'lar',  'rf', 'et', 'gbr', 
                                                           # 'xgboost'
                                                          ])

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,5.4519,58.9768,7.6796,0.7899,0.1971,0.1286,3.72
lightgbm,Light Gradient Boosting Machine,5.47,59.6567,7.7237,0.7875,0.1982,0.1292,0.305
xgboost,Extreme Gradient Boosting,5.4825,59.6317,7.7221,0.7876,0.198,0.1294,3.735
huber,Huber Regressor,5.4834,61.4006,7.8358,0.7813,0.1997,0.1273,6.5175
gbr,Gradient Boosting Regressor,5.5114,60.7048,7.7913,0.7838,0.1996,0.1301,4.9775
lasso,Lasso Regression,5.521,61.8099,7.8619,0.7798,0.201,0.1293,0.6775
en,Elastic Net,5.521,61.8098,7.8619,0.7798,0.201,0.1293,0.145
omp,Orthogonal Matching Pursuit,5.5239,61.1793,7.8217,0.7821,0.2002,0.13,0.135
br,Bayesian Ridge,5.5272,61.0999,7.8166,0.7824,0.2001,0.1302,1.2675
ridge,Ridge Regression,5.5338,61.1029,7.8168,0.7824,0.2001,0.1304,0.6725


In [31]:
blender = blend_models(top5)

Unnamed: 0_level_0,Unnamed: 1_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
CV-Val,0,5.4337,59.281,7.6994,0.7863,0.1949,0.1266
CV-Val,1,5.4593,59.2895,7.7,0.7892,0.1956,0.1295
CV-Val,2,5.4306,59.0668,7.6855,0.7889,0.1929,0.1254
CV-Val,3,5.4647,59.9624,7.7435,0.7893,0.2064,0.131
CV-Val,Mean,5.447,59.3999,7.7071,0.7884,0.1975,0.1281
CV-Val,Std,0.0151,0.3368,0.0218,0.0013,0.0052,0.0022
Train,,5.3712,57.5185,7.5841,0.7951,0.1945,0.1261


In [32]:
final = finalize_model(blender)

In [33]:
test['pred'] = (predict_model(final, data=test)['Label']).round()

# Idea from https://www.kaggle.com/code/ambrosm/tpsmar22-generalizing-the-special-values

sep = train[(train.day >= 246) & (train.time >= 0)]
lower = sep.groupby(['time', 'x', 'y', 'direction']).congestion.quantile(0.15).values
upper = sep.groupby(['time', 'x', 'y', 'direction']).congestion.quantile(0.7).values

test.pred = test.pred.clip(lower, upper)

In [34]:
# for roadways with low value count replace prediction with nearest value from the training data

for xydir in set(test.xydir):
    
    xydir_counts = train.loc[train.xydir ==xydir,'congestion'].value_counts()
    
    l = xydir_counts[(xydir_counts > 200)] # experimental
    if len(l) > 2: # experimental
        l = list(l.index)
        test.loc[test.xydir ==xydir,'pred']= test.loc[test.xydir ==xydir,'pred'].map(lambda y: min(l, key=lambda x:abs(x-y)))

In [35]:
sub = pd.DataFrame(list(zip(test.index,test.pred.values)),columns = ['row_id', 'congestion'])
sub.to_csv('submission.csv', index = False)
sub.head()

Unnamed: 0,row_id,congestion
0,848835,50.0
1,848836,35.0
2,848837,53.0
3,848838,26.0
4,848839,71.0


In [39]:
top5

[<catboost.core.CatBoostRegressor at 0x10ea921ad60>,
 LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=999, reg_alpha=0.0, reg_lambda=0.0, silent='warn',
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
 XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=-1,
              num_parallel_tree=1, objective='reg:squarederror',

In [53]:
reg[8].head()

AttributeError: 'int' object has no attribute 'head'

In [55]:
for each_ in reg:
    print(type(each_))

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'NoneType'>
<class 'dict'>
<class 'str'>
<class 'int'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>
<class 'int'>
<class 'int'>
<class 'dict'>
<class 'NoneType'>
<class 'bool'>
<class 'list'>
<class 'str'>
<class 'pandas.core.frame.DataFrame'>
<class 'bool'>
<class 'int'>
<class 'pandas.core.frame.DataFrame'>
<class 'str'>
<class 'bool'>
<class 'sklearn.model_selection._split.GroupKFold'>
<class 'pandas.core.frame.DataFrame'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'dict'>
<class 'bool'>
<class 'bool'>
<class 'pandas.core.series.Series'>
<class 'str'>
<class 'bool'>
<class 'pycaret.internal.pipeline.Pipeline'>
<class 'str'>
<enum 'MLUsecase'>
<class 'int'>
<class 'bool'>
<class 'sklearn.pipeline.Pipeline'>
<class 'str'>
<class 'dict'>
<class 'bool'>
<class 'pandas.core.series.Series'>
<class 'set'>


In [58]:
reg[7].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 193700 entries, 142220 to 832454
Data columns (total 92 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   time          193700 non-null  float32
 1   avg           193700 non-null  float32
 2   x_0           193700 non-null  float32
 3   x_1           193700 non-null  float32
 4   x_2           193700 non-null  float32
 5   y_0           193700 non-null  float32
 6   y_1           193700 non-null  float32
 7   y_2           193700 non-null  float32
 8   y_3           193700 non-null  float32
 9   direction_EB  193700 non-null  float32
 10  direction_NB  193700 non-null  float32
 11  direction_NE  193700 non-null  float32
 12  direction_NW  193700 non-null  float32
 13  direction_SB  193700 non-null  float32
 14  direction_SE  193700 non-null  float32
 15  direction_SW  193700 non-null  float32
 16  direction_WB  193700 non-null  float32
 17  month_5       193700 non-null  float32
 18 

In [59]:
reg[7].head()

Unnamed: 0_level_0,time,avg,x_0,x_1,x_2,y_0,y_1,y_2,y_3,direction_EB,...,xydir_22SB,xydir_22SE,xydir_22SW,xydir_22WB,xydir_23EB,xydir_23NB,xydir_23NE,xydir_23SB,xydir_23SW,xydir_23WB
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
142220,0.0,50.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
142221,0.0,35.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
142222,0.0,55.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
142223,0.0,26.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
142224,0.0,72.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
