In [1]:
# Notebook 출력설정
# 주요 라이브러리 임포트

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
# Notebook display options: show complete cell contents and very wide/long frames.
# `None` is the supported way to disable column-width truncation; the legacy
# value -1 was deprecated in pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 2000)
pd.set_option('display.max_columns', 2000)
pd.set_option('display.width', 2000)

%matplotlib inline
import matplotlib.pylab as plt
# Default figure look for every plot in this notebook:
# wide figures, thin lines, grid switched on.
plt.rcParams.update({
    "figure.figsize": (15, 5),
    'lines.linewidth': 1,
    'axes.grid': True,
})
import seaborn as sns

In [11]:
# Location of the raw dataset.
# The dataset root defaults to the original local path but can be overridden
# with the DIR_DATASET environment variable, so the notebook also runs on
# machines that keep the data elsewhere.
import os

DIR_DATASET = os.environ.get("DIR_DATASET", "C:/Users/0stix/Datasets/")
if not DIR_DATASET.endswith(("/", "\\")):
    # Later cells build paths by plain string concatenation, so the root
    # must always end with a path separator.
    DIR_DATASET += "/"
NAME_PROJECT = "2203-kaggle-tps2203"

In [12]:
# Read the competition files; `row_id` is used as the index of every frame.
df_train = pd.read_csv(DIR_DATASET + NAME_PROJECT + '/train.csv', index_col='row_id')
df_test = pd.read_csv(DIR_DATASET + NAME_PROJECT + '/test.csv', index_col='row_id')
df_sub = pd.read_csv(DIR_DATASET + NAME_PROJECT + '/sample_submission.csv', index_col='row_id')

# Work on copies: the feature engineering further down assigns columns in
# place, and plain aliases (`train = df_train`) would silently modify the raw
# frames as well, making them unusable for inspection or for a re-run.
train = df_train.copy()
test = df_test.copy()

In [13]:
from pycaret.regression import *

In [14]:
import time

# Rebuild `train` from the raw frame every time this cell runs. Later in this
# cell `time` is overwritten with a numeric slot index, so re-parsing an
# already processed `train` would silently produce nonsense timestamps.
train = df_train.copy()
train['time'] = pd.to_datetime(train['time'])

# Drop the official holidays (compared on the calendar date of each timestamp).
official_holidays = pd.to_datetime(['1991-05-27', '1991-07-04', '1991-09-02'])
is_holiday = train['time'].dt.normalize().isin(official_holidays)

# Train on Mondays to Thursdays (weekday 0-3) and on months after April only.
is_mon_to_thu = train['time'].dt.weekday < 4
is_after_april = train['time'].dt.month > 4

# `.copy()` turns the filtered selection into an independent frame, so the
# column assignments done by the preprocessing below do not act on a slice.
train = train[~is_holiday & is_mon_to_thu & is_after_april].copy()

def pre_process(df):
    """Add calendar and roadway key features to ``df`` in place.

    Expects the columns ``time`` (parseable by ``pd.to_datetime``), ``x``,
    ``y`` and ``direction``. Nothing is returned; ``df`` itself is modified:

    * ``month``, ``day`` (day of year), ``wkday`` (0 = Monday) come from the timestamp;
    * ``am`` is True for hours 7 to 11;
    * ``time`` is replaced by a 20-minute slot number relative to noon
      (12:00 -> 0.0, 12:20 -> 1.0, 11:40 -> -1.0);
    * ``xydir`` is x + y + direction as text, ``xydirday`` appends the day of
      year and ``all`` appends the slot number.
    """
    stamp = pd.to_datetime(df['time'])
    hour = stamp.dt.hour

    # Calendar features, created in the same column order as before.
    df['month'] = stamp.dt.month
    df['day'] = stamp.dt.dayofyear
    df['am'] = (hour < 12) & (hour > 6)
    df['wkday'] = stamp.dt.weekday

    # Three 20-minute slots per hour, counted from noon.
    df['time'] = (hour - 12) * 3 + stamp.dt.minute / 20

    # Text keys identifying a roadway, a roadway-day and a roadway-slot.
    location = df['x'].astype(str) + df['y'].astype(str)
    df['xydirday'] = location + df['direction'] + df['day'].astype(str)
    df['xydir'] = location + df['direction']
    df['all'] = df['xydir'] + df['time'].astype(str)
    
# `pre_process` rewrites the `time` column in place, so always start from an
# untouched copy of the raw test frame; otherwise re-running this cell would
# parse the already encoded slot numbers as timestamps.
test = df_test.copy()

pre_process(train)
pre_process(test)

# Median congestion per roadway and time-of-day slot (the `all` key),
# computed on the training rows and used as a feature in both frames.
mapper_avg = train.groupby('all')['congestion'].median().to_dict()

train['avg'] = train['all'].map(mapper_avg)
test['avg'] = test['all'].map(mapper_avg)

# Keep only the slots from noon onwards (slot number >= 0). `.copy()` makes
# the result an independent frame rather than a slice of the previous one.
train = train[train['time'] >= 0].copy()

In [15]:
# PyCaret regression experiment configuration, collected in one place so the
# individual options are easy to scan and tweak.
setup_options = dict(
    data=train,
    target='congestion',
    session_id=999,
    data_split_shuffle=True,
    create_clusters=False,
    # 4-fold group k-fold, with the weekday column as the grouping key
    fold_strategy='groupkfold',
    fold_groups='wkday',
    fold=4,
    use_gpu=False,
    silent=True,
    # helper key columns that are excluded from the feature set
    ignore_features=['all', 'day', 'xydirday'],
    n_jobs=-1,
)
reg = setup(**setup_options)

Unnamed: 0,Description,Value
0,session_id,999
1,Target,congestion
2,Original Data,"(193700, 13)"
3,Missing Values,0
4,Numeric Features,2
5,Categorical Features,7
6,Ordinal Features,0
7,High Cardinality Features,0
8,High Cardinality Method,
9,Transformed Train Set,"(135589, 92)"


In [16]:
# Rank the candidate regressors by MAE and keep the four best ones;
# the listed model ids are left out of the comparison.
excluded_models = ['lar', 'rf', 'et', 'gbr']
top_ = compare_models(sort='MAE', n_select=4, exclude=excluded_models)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,5.4519,58.9768,7.6796,0.7899,0.1971,0.1286,3.8325
lightgbm,Light Gradient Boosting Machine,5.47,59.6567,7.7237,0.7875,0.1982,0.1292,0.305
xgboost,Extreme Gradient Boosting,5.4825,59.6317,7.7221,0.7876,0.198,0.1294,4.445
huber,Huber Regressor,5.4834,61.4006,7.8358,0.7813,0.1997,0.1273,6.9575
lasso,Lasso Regression,5.521,61.8099,7.8619,0.7798,0.201,0.1293,0.64
en,Elastic Net,5.521,61.8098,7.8619,0.7798,0.201,0.1293,0.12
omp,Orthogonal Matching Pursuit,5.5239,61.1793,7.8217,0.7821,0.2002,0.13,0.1475
br,Bayesian Ridge,5.5272,61.0999,7.8166,0.7824,0.2001,0.1302,1.3075
ridge,Ridge Regression,5.5338,61.1029,7.8168,0.7824,0.2001,0.1304,0.6675
lr,Linear Regression,5.5339,61.1029,7.8168,0.7824,0.2001,0.1304,1.5625


In [27]:
# Tune each selected model for MAE with 100 search iterations,
# keeping the results in the same order as `top_`.
tuned_ = [
    tune_model(candidate, n_iter=100, optimize='MAE')
    for candidate in top_
]

Unnamed: 0_level_0,Unnamed: 1_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
CV-Val,0,5.4555,62.0647,7.8781,0.7762,0.1982,0.1247
CV-Val,1,5.4811,62.0065,7.8744,0.7796,0.1987,0.1273
CV-Val,2,5.4417,61.5825,7.8475,0.7799,0.1956,0.1229
CV-Val,3,5.4689,62.2658,7.8909,0.7812,0.2088,0.1283
CV-Val,Mean,5.4618,61.9799,7.8727,0.7792,0.2003,0.1258
CV-Val,Std,0.0147,0.2488,0.0158,0.0018,0.005,0.0021
Train,,5.4615,61.9821,7.8729,0.7792,0.2002,0.1257


In [28]:
# Blend all tuned models into one estimator via PyCaret's `blend_models`
# (default settings; the cross-validation scores of the blend are shown below).
blended_ = blend_models(tuned_)

Unnamed: 0_level_0,Unnamed: 1_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
CV-Val,0,5.4165,59.0083,7.6817,0.7873,0.1945,0.1259
CV-Val,1,5.4445,59.0208,7.6825,0.7902,0.1951,0.1288
CV-Val,2,5.4152,58.7781,7.6667,0.79,0.1923,0.1247
CV-Val,3,5.4505,59.7064,7.727,0.7902,0.2059,0.1303
CV-Val,Mean,5.4317,59.1284,7.6895,0.7894,0.197,0.1274
CV-Val,Std,0.016,0.3474,0.0226,0.0012,0.0053,0.0023
Train,,5.3087,56.1164,7.4911,0.8001,0.1924,0.1243


In [29]:
# Finalize the blended model with PyCaret's `finalize_model` before predicting.
# NOTE(review): presumably this refits on the complete dataset including the
# hold-out split - confirm against the PyCaret documentation.
finalized_ = finalize_model(blended_)

In [30]:
# Score the test rows with the finalized model and store the prediction,
# rounded to the nearest whole number, as `pred`.
test_scored = predict_model(finalized_, data=test)
test['pred'] = test_scored['Label'].round()

# Idea from https://www.kaggle.com/code/ambrosm/tpsmar22-generalizing-the-special-values

# Clip every prediction to the 15 %-70 % congestion quantile range observed
# for the same slot and roadway in the late training data
# (day of year >= 246, i.e. from 3 September 1991 onwards; slots from noon).
clip_keys = ['time', 'x', 'y', 'direction']
sep = train[(train.day >= 246) & (train.time >= 0)]

# One row of bounds per (slot, roadway) group, indexed by the group keys.
bounds = sep.groupby(clip_keys).congestion.quantile([0.15, 0.7]).unstack()
bounds.columns = ['lower', 'upper']

# Align the bounds to the test rows by key rather than by position: the
# positional variant only works if `test` holds exactly one row per group in
# sorted group order, and would otherwise fail or clip with the wrong bounds.
aligned_bounds = test[clip_keys].join(bounds, on=clip_keys)
lower = aligned_bounds['lower']
upper = aligned_bounds['upper']

# Rows without a matching group get NaN bounds and are left unclipped.
test['pred'] = test['pred'].clip(lower=lower, upper=upper)

# For every roadway that has more than two congestion values occurring over
# 200 times each in the training data, replace each prediction with the
# closest of those frequent values (thresholds are experimental).

for road in set(test.xydir):

    value_freq = train.loc[train.xydir == road, 'congestion'].value_counts()
    frequent_values = list(value_freq[value_freq > 200].index)

    if len(frequent_values) <= 2:
        continue

    road_rows = test.xydir == road
    test.loc[road_rows, 'pred'] = test.loc[road_rows, 'pred'].map(
        lambda pred: min(frequent_values, key=lambda value: abs(value - pred))
    )

In [31]:
# Submission frame: one row per test `row_id` with its final prediction.
sub = pd.DataFrame({
    'row_id': test.index.to_numpy(),
    'congestion': test['pred'].to_numpy(),
})

import datetime

# Timestamp the file name so earlier submissions are never overwritten.
str_datetime = datetime.datetime.now().strftime("%y%m%d_%H%M%S")
submission_path = DIR_DATASET + 'submission-' + NAME_PROJECT + '-' + str_datetime + '.csv'
sub.to_csv(submission_path, index=False)
sub.head()

Unnamed: 0,row_id,congestion
0,848835,50.0
1,848836,35.0
2,848837,53.0
3,848838,26.0
4,848839,71.0
