## Import

In [1]:
import random
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings(action='ignore')

## Fixed Random-Seed

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

## Load Data

In [3]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

In [4]:
train_df

Unnamed: 0,ID,timestamp,item,corporation,location,supply(kg),price(원/kg)
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0
...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,RD,F,J,452440.0,468.0
59393,RD_F_J_20230228,2023-02-28,RD,F,J,421980.0,531.0
59394,RD_F_J_20230301,2023-03-01,RD,F,J,382980.0,574.0
59395,RD_F_J_20230302,2023-03-02,RD,F,J,477220.0,523.0


In [5]:
test_df

Unnamed: 0,ID,timestamp,item,corporation,location
0,TG_A_J_20230304,2023-03-04,TG,A,J
1,TG_A_J_20230305,2023-03-05,TG,A,J
2,TG_A_J_20230306,2023-03-06,TG,A,J
3,TG_A_J_20230307,2023-03-07,TG,A,J
4,TG_A_J_20230308,2023-03-08,TG,A,J
...,...,...,...,...,...
1087,RD_F_J_20230327,2023-03-27,RD,F,J
1088,RD_F_J_20230328,2023-03-28,RD,F,J
1089,RD_F_J_20230329,2023-03-29,RD,F,J
1090,RD_F_J_20230330,2023-03-30,RD,F,J


## Data Pre-Processing

In [6]:
import holidays
import pandas as pd

def add_weekday(df) :
    df['date'] = pd.to_datetime(df['timestamp'])
    df['weekcode'] = df['date'].dt.weekday

    year_min = df['date'].dt.date.unique().min().year
    year_max = df['date'].dt.date.unique().max().year
    kor_holidays = list(holidays.KOR(years=range(year_min, year_max)).keys())
    idx_kor_holidays = pd.to_datetime(kor_holidays)

    df_temp = df[['date', 'weekcode']].copy()
    df_temp.set_index('date', inplace=True)
    #공휴일은 10으로 표시
    df_temp.loc[idx_kor_holidays, 'weekcode'] = 10
    df_temp.reset_index(inplace=True)

    df['weekcode'] = df_temp['weekcode']

    return df

In [7]:
train_df = add_weekday(train_df)
test_df = add_weekday(test_df)

In [8]:
train_df

Unnamed: 0,ID,timestamp,item,corporation,location,supply(kg),price(원/kg),date,weekcode
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0,2019-01-01,10
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0,2019-01-02,2
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0,2019-01-03,3
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0,2019-01-04,4
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0,2019-01-05,5
...,...,...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,RD,F,J,452440.0,468.0,2023-02-27,0
59393,RD_F_J_20230228,2023-02-28,RD,F,J,421980.0,531.0,2023-02-28,1
59394,RD_F_J_20230301,2023-03-01,RD,F,J,382980.0,574.0,2023-03-01,2
59395,RD_F_J_20230302,2023-03-02,RD,F,J,477220.0,523.0,2023-03-02,3


In [9]:
def split_time_stamp(train_df, test_df):
    #시계열 특성을 학습에 반영하기 위해 timestamp를 월, 일, 시간으로 나눕니다
    train_df['year'] = train_df['timestamp'].apply(lambda x : int(x[0:4]))
    train_df['month'] = train_df['timestamp'].apply(lambda x : int(x[5:7]))
    train_df['day'] = train_df['timestamp'].apply(lambda x : int(x[8:10]))

    test_df['year'] = test_df['timestamp'].apply(lambda x : int(x[0:4]))
    test_df['month'] = test_df['timestamp'].apply(lambda x : int(x[5:7]))
    test_df['day'] = test_df['timestamp'].apply(lambda x : int(x[8:10]))

    return train_df, test_df

In [10]:
train_df, test_df = split_time_stamp(train_df, test_df)

test_df

Unnamed: 0,ID,timestamp,item,corporation,location,date,weekcode,year,month,day
0,TG_A_J_20230304,2023-03-04,TG,A,J,2023-03-04,5,2023,3,4
1,TG_A_J_20230305,2023-03-05,TG,A,J,2023-03-05,6,2023,3,5
2,TG_A_J_20230306,2023-03-06,TG,A,J,2023-03-06,0,2023,3,6
3,TG_A_J_20230307,2023-03-07,TG,A,J,2023-03-07,1,2023,3,7
4,TG_A_J_20230308,2023-03-08,TG,A,J,2023-03-08,2,2023,3,8
...,...,...,...,...,...,...,...,...,...,...
1087,RD_F_J_20230327,2023-03-27,RD,F,J,2023-03-27,0,2023,3,27
1088,RD_F_J_20230328,2023-03-28,RD,F,J,2023-03-28,1,2023,3,28
1089,RD_F_J_20230329,2023-03-29,RD,F,J,2023-03-29,2,2023,3,29
1090,RD_F_J_20230330,2023-03-30,RD,F,J,2023-03-30,3,2023,3,30


In [11]:
#학습에 사용하지 않을 변수들을 제거합니다
train_df_mod = train_df.drop(columns=['ID', 'timestamp', 'supply(kg)', 'date'])
test_x = test_df.drop(columns=['ID', 'timestamp', 'date'])

In [12]:
from sklearn.model_selection import train_test_split

train_set, val_set = train_test_split(train_df_mod, test_size=0.2, random_state=42)

In [13]:
train_x = train_set.drop(columns='price(원/kg)')
train_y = train_set['price(원/kg)']

val_x = val_set.drop(columns='price(원/kg)')
val_y = val_set['price(원/kg)']

In [14]:
#질적 변수들을 수치화합니다
qual_col = ['item', 'corporation', 'location']

for i in qual_col:
    le = LabelEncoder()
    train_x[i]=le.fit_transform(train_x[i])
    val_x[i]=le.transform(val_x[i])
    test_x[i]=le.transform(test_x[i]) #test 데이터에 대해서 fit하는 것은 data leakage에 해당합니다

print('Done.')

Done.


In [15]:
train_x

Unnamed: 0,item,corporation,location,weekcode,year,month,day
3607,4,1,0,2,2020,7,15
28882,1,0,1,6,2023,1,8
40072,3,4,0,6,2020,4,19
29223,1,3,0,0,2019,10,14
50088,0,2,0,2,2022,9,14
...,...,...,...,...,...,...,...
54343,0,4,0,3,2021,11,4
38158,3,3,1,0,2019,3,25
860,4,0,0,0,2021,5,10
15795,2,0,0,6,2020,7,19


## Regression Model Fit

In [16]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

ada_reg = AdaBoostRegressor(
 DecisionTreeRegressor(max_depth=15), n_estimators=100,
 loss="square", learning_rate=0.5, random_state=42)
ada_reg.fit(train_x, train_y)

In [17]:
from sklearn.metrics import mean_squared_error

val_ada_predict = ada_reg.predict(val_x)
mse_ada = mean_squared_error(val_ada_predict, val_y)
rmse_ada = np.sqrt(mse_ada)
rmse_ada

895.2888647085267

## Inference

In [18]:
preds = ada_reg.predict(test_x)

## Submission

In [19]:
submission = pd.read_csv('./sample_submission.csv')
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,0
1,TG_A_J_20230305,0
2,TG_A_J_20230306,0
3,TG_A_J_20230307,0
4,TG_A_J_20230308,0
...,...,...
1087,RD_F_J_20230327,0
1088,RD_F_J_20230328,0
1089,RD_F_J_20230329,0
1090,RD_F_J_20230330,0


In [20]:
submission['answer'] = preds
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,3870.000000
1,TG_A_J_20230305,0.000000
2,TG_A_J_20230306,3542.000000
3,TG_A_J_20230307,3603.555556
4,TG_A_J_20230308,3558.000000
...,...,...
1087,RD_F_J_20230327,476.722222
1088,RD_F_J_20230328,475.000000
1089,RD_F_J_20230329,471.777778
1090,RD_F_J_20230330,440.000000


In [21]:
submission.to_csv('./baseline_submission.csv', index=False)