In [1]:
import random
import pandas as pd
import numpy as np
import os
import tensorflow as tf
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor

path = os.getcwd()
file_path = os.path.join(path,"file")
sub_path = os.path.join(path,"제출")
train_df = pd.read_csv(os.path.join(file_path,"train.csv"))
test_df = pd.read_csv(os.path.join(file_path,"test.csv"))

In [2]:
display(train_df.head())

Unnamed: 0,ID,timestamp,item,corporation,location,supply(kg),price(원/kg)
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0


In [3]:
#시계열 특성을 학습에 반영하기 위해 timestamp를 월, 일, 시간으로 나눕니다
train_df['year'] = train_df['timestamp'].apply(lambda x : int(x[0:4]))
train_df['month'] = train_df['timestamp'].apply(lambda x : int(x[5:7]))
train_df['day'] = train_df['timestamp'].apply(lambda x : int(x[8:10]))

test_df['year'] = test_df['timestamp'].apply(lambda x : int(x[0:4]))
test_df['month'] = test_df['timestamp'].apply(lambda x : int(x[5:7]))
test_df['day'] = test_df['timestamp'].apply(lambda x : int(x[8:10]))

In [4]:
from pytimekr import pytimekr
import datetime

List_holiday = []
for i in range(5,1551,7):
    List_holiday.append((datetime.datetime(2019, 1, 1)+datetime.timedelta(i)).strftime("%Y-%m-%d"))
    
for i in range(2019,2024):
    List_holiday.extend(pytimekr.holidays(year=i)) #리스트 형태로 반환

for i in range(len(List_holiday)):
    List_holiday[i] = str(List_holiday[i])
np.sort(List_holiday)

array(['2019-01-01', '2019-01-06', '2019-01-13', '2019-01-20',
       '2019-01-27', '2019-02-03', '2019-02-04', '2019-02-05',
       '2019-02-06', '2019-02-10', '2019-02-17', '2019-02-24',
       '2019-03-01', '2019-03-03', '2019-03-10', '2019-03-17',
       '2019-03-24', '2019-03-31', '2019-04-07', '2019-04-14',
       '2019-04-21', '2019-04-28', '2019-05-05', '2019-05-05',
       '2019-05-12', '2019-05-12', '2019-05-19', '2019-05-26',
       '2019-06-02', '2019-06-06', '2019-06-09', '2019-06-16',
       '2019-06-23', '2019-06-30', '2019-07-07', '2019-07-14',
       '2019-07-21', '2019-07-28', '2019-08-04', '2019-08-11',
       '2019-08-15', '2019-08-18', '2019-08-25', '2019-09-01',
       '2019-09-08', '2019-09-12', '2019-09-13', '2019-09-14',
       '2019-09-15', '2019-09-22', '2019-09-29', '2019-10-03',
       '2019-10-06', '2019-10-09', '2019-10-13', '2019-10-20',
       '2019-10-27', '2019-11-03', '2019-11-10', '2019-11-17',
       '2019-11-24', '2019-12-01', '2019-12-08', '2019-

In [5]:
train_df.loc[train_df['timestamp'].isin(List_holiday)==True,'Holiday']=0
train_df.loc[train_df['timestamp'].isin(List_holiday)==False,'Holiday']=1
test_df.loc[test_df['timestamp'].isin(List_holiday)==True,'Holiday']=0
test_df.loc[test_df['timestamp'].isin(List_holiday)==False,'Holiday']=1

In [6]:
train_x = train_df.drop(columns=['ID', 'timestamp', 'supply(kg)', 'price(원/kg)'])
train_y = train_df['price(원/kg)']

test_x = test_df.drop(columns=['ID', 'timestamp'])

In [7]:
qual_col = ['item', 'corporation', 'location']

for i in qual_col:
    le = LabelEncoder()
    train_x[i]=le.fit_transform(train_x[i])
    test_x[i]=le.transform(test_x[i]) #test 데이터에 대해서 fit하는 것은 data leakage에 해당합니다

print('Done.')

Done.


In [8]:
# def minmaxscaler(data,Min=[],Max=[]):
#     if len(Min)==0 and len(Max)==0:
#         return (data-np.min(data,axis=0))/(np.max(data,axis=0)-np.min(data,axis=0)), np.min(data,axis=0),np.max(data,axis=0)
#     else:
#         return (data-Min)/(Max-Min)
# def standardscaler(data,Mean=[],Std=[]):
#     if len(Mean)==0 and len(Std)==0:
#         return (data-np.mean(data,axis=0))/np.std(data,axis=0), np.mean(data,axis=0),np.std(data,axis=0)
#     else:
#         return (data - Mean) / Std
# re_train_x,Mean_train,Std_train = standardscaler(train_x)
# re_test_x = standardscaler(test_x,Mean_train,Std_train)

In [9]:
model = XGBRegressor(gamma=0,importance_type='gain',random_state=0, max_depth=100 ,n_estimators=2000)
model.fit(train_x, train_y, eval_metric = 'rmse')



In [10]:
preds = model.predict(test_x)

In [11]:
print(preds[:30])

[ 3.3873621e+03 -2.4997169e-04  3.5485964e+03  3.5483738e+03
  2.6472339e+03  3.3299998e+03  3.5720002e+03  3.3210000e+03
 -8.5053776e-05  3.6669612e+03  3.5169995e+03  2.6840007e+03
  2.7269990e+03  5.4070015e+03  7.5129980e+03  1.5511273e-03
  4.7115762e+03  4.6730005e+03  5.5779990e+03  5.0939985e+03
  4.7890000e+03  1.0043997e+04 -2.0094674e-05  7.1567568e+03
  7.2070005e+03  6.5899995e+03  6.2490000e+03  5.2389995e+03
  4.3902275e+03  5.8806926e-04]


In [12]:
submission = pd.read_csv(os.path.join(file_path,"sample_submission.csv"))
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,0
1,TG_A_J_20230305,0
2,TG_A_J_20230306,0
3,TG_A_J_20230307,0
4,TG_A_J_20230308,0
...,...,...
1087,RD_F_J_20230327,0
1088,RD_F_J_20230328,0
1089,RD_F_J_20230329,0
1090,RD_F_J_20230330,0


In [13]:
holiday_test = ["20230305","20230312","20230319","20230326"]
submission['answer'] = preds
submission.loc[submission['ID'].str.split("_").str[3].isin(holiday_test),"answer"] = 0
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,3387.362061
1,TG_A_J_20230305,0.000000
2,TG_A_J_20230306,3548.596436
3,TG_A_J_20230307,3548.373779
4,TG_A_J_20230308,2647.233887
...,...,...
1087,RD_F_J_20230327,427.617584
1088,RD_F_J_20230328,439.709076
1089,RD_F_J_20230329,416.610504
1090,RD_F_J_20230330,397.910187


In [14]:
submission.to_csv(os.path.join(sub_path,"baseline_submission13.csv"), index=False)