## Import

In [1]:
import random
import pandas as pd
import numpy as np
import os

from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings(action='ignore')

## Fixed Random-Seed

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

## Load Data

In [3]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

In [4]:
train_df

Unnamed: 0,ID,timestamp,item,corporation,location,supply(kg),price(원/kg)
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0
...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,RD,F,J,452440.0,468.0
59393,RD_F_J_20230228,2023-02-28,RD,F,J,421980.0,531.0
59394,RD_F_J_20230301,2023-03-01,RD,F,J,382980.0,574.0
59395,RD_F_J_20230302,2023-03-02,RD,F,J,477220.0,523.0


In [5]:
test_df

Unnamed: 0,ID,timestamp,item,corporation,location
0,TG_A_J_20230304,2023-03-04,TG,A,J
1,TG_A_J_20230305,2023-03-05,TG,A,J
2,TG_A_J_20230306,2023-03-06,TG,A,J
3,TG_A_J_20230307,2023-03-07,TG,A,J
4,TG_A_J_20230308,2023-03-08,TG,A,J
...,...,...,...,...,...
1087,RD_F_J_20230327,2023-03-27,RD,F,J
1088,RD_F_J_20230328,2023-03-28,RD,F,J
1089,RD_F_J_20230329,2023-03-29,RD,F,J
1090,RD_F_J_20230330,2023-03-30,RD,F,J


## Data Pre-Processing

In [6]:
import holidays
import pandas as pd

def add_weekday(df) :
    df['date'] = pd.to_datetime(df['timestamp'])
    df['weekcode'] = df['date'].dt.weekday

    year_min = df['date'].dt.date.unique().min().year
    year_max = df['date'].dt.date.unique().max().year
    kor_holidays = list(holidays.KOR(years=range(year_min, year_max)).keys())
    idx_kor_holidays = pd.to_datetime(kor_holidays)

    df_temp = df[['date', 'weekcode']].copy()
    df_temp.set_index('date', inplace=True)
    #공휴일은 10으로 표시
    df_temp.loc[idx_kor_holidays, 'weekcode'] = 10
    df_temp.reset_index(inplace=True)

    df['weekcode'] = df_temp['weekcode']

    return df

In [7]:
train_df = add_weekday(train_df)
test_df = add_weekday(test_df)

In [8]:
train_df

Unnamed: 0,ID,timestamp,item,corporation,location,supply(kg),price(원/kg),date,weekcode
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0,2019-01-01,10
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0,2019-01-02,2
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0,2019-01-03,3
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0,2019-01-04,4
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0,2019-01-05,5
...,...,...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,RD,F,J,452440.0,468.0,2023-02-27,0
59393,RD_F_J_20230228,2023-02-28,RD,F,J,421980.0,531.0,2023-02-28,1
59394,RD_F_J_20230301,2023-03-01,RD,F,J,382980.0,574.0,2023-03-01,2
59395,RD_F_J_20230302,2023-03-02,RD,F,J,477220.0,523.0,2023-03-02,3


In [9]:
train_df

Unnamed: 0,ID,timestamp,item,corporation,location,supply(kg),price(원/kg),date,weekcode
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0,2019-01-01,10
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0,2019-01-02,2
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0,2019-01-03,3
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0,2019-01-04,4
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0,2019-01-05,5
...,...,...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,RD,F,J,452440.0,468.0,2023-02-27,0
59393,RD_F_J_20230228,2023-02-28,RD,F,J,421980.0,531.0,2023-02-28,1
59394,RD_F_J_20230301,2023-03-01,RD,F,J,382980.0,574.0,2023-03-01,2
59395,RD_F_J_20230302,2023-03-02,RD,F,J,477220.0,523.0,2023-03-02,3


In [10]:
def split_time_stamp(train_df, test_df):
    #시계열 특성을 학습에 반영하기 위해 timestamp를 월, 일, 시간으로 나눕니다
    train_df['year'] = train_df['timestamp'].apply(lambda x : int(x[0:4]))
    train_df['month'] = train_df['timestamp'].apply(lambda x : int(x[5:7]))
    train_df['day'] = train_df['timestamp'].apply(lambda x : int(x[8:10]))

    test_df['year'] = test_df['timestamp'].apply(lambda x : int(x[0:4]))
    test_df['month'] = test_df['timestamp'].apply(lambda x : int(x[5:7]))
    test_df['day'] = test_df['timestamp'].apply(lambda x : int(x[8:10]))

    return train_df, test_df

In [11]:
train_df, test_df = split_time_stamp(train_df, test_df)

test_df

Unnamed: 0,ID,timestamp,item,corporation,location,date,weekcode,year,month,day
0,TG_A_J_20230304,2023-03-04,TG,A,J,2023-03-04,5,2023,3,4
1,TG_A_J_20230305,2023-03-05,TG,A,J,2023-03-05,6,2023,3,5
2,TG_A_J_20230306,2023-03-06,TG,A,J,2023-03-06,0,2023,3,6
3,TG_A_J_20230307,2023-03-07,TG,A,J,2023-03-07,1,2023,3,7
4,TG_A_J_20230308,2023-03-08,TG,A,J,2023-03-08,2,2023,3,8
...,...,...,...,...,...,...,...,...,...,...
1087,RD_F_J_20230327,2023-03-27,RD,F,J,2023-03-27,0,2023,3,27
1088,RD_F_J_20230328,2023-03-28,RD,F,J,2023-03-28,1,2023,3,28
1089,RD_F_J_20230329,2023-03-29,RD,F,J,2023-03-29,2,2023,3,29
1090,RD_F_J_20230330,2023-03-30,RD,F,J,2023-03-30,3,2023,3,30


In [12]:
train_df_sorted_by_date = train_df.sort_values(by='date')
train_df_sorted_by_date

Unnamed: 0,ID,timestamp,item,corporation,location,supply(kg),price(원/kg),date,weekcode,year,month,day
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0,2019-01-01,10,2019,1,1
27414,CB_A_S_20190101,2019-01-01,CB,A,S,0.0,0.0,2019-01-01,10,2019,1,1
36552,RD_D_J_20190101,2019-01-01,RD,D,J,0.0,0.0,2019-01-01,10,2019,1,1
51782,BC_D_J_20190101,2019-01-01,BC,D,J,0.0,0.0,2019-01-01,10,2019,1,1
56351,CB_F_J_20190101,2019-01-01,CB,F,J,0.0,0.0,2019-01-01,10,2019,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
25890,CR_E_S_20230303,2023-03-03,CR,E,S,0.0,0.0,2023-03-03,4,2023,3,3
45689,BC_A_S_20230303,2023-03-03,BC,A,S,3776.0,2875.0,2023-03-03,4,2023,3,3
31982,CB_E_J_20230303,2023-03-03,CB,E,J,0.0,0.0,2023-03-03,4,2023,3,3
53304,BC_D_J_20230303,2023-03-03,BC,D,J,1776.0,3059.0,2023-03-03,4,2023,3,3


In [13]:
#학습에 사용하지 않을 변수들을 제거합니다
train_df_mod = train_df_sorted_by_date.drop(columns=['ID', 'timestamp', 'supply(kg)', 'date'])
test_x = test_df.drop(columns=['ID', 'timestamp', 'date'])

In [14]:
train_df_mod

Unnamed: 0,item,corporation,location,price(원/kg),weekcode,year,month,day
0,TG,A,J,0.0,10,2019,1,1
27414,CB,A,S,0.0,10,2019,1,1
36552,RD,D,J,0.0,10,2019,1,1
51782,BC,D,J,0.0,10,2019,1,1
56351,CB,F,J,0.0,10,2019,1,1
...,...,...,...,...,...,...,...,...
25890,CR,E,S,0.0,4,2023,3,3
45689,BC,A,S,2875.0,4,2023,3,3
31982,CB,E,J,0.0,4,2023,3,3
53304,BC,D,J,3059.0,4,2023,3,3


In [15]:
from sklearn.model_selection import TimeSeriesSplit

In [16]:
tscv = TimeSeriesSplit(n_splits=5)

for train, test in tscv.split(train_df_mod):
    print("%s %s" % (train, test))

[   0    1    2 ... 9899 9900 9901] [ 9902  9903  9904 ... 19798 19799 19800]
[    0     1     2 ... 19798 19799 19800] [19801 19802 19803 ... 29697 29698 29699]
[    0     1     2 ... 29697 29698 29699] [29700 29701 29702 ... 39596 39597 39598]
[    0     1     2 ... 39596 39597 39598] [39599 39600 39601 ... 49495 49496 49497]
[    0     1     2 ... 49495 49496 49497] [49498 49499 49500 ... 59394 59395 59396]


In [17]:
#질적 변수들을 수치화합니다
qual_col = ['item', 'corporation', 'location']

for i in qual_col:
    le = LabelEncoder()
    train_df_mod[i]=le.fit_transform(train_df_mod[i])
    test_x[i]=le.transform(test_x[i]) #test 데이터에 대해서 fit하는 것은 data leakage에 해당합니다

print('Done.')

Done.


In [18]:
train_x = train_df_mod.drop(columns='price(원/kg)')
train_y = train_df_mod['price(원/kg)']


In [19]:
train_x

Unnamed: 0,item,corporation,location,weekcode,year,month,day
0,4,0,0,10,2019,1,1
27414,1,0,1,10,2019,1,1
36552,3,3,0,10,2019,1,1
51782,0,3,0,10,2019,1,1
56351,1,5,0,10,2019,1,1
...,...,...,...,...,...,...,...
25890,2,4,1,4,2023,3,3
45689,0,0,1,4,2023,3,3
31982,1,4,0,4,2023,3,3
53304,0,3,0,4,2023,3,3


In [20]:
test_x

Unnamed: 0,item,corporation,location,weekcode,year,month,day
0,4,0,0,5,2023,3,4
1,4,0,0,6,2023,3,5
2,4,0,0,0,2023,3,6
3,4,0,0,1,2023,3,7
4,4,0,0,2,2023,3,8
...,...,...,...,...,...,...,...
1087,3,5,0,0,2023,3,27
1088,3,5,0,1,2023,3,28
1089,3,5,0,2,2023,3,29
1090,3,5,0,3,2023,3,30


## Regression Model Fit

In [21]:
params_lgbm={'bagging_freq': 1, 'verbosity': -1, 'seed': 42, 'feature_pre_filter': False,\
             'objective': 'regression', 'metric': 'rmse', 'n_estimators':50,
        'bagging_fraction': 0.8, 'feature_fraction': 0.5, 'learning_rate': 0.05, 'num_iterations':500,\
        'max_depth': 20, 'min_child_samples': 30, 'num_leaves': 50, 'subsample': 0.5}

In [22]:
import lightgbm as lgb

lgb_reg = lgb.LGBMRegressor(**params_lgbm)

In [24]:
lgb_reg.fit(train_x, train_y)



## 모델 학습


In [23]:
grid_lgbm = {
    "max_depth": [10, 15,20, 30,50],
    "learning_rate": [0.01, 0.05, 0.1, 0.3, 0.5],
    "num_leaves":[50, 70, 100,150, 200],
    'subsample': [0.5, 0.7, 1.0],
    "n_estimators" : [100, 200, 300, 500, 1000],
    'min_child_samples' : [10,15, 20, 30]
}

In [25]:
from sklearn.model_selection import GridSearchCV

In [26]:
gscv_lgbm = GridSearchCV (estimator = lgb_reg, param_grid = grid_lgbm, scoring ='neg_mean_squared_error', cv = tscv, refit=True, n_jobs=10, verbose=2)

gscv_lgbm.fit(train_x, train_y)

Fitting 5 folds for each of 7500 candidates, totalling 37500 fits


In [42]:
print('lgbm 파라미터: ', gscv_lgbm.best_params_)
print('lgbm 예측 정확도: {:.4f}'.format(gscv_lgbm.best_score_))
print(np.sqrt(-gscv_lgbm.best_score_))

lgbm 파라미터:  {'learning_rate': 0.05, 'max_depth': 20, 'min_child_samples': 30, 'n_estimators': 100, 'num_leaves': 50, 'subsample': 0.5}
lgbm 예측 정확도: -1396907.2529
1181.9083098611707


## Inference

In [50]:
preds = gscv_lgbm.predict(test_x)

In [51]:
len(preds)

1092

In [52]:
(preds<=0).sum()

83

In [53]:
preds[preds<=0]=0 

In [54]:
preds

array([2854.02600396,  319.61738577, 3143.52612352, ...,  498.20316043,
        522.52433217,  459.60748932])

## Submission

In [28]:
submission = pd.read_csv('./sample_submission.csv')
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,0
1,TG_A_J_20230305,0
2,TG_A_J_20230306,0
3,TG_A_J_20230307,0
4,TG_A_J_20230308,0
...,...,...
1087,RD_F_J_20230327,0
1088,RD_F_J_20230328,0
1089,RD_F_J_20230329,0
1090,RD_F_J_20230330,0


In [33]:
submission['answer'] = preds

In [35]:
submission.to_csv('./submission_231115.csv', index=False)