#Catboost

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


##Import

In [2]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [3]:
import random
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import catboost
from catboost import Pool, CatBoostRegressor
import tqdm
from tqdm import tqdm
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings(action='ignore')

In [4]:
print('Pandas Version :', pd.__version__)
print('Numpy Version :', np.__version__)
print('Catboost Version :', catboost.__version__)
print('sklearn Version :', sklearn.__version__)

Pandas Version : 1.5.3
Numpy Version : 1.23.5
Catboost Version : 1.2.2
sklearn Version : 1.2.2


In [5]:
# 파이썬 버전 확인
!python --version

Python 3.10.12


In [6]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

##Load Data

In [7]:
train_df = pd.read_csv('/content/drive/MyDrive/제주 특산물 가격 예측 AI 경진대회/open/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/제주 특산물 가격 예측 AI 경진대회/open/test.csv')

##Data Pre-Processing

In [8]:
train_df['timestamp'] = pd.to_datetime(train_df['timestamp'], format='%Y-%m-%d')
test_df['timestamp'] = pd.to_datetime(test_df['timestamp'], format='%Y-%m-%d')

In [9]:
#시계열 특성을 학습에 반영하기 위해 timestamp를 월, 일, 시간으로 나눕니다
train_df['year'] = train_df['timestamp'].dt.year
train_df['month'] = train_df['timestamp'].dt.month
train_df['day'] = train_df['timestamp'].dt.day

test_df['year'] = test_df['timestamp'].dt.year
test_df['month'] = test_df['timestamp'].dt.month
test_df['day'] = test_df['timestamp'].dt.day

In [10]:
# 연 기준 며칠 째인지
train_df['day_of_year'] = train_df['timestamp'].dt.dayofyear
test_df['day_of_year'] = test_df['timestamp'].dt.dayofyear

# 요일 0 = monday, 6 = sunday
train_df['day_of_week'] = train_df['timestamp'].dt.dayofweek
test_df['day_of_week'] = test_df['timestamp'].dt.dayofweek

# 주말 여부
train_df['holiday'] = train_df.apply(lambda x : 0 if x['day_of_week'] < 5 else 1, axis = 1)
test_df['holiday'] = test_df.apply(lambda x : 0 if x['day_of_week'] < 5 else 1, axis = 1)

In [11]:
data = pd.DataFrame()
for i in train_df['item'].unique():
  data1 = train_df[train_df['item'] == i].groupby('month')['price(원/kg)'].mean()
  data2 = train_df[train_df['item'] == i].groupby('month')['price(원/kg)'].std()
  data3 = train_df[train_df['item'] == i].groupby('month')['price(원/kg)'].median()
  d = pd.DataFrame()
  d['price_mean'] = data1
  d['price_std'] = data2
  d['price_median'] = data3
  d['item'] = i
  data = pd.concat([data, d], axis = 0)

data = data.reset_index()
data.head()

Unnamed: 0,month,price_mean,price_std,price_median,item
0,1,1588.269032,1110.477409,1722.0,TG
1,2,2030.6,1383.794058,2211.5,TG
2,3,3299.605512,2094.885739,3837.5,TG
3,4,3899.4075,3135.842509,4931.5,TG
4,5,5419.03629,3937.887525,6598.0,TG


In [12]:
for i in data['item'].unique():
  m = data['price_mean'].mean()
  idx1 = data[data['price_mean'] >= m].index
  idx2 = data[data['price_mean'] < m].index
  for j in idx1:
    data.loc[j, 'price_mean'] = 1
  for j in idx2:
    data.loc[j, 'price_mean'] = 0

data.head()

Unnamed: 0,month,price_mean,price_std,price_median,item
0,1,1.0,1110.477409,1722.0,TG
1,2,1.0,1383.794058,2211.5,TG
2,3,1.0,2094.885739,3837.5,TG
3,4,1.0,3135.842509,4931.5,TG
4,5,1.0,3937.887525,6598.0,TG


In [13]:
for i in data['item'].unique():
  m = data['price_std'].mean()
  idx1 = data[data['price_std'] >= m].index
  idx2 = data[data['price_std'] < m].index
  for j in idx1:
    data.loc[j, 'price_std'] = 1
  for j in idx2:
    data.loc[j, 'price_std'] = 0

data.head()

Unnamed: 0,month,price_mean,price_std,price_median,item
0,1,1.0,1.0,1722.0,TG
1,2,1.0,1.0,2211.5,TG
2,3,1.0,1.0,3837.5,TG
3,4,1.0,1.0,4931.5,TG
4,5,1.0,1.0,6598.0,TG


In [14]:
for i in data['item'].unique():
  m = data['price_median'].mean()
  idx1 = data[data['price_median'] >= m].index
  idx2 = data[data['price_median'] < m].index
  for j in idx1:
    data.loc[j, 'price_median'] = 1
  for j in idx2:
    data.loc[j, 'price_median'] = 0

data.head()

Unnamed: 0,month,price_mean,price_std,price_median,item
0,1,1.0,1.0,1.0,TG
1,2,1.0,1.0,1.0,TG
2,3,1.0,1.0,1.0,TG
3,4,1.0,1.0,1.0,TG
4,5,1.0,1.0,1.0,TG


In [15]:
tqdm.pandas()
train_df['price_mean'] = train_df.progress_apply(lambda x : data.loc[(data.item == x['item']) & (data.month == x['month']) ,'price_mean'].values[0], axis = 1)
tqdm.pandas()
test_df['price_mean'] = test_df.progress_apply(lambda x : data.loc[(data.item == x['item']) & (data.month == x['month']) ,'price_mean'].values[0], axis = 1)

tqdm.pandas()
train_df['price_std'] = train_df.progress_apply(lambda x : data.loc[(data.item == x['item']) & (data.month == x['month']) ,'price_std'].values[0], axis = 1)
tqdm.pandas()
test_df['price_std'] = test_df.progress_apply(lambda x : data.loc[(data.item == x['item']) & (data.month == x['month']) ,'price_std'].values[0], axis = 1)

tqdm.pandas()
train_df['price_median'] = train_df.progress_apply(lambda x : data.loc[(data.item == x['item']) & (data.month == x['month']) ,'price_median'].values[0], axis = 1)
tqdm.pandas()
test_df['price_median'] = test_df.progress_apply(lambda x : data.loc[(data.item == x['item']) & (data.month == x['month']) ,'price_median'].values[0], axis = 1)

100%|██████████| 59397/59397 [00:54<00:00, 1096.82it/s]
100%|██████████| 1092/1092 [00:00<00:00, 1872.68it/s]
100%|██████████| 59397/59397 [00:34<00:00, 1736.56it/s]
100%|██████████| 1092/1092 [00:00<00:00, 1875.26it/s]
100%|██████████| 59397/59397 [00:34<00:00, 1734.60it/s]
100%|██████████| 1092/1092 [00:00<00:00, 2056.54it/s]


In [16]:
train_df.head()

Unnamed: 0,ID,timestamp,item,corporation,location,supply(kg),price(원/kg),year,month,day,day_of_year,day_of_week,holiday,price_mean,price_std,price_median
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0,2019,1,1,1,1,0,1.0,1.0,1.0
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0,2019,1,2,2,2,0,1.0,1.0,1.0
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0,2019,1,3,3,3,0,1.0,1.0,1.0
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0,2019,1,4,4,4,0,1.0,1.0,1.0
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0,2019,1,5,5,5,1,1.0,1.0,1.0


In [17]:
test_df.head()

Unnamed: 0,ID,timestamp,item,corporation,location,year,month,day,day_of_year,day_of_week,holiday,price_mean,price_std,price_median
0,TG_A_J_20230304,2023-03-04,TG,A,J,2023,3,4,63,5,1,1.0,1.0,1.0
1,TG_A_J_20230305,2023-03-05,TG,A,J,2023,3,5,64,6,1,1.0,1.0,1.0
2,TG_A_J_20230306,2023-03-06,TG,A,J,2023,3,6,65,0,0,1.0,1.0,1.0
3,TG_A_J_20230307,2023-03-07,TG,A,J,2023,3,7,66,1,0,1.0,1.0,1.0
4,TG_A_J_20230308,2023-03-08,TG,A,J,2023,3,8,67,2,0,1.0,1.0,1.0


In [18]:
#학습에 사용하지 않을 변수들을 제거합니다
train_x = train_df.drop(columns=['ID', 'timestamp', 'supply(kg)', 'price(원/kg)'])
train_y = train_df['price(원/kg)']

test_x = test_df.drop(columns=['ID', 'timestamp'])

In [19]:
#질적 변수들을 수치화합니다
qual_col = ['item', 'corporation', 'location', 'price_mean', 'price_std', 'price_median']

for i in qual_col:
    le = LabelEncoder()
    train_x[i]=le.fit_transform(train_x[i])
    test_x[i]=le.transform(test_x[i]) #test 데이터에 대해서 fit하는 것은 data leakage에 해당합니다

    if(i == 'item'):
      print(le.inverse_transform([0, 1, 2, 3, 4]))
    if(i == 'location'):
      print(le.inverse_transform([0,1]))
    if(i == 'corporation'):
      print(le.inverse_transform([0,1,2,3,4]))

print('Done.')

['BC' 'CB' 'CR' 'RD' 'TG']
['A' 'B' 'C' 'D' 'E']
['J' 'S']
Done.


##Validation set으로 RMSE 평가

In [20]:
index_list = []
preds = []
non_pred = []
rmse_true = []
rmse_pred = []
v_data = pd.DataFrame()

for i in train_x['item'].unique():
  for k in train_x['location'].unique():
    X_train = train_x[(train_x['item'] == i) & (train_x['location'] == k)]
    X_valid = pd.DataFrame()
    for c in X_train['corporation'].unique():
      x = X_train[X_train['corporation'] == c].iloc[-28:]
      X_valid = pd.concat([X_valid, x], axis = 0)
    X_train = X_train.drop(X_valid.index.values, axis=0, inplace=False)
    v_data = pd.concat([v_data, X_valid], axis = 0)

    Y_train = train_y[X_train.index]
    Y_valid = train_y[X_valid.index]
    bound = Y_train[Y_train!=0].min()
    rmse_true.append(Y_valid)

    X_test = test_x[(test_x['item'] == i) & (test_x['location'] == k)]
    index_list.append(X_test.index)

    rob = MinMaxScaler()
    rob_data = rob.fit_transform(Y_train.values.reshape(-1,1))

    if i == 0:
      item = "BC"
    elif i == 1:
      item = "CB"
    elif i == 2:
      item = "CR"
    elif i == 3:
      item = "RD"
    elif i == 4:
      item = "TG"

    if k == 0:
      location = 'J'
    else:
      location = 'S'

    model = CatBoostRegressor(verbose = 100, random_state = 41)
    print("------------------------------------- item = ", item, "location = ", location, "학습-------------------------------------")
    model.fit(X_train, rob_data.reshape(-1))
    pred = model.predict(X_valid)
    pred = rob.inverse_transform(pred.reshape(-1,1))
    data = []
    for j in pred:
      if j < bound:
        data.append(0)
      elif bound == 0:
        data.append(0)
      else:
        data.append(float(j))
    rmse_pred.append(data)

    if(X_test.empty != True):
      pred = model.predict(X_test)
      pred = rob.inverse_transform(pred.reshape(-1,1))
      data = []
      for j in pred:
        if j < bound:
          data.append(0)
        elif bound == 0:
          data.append(0)
        else:
          data.append(float(j))
      preds.append(data)

------------------------------------- item =  TG location =  J 학습-------------------------------------
Learning rate set to 0.056261
0:	learn: 0.1426095	total: 48.1ms	remaining: 48s
100:	learn: 0.0926291	total: 213ms	remaining: 1.9s
200:	learn: 0.0884681	total: 379ms	remaining: 1.51s
300:	learn: 0.0855019	total: 710ms	remaining: 1.65s
400:	learn: 0.0833893	total: 942ms	remaining: 1.41s
500:	learn: 0.0816856	total: 1.12s	remaining: 1.11s
600:	learn: 0.0801544	total: 1.29s	remaining: 855ms
700:	learn: 0.0787720	total: 1.63s	remaining: 696ms
800:	learn: 0.0776235	total: 1.81s	remaining: 449ms
900:	learn: 0.0764982	total: 1.98s	remaining: 218ms
999:	learn: 0.0755443	total: 2.15s	remaining: 0us
------------------------------------- item =  TG location =  S 학습-------------------------------------
Learning rate set to 0.056261
0:	learn: 0.1938121	total: 2.79ms	remaining: 2.79s
100:	learn: 0.0934653	total: 170ms	remaining: 1.51s
200:	learn: 0.0877786	total: 336ms	remaining: 1.34s
300:	learn: 0

In [21]:
p1 = np.concatenate(rmse_true).tolist()
p2 = np.concatenate(rmse_pred).tolist()
RMSE = mean_squared_error(p1, p2)**0.5
RMSE

693.9506315217596

##Regression Model Fit & Inference

전체 데이터에 대해 학습 및 예측 수행

In [22]:
index_list = []
preds = []

for i in train_x['item'].unique():
  for k in train_x['location'].unique():
    X_train = train_x[(train_x['item'] == i) & (train_x['location'] == k)]

    Y_train = train_y[X_train.index]
    bound = Y_train[Y_train!=0].min()

    X_test = test_x[(test_x['item'] == i) & (test_x['location'] == k)]
    index_list.append(X_test.index)

    rob = MinMaxScaler()
    rob_data = rob.fit_transform(Y_train.values.reshape(-1,1))

    if i == 0:
      item = "BC"
    elif i == 1:
      item = "CB"
    elif i == 2:
      item = "CR"
    elif i == 3:
      item = "RD"
    elif i == 4:
      item = "TG"

    if k == 0:
      location = 'J'
    else:
      location = 'S'

    model = CatBoostRegressor(verbose = 100, random_state = 41)
    print("------------------------------------- item = ", item, "location = ", location, "학습-------------------------------------")
    model.fit(X_train, rob_data.reshape(-1))

    if(X_test.empty != True):
      pred = model.predict(X_test)
      pred = rob.inverse_transform(pred.reshape(-1,1))
      data = []
      for j in pred:
        if j < bound:
          data.append(0)
        elif bound == 0:
          data.append(0)
        else:
          data.append(float(j))
      preds.append(data)

------------------------------------- item =  TG location =  J 학습-------------------------------------
Learning rate set to 0.056427
0:	learn: 0.1418986	total: 1.87ms	remaining: 1.87s
100:	learn: 0.0922916	total: 178ms	remaining: 1.59s
200:	learn: 0.0882929	total: 351ms	remaining: 1.4s
300:	learn: 0.0854677	total: 548ms	remaining: 1.27s
400:	learn: 0.0832557	total: 722ms	remaining: 1.08s
500:	learn: 0.0812830	total: 892ms	remaining: 889ms
600:	learn: 0.0797721	total: 1.07s	remaining: 708ms
700:	learn: 0.0784895	total: 1.24s	remaining: 528ms
800:	learn: 0.0774105	total: 1.41s	remaining: 350ms
900:	learn: 0.0763830	total: 1.59s	remaining: 175ms
999:	learn: 0.0754643	total: 1.76s	remaining: 0us
------------------------------------- item =  TG location =  S 학습-------------------------------------
Learning rate set to 0.056427
0:	learn: 0.1925944	total: 1.93ms	remaining: 1.93s
100:	learn: 0.0929251	total: 171ms	remaining: 1.52s
200:	learn: 0.0873939	total: 355ms	remaining: 1.41s
300:	learn:

In [23]:
list1 = np.concatenate(preds).tolist()
list2 = np.concatenate(index_list).tolist()

In [24]:
p = pd.DataFrame()
p['pred'] = list1
p['idx'] = list2

##Submission

In [25]:
submission = pd.read_csv('/content/drive/MyDrive/제주 특산물 가격 예측 AI 경진대회/open/sample_submission.csv')
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,0
1,TG_A_J_20230305,0
2,TG_A_J_20230306,0
3,TG_A_J_20230307,0
4,TG_A_J_20230308,0
...,...,...
1087,RD_F_J_20230327,0
1088,RD_F_J_20230328,0
1089,RD_F_J_20230329,0
1090,RD_F_J_20230330,0


In [26]:
submission['answer'] = p.sort_values(by=['idx']).reset_index().pred
submission.to_csv('./catboost_submission.csv', index=False)
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,3476.009506
1,TG_A_J_20230305,0.000000
2,TG_A_J_20230306,3330.207719
3,TG_A_J_20230307,4109.363862
4,TG_A_J_20230308,3478.802651
...,...,...
1087,RD_F_J_20230327,498.407874
1088,RD_F_J_20230328,482.391520
1089,RD_F_J_20230329,449.201374
1090,RD_F_J_20230330,486.539725


**런타임 해제 후 T4 GPU로 연결해야 합니다**

#auto-gluon

- autogluon version : 0.8.2

##Import

In [27]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install autogluon

Collecting autogluon
  Downloading autogluon-0.8.2-py3-none-any.whl (9.7 kB)
Collecting autogluon.core[all]==0.8.2 (from autogluon)
  Downloading autogluon.core-0.8.2-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.0/224.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autogluon.features==0.8.2 (from autogluon)
  Downloading autogluon.features-0.8.2-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autogluon.tabular[all]==0.8.2 (from autogluon)
  Downloading autogluon.tabular-0.8.2-py3-none-any.whl (285 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m285.7/285.7 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autogluon.multimodal==0.8.2 (from autogluon)
  Downloading autogluon.multimodal-0.8.2-py3-none-any.whl (372 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [

In [None]:
import random
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import autogluon
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
import tqdm
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings(action='ignore')

In [29]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

##Load Data

In [30]:
train_df = pd.read_csv('/content/drive/MyDrive/제주 특산물 가격 예측 AI 경진대회/open/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/제주 특산물 가격 예측 AI 경진대회/open/test.csv')
cat_pred = pd.read_csv('/content/catboost_submission.csv')

##Data Pre-Processing

In [31]:
train_df['timestamp'] = pd.to_datetime(train_df['timestamp'], format='%Y-%m-%d')
test_df['timestamp'] = pd.to_datetime(test_df['timestamp'], format='%Y-%m-%d')

In [32]:
#시계열 특성을 학습에 반영하기 위해 timestamp를 월, 일, 시간으로 나눕니다
train_df['year'] = train_df['timestamp'].dt.year
train_df['month'] = train_df['timestamp'].dt.month
train_df['day'] = train_df['timestamp'].dt.day

test_df['year'] = test_df['timestamp'].dt.year
test_df['month'] = test_df['timestamp'].dt.month
test_df['day'] = test_df['timestamp'].dt.day

In [33]:
# 연 기준 며칠 째인지
train_df['day_of_year'] = train_df['timestamp'].dt.dayofyear
test_df['day_of_year'] = test_df['timestamp'].dt.dayofyear

# 요일 0 = monday, 6 = sunday
train_df['day_of_week'] = train_df['timestamp'].dt.dayofweek
test_df['day_of_week'] = test_df['timestamp'].dt.dayofweek

# 주말 여부
train_df['holiday'] = train_df.apply(lambda x : 0 if x['day_of_week'] < 5 else 1, axis = 1)
test_df['holiday'] = test_df.apply(lambda x : 0 if x['day_of_week'] < 5 else 1, axis = 1)

In [34]:
data = pd.DataFrame()
for i in train_df['item'].unique():
  data1 = train_df[train_df['item'] == i].groupby('month')['price(원/kg)'].mean()
  data2 = train_df[train_df['item'] == i].groupby('month')['price(원/kg)'].std()
  data3 = train_df[train_df['item'] == i].groupby('month')['supply(kg)'].mean()
  d = pd.DataFrame()
  d['price_mean'] = data1
  d['price_std'] = data2
  d['supply_mean'] = data3
  d['item'] = i
  data = pd.concat([data, d], axis = 0)

data = data.reset_index()
data.head()

Unnamed: 0,month,price_mean,price_std,supply_mean,item
0,1,1588.269032,1110.477409,39655.713419,TG
1,2,2030.6,1383.794058,19364.09156,TG
2,3,3299.605512,2094.885739,7407.45585,TG
3,4,3899.4075,3135.842509,1914.845917,TG
4,5,5419.03629,3937.887525,781.390081,TG


In [35]:
for i in data['item'].unique():
  m = data[data['item'] == i]['price_mean'].mean()
  idx1 = data[(data['item'] == i) & (data['price_mean'] >= m)].index
  idx2 = data[(data['item'] == i) & (data['price_mean'] < m)].index
  for j in idx1:
    data.loc[j, 'price_mean'] = 1
  for j in idx2:
    data.loc[j, 'price_mean'] = 0

data.head()

Unnamed: 0,month,price_mean,price_std,supply_mean,item
0,1,0.0,1110.477409,39655.713419,TG
1,2,0.0,1383.794058,19364.09156,TG
2,3,1.0,2094.885739,7407.45585,TG
3,4,1.0,3135.842509,1914.845917,TG
4,5,1.0,3937.887525,781.390081,TG


In [36]:
for i in data['item'].unique():
  m = data[data['item'] == i]['price_std'].mean()
  idx1 = data[(data['item'] == i) & (data['price_std'] >= m)].index
  idx2 = data[(data['item'] == i) & (data['price_std'] < m)].index
  for j in idx1:
    data.loc[j, 'price_std'] = 1
  for j in idx2:
    data.loc[j, 'price_std'] = 0

data.head()

Unnamed: 0,month,price_mean,price_std,supply_mean,item
0,1,0.0,0.0,39655.713419,TG
1,2,0.0,0.0,19364.09156,TG
2,3,1.0,0.0,7407.45585,TG
3,4,1.0,1.0,1914.845917,TG
4,5,1.0,1.0,781.390081,TG


In [37]:
for i in data['item'].unique():
  m = data[data['item'] == i]['supply_mean'].mean()
  idx1 = data[(data['item'] == i) & (data['supply_mean'] >= m)].index
  idx2 = data[(data['item'] == i) & (data['supply_mean'] < m)].index
  for j in idx1:
    data.loc[j, 'supply_mean'] = 1
  for j in idx2:
    data.loc[j, 'supply_mean'] = 0

data.head()

Unnamed: 0,month,price_mean,price_std,supply_mean,item
0,1,0.0,0.0,1.0,TG
1,2,0.0,0.0,1.0,TG
2,3,1.0,0.0,0.0,TG
3,4,1.0,1.0,0.0,TG
4,5,1.0,1.0,0.0,TG


In [38]:
tqdm.pandas()
train_df['price_mean'] = train_df.progress_apply(lambda x : data.loc[(data.item == x['item']) & (data.month == x['month']) ,'price_mean'].values[0], axis = 1)
tqdm.pandas()
train_df['price_std'] = train_df.progress_apply(lambda x : data.loc[(data.item == x['item']) & (data.month == x['month']) ,'price_std'].values[0], axis = 1)
tqdm.pandas()
train_df['supply_mean'] = train_df.progress_apply(lambda x : data.loc[(data.item == x['item']) & (data.month == x['month']) ,'supply_mean'].values[0], axis = 1)

100%|██████████| 59397/59397 [00:43<00:00, 1361.83it/s]
100%|██████████| 59397/59397 [00:34<00:00, 1712.66it/s]
100%|██████████| 59397/59397 [00:34<00:00, 1732.03it/s]


In [39]:
train_df['item_id'] = train_df.ID.str[0:6]
train_df.head()

Unnamed: 0,ID,timestamp,item,corporation,location,supply(kg),price(원/kg),year,month,day,day_of_year,day_of_week,holiday,price_mean,price_std,supply_mean,item_id
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0,2019,1,1,1,1,0,0.0,0.0,1.0,TG_A_J
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0,2019,1,2,2,2,0,0.0,0.0,1.0,TG_A_J
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0,2019,1,3,3,3,0,0.0,0.0,1.0,TG_A_J
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0,2019,1,4,4,4,0,0.0,0.0,1.0,TG_A_J
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0,2019,1,5,5,5,1,0.0,0.0,1.0,TG_A_J


In [40]:
#질적 변수들을 수치화합니다
qual_col = ['item', 'corporation', 'location']

for i in qual_col:
    le = LabelEncoder()
    train_df[i]=le.fit_transform(train_df[i])

    if(i == 'item'):
      print(le.inverse_transform([0, 1, 2, 3, 4]))
    if(i == 'location'):
      print(le.inverse_transform([0,1]))
    if(i == 'corporation'):
      print(le.inverse_transform([0,1,2,3,4]))

print('Done.')

['BC' 'CB' 'CR' 'RD' 'TG']
['A' 'B' 'C' 'D' 'E']
['J' 'S']
Done.


##Regression Model Fit

In [None]:
data = TimeSeriesDataFrame(train_df.drop(columns=['ID']))
predictor = TimeSeriesPredictor(
    prediction_length=28,
    target="price(원/kg)",
    eval_metric="RMSE",
)
# seed 고정
predictor.fit( data, random_seed=42, )

TimeSeriesPredictor.fit() called
Fitting with arguments:
{'enable_ensemble': True,
 'evaluation_metric': 'RMSE',
 'excluded_model_types': None,
 'hyperparameter_tune_kwargs': None,
 'hyperparameters': 'default',
 'num_val_windows': 1,
 'prediction_length': 28,
 'random_seed': 42,
 'target': 'price(원/kg)',
 'time_limit': None,
 'verbosity': 2}
Provided training data set with 59397 rows, 39 items (item = single time series). Average time series length is 1523.0. Data frequency is 'D'.
INFO:lightning_fabric.utilities.seed:Global seed set to 42
AutoGluon will save models to AutogluonModels/ag-20231113_135513/
AutoGluon will gauge predictive performance using evaluation metric: 'RMSE'
	This metric's sign has been flipped to adhere to being 'higher is better'. The reported score can be multiplied by -1 to get the metric value.

Provided dataset contains following columns:
	target:           'price(원/kg)'
	past covariates:  ['item', 'corporation', 'location', 'supply(kg)', 'year', 'month', 'd

<autogluon.timeseries.predictor.TimeSeriesPredictor at 0x7847e15513c0>

- Total runtime: 192.73 s
- Best model: WeightedEnsemble
- Best model score: -715.9105

In [None]:
predictor.refit_full()

Refitting models via `refit_full` using all of the data (combined train and validation)...
	Models trained in this way will have the suffix '_FULL' and have NaN validation score.
	This process is not bound by time_limit, but should take less time than the original `fit` call.
Fitting model: Naive_FULL | Skipping fit via cloning parent ...
Fitting model: SeasonalNaive_FULL | Skipping fit via cloning parent ...
Fitting model: Theta_FULL | Skipping fit via cloning parent ...
Fitting model: AutoETS_FULL | Skipping fit via cloning parent ...
Fitting model: RecursiveTabular_FULL
	8.71    s     = Training runtime
Fitting model: DeepAR_FULL | Skipping fit via cloning parent ...
Fitting model: WeightedEnsemble_FULL | Skipping fit via cloning parent ...
Refit complete. Models trained: ['Naive_FULL', 'SeasonalNaive_FULL', 'Theta_FULL', 'AutoETS_FULL', 'RecursiveTabular_FULL', 'DeepAR_FULL', 'WeightedEnsemble_FULL']
Total runtime: 8.86 s
Updated best model to 'WeightedEnsemble_FULL' (Previously 'W

{'Naive': 'Naive_FULL',
 'SeasonalNaive': 'SeasonalNaive_FULL',
 'Theta': 'Theta_FULL',
 'AutoETS': 'AutoETS_FULL',
 'RecursiveTabular': 'RecursiveTabular_FULL',
 'DeepAR': 'DeepAR_FULL',
 'WeightedEnsemble': 'WeightedEnsemble_FULL'}

In [None]:
# seed 고정
pred = predictor.predict(data, random_seed=42, )

INFO:lightning_fabric.utilities.seed:Global seed set to 42
Model not specified in predict, will default to the model with the best validation score: WeightedEnsemble_FULL


In [None]:
pred

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
item_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
TG_A_J,2023-03-04,3292.724485,1623.328158,2222.583787,2609.102161,2948.093732,3240.357175,3561.357191,3907.848518,4309.873972,4909.457848
TG_A_J,2023-03-05,552.225002,-1177.983895,-585.709376,-158.971284,205.580749,546.104663,886.531705,1250.814063,1677.220835,2269.247734
TG_A_J,2023-03-06,3176.135653,594.562365,1521.806391,2158.656155,2683.650853,3188.898958,3712.336151,4266.716957,4890.218942,5770.687700
TG_A_J,2023-03-07,3428.709313,492.522005,1517.167296,2287.752996,2894.149955,3459.831872,4025.899408,4675.046734,5370.626207,6481.507827
TG_A_J,2023-03-08,3365.464408,124.640080,1282.532260,2086.559389,2752.736537,3386.835868,3989.046771,4658.565839,5443.961254,6668.588785
...,...,...,...,...,...,...,...,...,...,...,...
RD_F_J,2023-03-27,501.589305,-11.884302,166.969760,296.065807,401.846794,496.163921,597.864592,703.015071,839.511645,1027.219331
RD_F_J,2023-03-28,508.068504,-42.381796,165.830790,303.855541,414.567149,516.220115,622.059178,735.324283,871.946004,1063.376638
RD_F_J,2023-03-29,511.985792,-72.108838,167.863634,303.493017,415.115239,521.684614,628.502281,744.705741,882.938807,1081.459771
RD_F_J,2023-03-30,496.144908,-90.307034,145.472903,290.552451,403.866852,506.140911,614.150565,734.790802,876.424855,1078.947941


In [None]:
test_df['cat_pred'] = cat_pred['answer']
test_df['gluon_pred'] = pred.reset_index()['mean']
test_df.loc[ test_df['gluon_pred'] < 0.0, 'gluon_pred'] = 0.0

In [43]:
test_df['mean'] = (test_df['cat_pred'] + test_df['gluon_pred']) / 2
test_df.loc[(test_df['day_of_week'] == 6) & (test_df['mean'] != 0), 'mean'] = 0

##Submission

In [44]:
submission = pd.read_csv('/content/drive/MyDrive/제주 특산물 가격 예측 AI 경진대회/open/sample_submission.csv')
submission['answer'] = test_df['mean']
submission.loc[ submission['answer'] < 0.0, 'answer'] = 0.0
submission.to_csv('./final_submission.csv', index=False)
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,3384.366995
1,TG_A_J_20230305,0.000000
2,TG_A_J_20230306,3253.171686
3,TG_A_J_20230307,3769.036588
4,TG_A_J_20230308,3422.133530
...,...,...
1087,RD_F_J_20230327,499.998590
1088,RD_F_J_20230328,495.230012
1089,RD_F_J_20230329,480.593583
1090,RD_F_J_20230330,491.342317
