In [None]:
!pip install autogluon

In [1]:
import pandas as pd
import numpy as np
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

### 학습 데이터 로딩

In [3]:
drive_path = "/content/drive/MyDrive/데이터 경진대회/제주 특산물 가격 예측"
train_data_path = drive_path + "/train.csv"
test_data_path = drive_path + "/test.csv"
submission_path = drive_path + "/sample_submission.csv"

In [4]:
train_df = pd.read_csv(train_data_path)
test_df = pd.read_csv(test_data_path)

### 랜덤 시드 번호 지정

In [5]:
random_seed = 42

### 데이터 전처리

In [19]:
def map_to_model_train_input_data(df) :
  transformed_df = df[["ID", "timestamp", "price(원/kg)"]].copy()
  transformed_df["item_id"] = transformed_df["ID"].str.replace(r'_\d{8}$', '', regex=True)
  transformed_df = transformed_df.rename(columns={'price(원/kg)': 'y'})
  return transformed_df

In [20]:
model_train_input = map_to_model_train_input_data(train_df)

In [21]:
time_series_data = TimeSeriesDataFrame(model_train_input.drop(columns=["ID"]))

In [23]:
predictor = TimeSeriesPredictor(
    prediction_length=28,
    target="y",
    eval_metric="RMSE"
)

In [None]:
predictor.fit(time_series_data, random_seed=random_seed)

In [None]:
predictor.refit_full()

In [26]:
pred = predictor.predict(time_series_data, random_seed=random_seed)

INFO:lightning_fabric.utilities.seed:Global seed set to 42
Model not specified in predict, will default to the model with the best validation score: WeightedEnsemble_FULL


In [27]:
pred

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
item_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
TG_A_J,2023-03-04,3267.430632,1331.178563,2009.883061,2500.718941,2917.926294,3289.385218,3666.367876,4057.338970,4544.137411,5317.154420
TG_A_J,2023-03-05,613.399876,-1640.499803,-868.076011,-311.984476,157.961232,593.611114,1041.550743,1515.186085,2068.962429,2833.426812
TG_A_J,2023-03-06,3062.702166,-297.482752,1020.896791,1827.865113,2472.375984,3070.894940,3670.766445,4346.067951,5110.855335,6369.115225
TG_A_J,2023-03-07,3457.144647,-256.408123,1014.433332,2004.326054,2747.181574,3448.386784,4150.071801,4876.925690,5732.017292,7090.121087
TG_A_J,2023-03-08,3357.136627,-540.531784,843.662040,1801.150393,2611.578797,3370.875726,4137.962038,4940.406982,5946.660945,7311.684927
...,...,...,...,...,...,...,...,...,...,...,...
RD_F_J,2023-03-27,492.997841,-154.064923,100.713694,261.549080,393.129356,512.939985,631.683710,759.471722,909.018135,1128.595079
RD_F_J,2023-03-28,516.488705,-186.772902,102.598754,258.189246,398.162304,523.497273,645.413322,781.181480,938.677191,1165.003650
RD_F_J,2023-03-29,514.444653,-196.348553,96.625750,258.580808,395.465607,523.376401,647.572299,782.616567,941.959033,1163.596652
RD_F_J,2023-03-30,491.955314,-222.693186,80.035492,242.278303,376.859391,499.855953,623.765194,762.937967,925.997067,1149.978445


### 해당 날짜의 가격이 0인지 0이 아닌지를 판단하는 모델 적용

In [65]:
import torch.nn as nn
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset, random_split
from sklearn.preprocessing import LabelEncoder
import os
import random

In [55]:
class BinaryLSTM(nn.Module):
  def __init__(self, input_size, hidden_size, num_layers, output_size, device):
    super(BinaryLSTM, self).__init__()
    self.device = device
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
    self.fc = nn.Linear(hidden_size, output_size)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(self.device)
    c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(self.device)
    out, _ = self.lstm(x, (h0, c0))
    out = self.fc(out[:, -1, :])
    out = self.sigmoid(out)
    return out

In [58]:
def label_encoding_category_columns_and_get_label_encoder(train_df, test_df, category_columns):
  le_dict = {}
  for column in category_columns:
    le = LabelEncoder()
    train_df[column] = le.fit_transform(train_df[column])
    test_df[column] = le.transform(test_df[column])
  return le_dict

In [61]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

seed_everything(random_seed) # Seed 고정

In [72]:
# lstm 모델의 입력을 들어갈 데이터 전처리
def handle_data_to_binary_lstm(df) :
  transformed_df = df[["ID", "timestamp", "item", "corporation", "location", "price(원/kg)"]].copy()
  transformed_df["timestamp"] = pd.to_datetime(df["timestamp"])
  transformed_df["ID"] = transformed_df["ID"].str.replace(r'_\d{8}$', '', regex=True)
  transformed_df["day of week"] = transformed_df["timestamp"].dt.dayofweek
  transformed_df["month"] = transformed_df["timestamp"].dt.month
  transformed_df["day of month"] = transformed_df["timestamp"].dt.day
  transformed_df = transformed_df.rename(columns={'timestamp': 'ds', 'price(원/kg)': 'y'})
  transformed_df.loc[transformed_df["y"] > 0, "y"] = 1
  return transformed_df.drop(columns=["ds"])

In [63]:
def separate_data_label(df, label_key, except_keys) :
  data = df.drop(columns=[label_key, *except_keys])
  label = df[label_key]
  return data, label

In [66]:
class BinaryLstmDataset(Dataset):
  def __init__(self, dataframe):
    self.data = dataframe[['day of week', 'month', 'day of month']].values
    self.labels = dataframe['y'].values

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    sample = {'input': torch.tensor(self.data[idx], dtype=torch.float32),
              'label': torch.tensor(self.labels[idx], dtype=torch.float32)}
    return sample

In [67]:
def build_lstm_dataset(df, seq, key_position) :
  x = []
  y = []

  for i in range(0, len(df) - seq):
    _x = df[i:i+seq, 1:]
    _y = df[i+seq, key_position]
    x.append(_x.astype(np.float32))
    y.append(_y)
  return np.array(x), np.array(y)

In [68]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
batch_size = 256
seq_len = 356

In [69]:
qual_col = ['item', 'corporation', 'location']
le_dict = label_encoding_category_columns_and_get_label_encoder(train_df, test_df, qual_col)

In [73]:
lstm_train_data = handle_data_to_binary_lstm(train_df)

In [76]:
lstm_train_data.head(5)

Unnamed: 0,ID,item,corporation,location,y,day of week,month,day of month
0,TG_A_J,4,0,0,0.0,1,1,1
1,TG_A_J,4,0,0,0.0,2,1,2
2,TG_A_J,4,0,0,1.0,3,1,3
3,TG_A_J,4,0,0,1.0,4,1,4
4,TG_A_J,4,0,0,1.0,5,1,5


In [74]:
test_df_with_empty_price = test_df.copy()
test_df_with_empty_price["price(원/kg)"] = 0
lstm_test_data = handle_data_to_binary_lstm(test_df_with_empty_price)

In [75]:
lstm_test_data.head(5)

Unnamed: 0,ID,item,corporation,location,y,day of week,month,day of month
0,TG_A_J,4,0,0,0,5,3,4
1,TG_A_J,4,0,0,0,6,3,5
2,TG_A_J,4,0,0,0,0,3,6
3,TG_A_J,4,0,0,0,1,3,7
4,TG_A_J,4,0,0,0,2,3,8


In [77]:
category_list = train_df["ID"].str.replace(r'_\d{8}$', '', regex=True).unique()

In [78]:
total_x, total_y = np.empty((0, seq_len, 7)), np.array([])
for category in category_list:
  current_category_data = lstm_train_data[lstm_train_data["ID"] == category]
  _x, _y = build_lstm_dataset(np.array(current_category_data), seq_len, 4)

  total_x = np.concatenate((total_x, _x), axis = 0)
  total_y = np.concatenate((total_y, _y), axis = 0)

In [79]:
_train_x_tensor = torch.FloatTensor(total_x).to(device)
_train_y_tensor = torch.FloatTensor(total_y).to(device)
lstm_dataset = TensorDataset(_train_x_tensor, _train_y_tensor)
lstm_dataloader = DataLoader(lstm_dataset, batch_size, shuffle=True)

In [80]:
train_dataloader, val_dataloader = random_split(lstm_dataset, [0.8, 0.2])

In [81]:
train_dataloader = DataLoader(train_dataloader, batch_size=batch_size, shuffle=False)
val_dataloader = DataLoader(val_dataloader, batch_size=batch_size, shuffle=False)

In [82]:
binary_lstm_model = BinaryLSTM(7, seq_len, 1, 2, device).to(device)

In [83]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(binary_lstm_model.parameters(), lr=0.0005)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)
iter = 100

In [85]:
read_weight_file_path = drive_path + f"/binary_lstm_weight.pt"
write_weight_file_path = drive_path + f"/binary_lstm_weight_1117.pt"

In [86]:
binary_lstm_model.load_state_dict(torch.load(read_weight_file_path, map_location=torch.device(device)))

<All keys matched successfully>

In [None]:
for epoch in range(iter) :
  binary_lstm_model.train()

  for i, batch in enumerate(train_dataloader):
    inputs = batch[0]
    labels = batch[1].long()
    outputs = binary_lstm_model(inputs)
    loss = criterion(outputs, labels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if i % 20 == 19 :
      print(f"Epoch {epoch + 1} / {iter} - {i}, train loss : {loss:.4f}")

  scheduler.step()
  binary_lstm_model.eval()
  with torch.no_grad():
    val_loss = 0

    for inputs, labels in val_dataloader:
      outputs = binary_lstm_model(inputs)
      val_loss += criterion(outputs, labels.long())

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch + 1} / {iter}, valiodation loss : {avg_val_loss:.4f}")

    if avg_val_loss < min_val_loss :
      print(f"model weight saved! valiodation loss : {min_val_loss:.4f} -> {avg_val_loss:.4f}")
      min_val_loss = avg_val_loss
      weight_path = drive_path + f"/binary_lstm_weight.pt"
      torch.save(binary_lstm_model.state_dict(), weight_path)

In [87]:
non_zero_dict = {}
for category in category_list :
  current_category_data = lstm_train_data[lstm_train_data["ID"] == category].iloc[-seq_len:]
  current_category_data = current_category_data.drop(columns=["ID"])

  current_category_predict_data = lstm_test_data[lstm_test_data["ID"] == category]
  current_category_predict_data = current_category_predict_data.drop(columns=["ID"])

  predict_steps = len(current_category_predict_data)

  predict_list = []
  for i in range(predict_steps) :
    current_input = pd.concat([current_category_data[i:], current_category_predict_data[:i]], axis=0)
    input_tensor = torch.tensor(current_input.values, dtype=torch.float32)
    input_tensor = input_tensor[None, :]

    with torch.no_grad():
      binary_lstm_model.eval()
      output = binary_lstm_model(input_tensor).tolist()[0]
      predict = output.index(max(output))

    current_category_predict_data.loc[current_category_predict_data.index[i], "y"] = predict
    predict_list.append(predict)

  non_zero_dict[category] = predict_list
  print(f"{category} non-zero predict is done!")

TG_A_J non-zero predict is done!
TG_A_S non-zero predict is done!
TG_B_J non-zero predict is done!
TG_B_S non-zero predict is done!
TG_C_J non-zero predict is done!
TG_C_S non-zero predict is done!
TG_D_J non-zero predict is done!
TG_D_S non-zero predict is done!
TG_E_J non-zero predict is done!
TG_E_S non-zero predict is done!
CR_A_J non-zero predict is done!
CR_B_J non-zero predict is done!
CR_C_J non-zero predict is done!
CR_D_J non-zero predict is done!
CR_D_S non-zero predict is done!
CR_E_J non-zero predict is done!
CR_E_S non-zero predict is done!
CB_A_J non-zero predict is done!
CB_A_S non-zero predict is done!
CB_D_J non-zero predict is done!
CB_E_J non-zero predict is done!
RD_A_J non-zero predict is done!
RD_A_S non-zero predict is done!
RD_C_S non-zero predict is done!
RD_D_J non-zero predict is done!
RD_D_S non-zero predict is done!
RD_E_J non-zero predict is done!
RD_E_S non-zero predict is done!
BC_A_J non-zero predict is done!
BC_A_S non-zero predict is done!
BC_B_J non

In [98]:
binary_price_df = pd.DataFrame(columns=["category", "y"])

In [None]:
for category in category_list :
  current_non_zero_list = non_zero_dict[category]
  for binary_data in current_non_zero_list:
    binary_price_df = binary_price_df.append({"category" : category, "y" : binary_data}, ignore_index=True)

In [138]:
binary_price_df.head(5)

Unnamed: 0,category,y
0,TG_A_J,1
1,TG_A_J,0
2,TG_A_J,1
3,TG_A_J,1
4,TG_A_J,1


### 제출 파일 생성

In [132]:
submission = pd.read_csv(submission_path)

In [133]:
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,0
1,TG_A_J_20230305,0
2,TG_A_J_20230306,0
3,TG_A_J_20230307,0
4,TG_A_J_20230308,0
...,...,...
1087,RD_F_J_20230327,0
1088,RD_F_J_20230328,0
1089,RD_F_J_20230329,0
1090,RD_F_J_20230330,0


In [134]:
time_series_result = submission.copy()
time_series_result["answer"] = pred.reset_index()["mean"]
time_series_result.loc[time_series_result["answer"] < 0, "answer"] = 0.0

In [135]:
binary_price_df["y"].values * time_series_result["answer"].values

array([3267.4306323488863, 0.0, 3062.702166184844, ..., 514.4446527226123,
       491.9553139580838, 477.6558167332842], dtype=object)

In [136]:
submission["answer"] = binary_price_df["y"].values * time_series_result["answer"].values

In [131]:
submission.to_csv(drive_path + "/submission_231117_2.csv", index=False)

In [137]:
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,3267.430632
1,TG_A_J_20230305,0.0
2,TG_A_J_20230306,3062.702166
3,TG_A_J_20230307,3457.144647
4,TG_A_J_20230308,3357.136627
...,...,...
1087,RD_F_J_20230327,492.997841
1088,RD_F_J_20230328,516.488705
1089,RD_F_J_20230329,514.444653
1090,RD_F_J_20230330,491.955314


### 테스트용 코드

In [50]:
def show_all_dataframe(df):
  pd.set_option("display.max_rows", None)
  print(df)
  pd.reset_option("display.max_rows")

In [None]:
show_all_dataframe(submission)