# test train encoded csv 생성 코드

## Import

In [55]:
import random
import pandas as pd
import numpy as np
import os
from pytimekr import pytimekr
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

import warnings
# Silence every warning for the rest of the session (this also hides
# deprecation notices raised by the libraries used below).
warnings.filterwarnings('ignore')

## Fixed Random-Seed

In [56]:
def seed_everything(seed):
    """Seed the random number generators used by this script.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both the standard-library ``random`` module and NumPy's global
    generator with ``seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)  # fix the seed

## Load Data

In [57]:
# Read the four input CSV files from ../data; the order of the paths matches
# the order of the names being assigned.
train_df, test_df, international_trade_df, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train.csv',
        '../data/test.csv',
        '../data/international_trade.csv',
        '../data/sample_submission.csv',
    )
)

In [58]:
import pandas as pd 

def process_dataset(dataset):
    """Add calendar and holiday feature columns to ``dataset`` in place.

    The ``timestamp`` column is converted to datetime, then these columns are
    added (in this order): ``year``, ``month``, ``day``, ``week`` (ISO week
    number), ``isWeekday``, ``isSaturday``, ``isSunday`` and ``holiday``.
    The last four are 0/1 flags.

    Args:
        dataset: DataFrame with a ``timestamp`` column that
            ``pd.to_datetime`` can parse.

    Returns:
        The same DataFrame object, modified in place.
    """
    # Convert the timestamp column to datetime values.
    dataset['timestamp'] = pd.to_datetime(dataset['timestamp'])
    stamps = dataset['timestamp']
    iso = stamps.dt.isocalendar()

    # Use the calendar year, not the ISO year: the ISO year differs from the
    # calendar year for some days around New Year (e.g. 2021-01-01 belongs to
    # ISO year 2020). 'year' is paired with the calendar 'month' below and is
    # used as a year/month join key later in the script, so it must be the
    # calendar year.
    dataset['year'] = stamps.dt.year
    dataset['month'] = stamps.dt.month
    dataset['day'] = stamps.dt.day
    dataset['week'] = iso['week']

    # ISO weekday: 1 = Monday ... 7 = Sunday. Only used to derive the flags,
    # so it is not stored as a column.
    iso_weekday = iso['day']
    dataset['isWeekday'] = ((iso_weekday >= 1) & (iso_weekday <= 5)).astype(int)
    dataset['isSaturday'] = (iso_weekday == 6).astype(int)
    dataset['isSunday'] = (iso_weekday == 7).astype(int)

    # Holiday handling: gather the dates pytimekr returns for every calendar
    # year present in the data. int() hands pytimekr a plain Python int
    # rather than a NumPy scalar.
    holiday_dates = [
        holiday
        for year in dataset['year'].unique()
        for holiday in pytimekr.holidays(year=int(year))
    ]

    # Compare like with like: convert the holiday dates to datetime64 and
    # strip any time-of-day from the timestamps, instead of relying on isin()
    # implicitly casting date objects.
    holiday_stamps = pd.to_datetime(holiday_dates)
    dataset['holiday'] = stamps.dt.normalize().isin(holiday_stamps).astype(int)

    return dataset

In [59]:
# Add the calendar/holiday feature columns to both the train and test frames.
train_df, test_df = process_dataset(train_df), process_dataset(test_df)

In [60]:
# Split the '기간' text into integer parts: characters 0-3 are the year and
# characters 5-6 are the month.
international_trade_df['year'] = international_trade_df['기간'].map(lambda period: int(period[:4]))
international_trade_df['month'] = international_trade_df['기간'].map(lambda period: int(period[5:7]))

In [61]:
# Trade-data item name -> short item code used in the 'item' column of the
# train/test data.
# NOTE(review): the train data also contains item code 'RD' (see the groupby
# output further down), which has no entry here, so no trade row can ever
# match it -- confirm that this is intended.
replacements = {
    '양배추': 'CB',
    '감귤': 'TG',
    '당근': 'CR',
    '꽃양배추와 브로콜리(broccoli)': 'BC'
}

# Rename all mapped items in a single dictionary-based replace.
international_trade_df['품목명'] = international_trade_df['품목명'].replace(replacements)

# Keep only the trade rows whose item now carries one of the short codes.
filtered_tradedata = international_trade_df[
    international_trade_df['품목명'].isin(list(replacements.values()))
]


In [62]:
# Attach the trade figures to the training rows with a left join on
# (year, month, item code); training rows without a matching trade row keep
# NaN in the trade columns. '품목명' is only needed as the right-hand join
# key, so it is dropped from the result.
train_merged = (
    train_df
    .merge(
        filtered_tradedata,
        how='left',
        left_on=['year', 'month', 'item'],
        right_on=['year', 'month', '품목명'],
    )
    .drop(columns='품목명')
)

In [63]:
# Build one row of trade figures per item by averaging every March row of
# the filtered trade data, then attach those averages to the test rows by
# item code.
march_trade = filtered_tradedata[filtered_tradedata['month'] == 3]

# GroupBy.mean() takes `numeric_only` as its first parameter, not a list of
# columns. The column list that used to be passed here was therefore only
# interpreted as a truthy flag; state that intent explicitly. All numeric
# columns are averaged, and the averaged 'year'/'month' are dropped below.
march_trade = march_trade.groupby('품목명').mean(numeric_only=True).reset_index()
march_trade = march_trade.rename(columns={'품목명': 'item'})
march_trade = march_trade.drop(columns=['year', 'month'])

# Left join: test rows whose item has no March trade data keep NaN.
test_merge = test_df.merge(march_trade, on='item', how='left')

In [64]:
# One-hot encode the categorical columns 'corporation' and 'location' in
# both the train and the test frame (each frame is encoded independently).
train_final, test_final = (
    pd.get_dummies(frame, columns=['corporation', 'location'])
    for frame in (train_merged, test_merge)
)

# Displayed for inspection: mean March supply per item, split by the Sunday flag.
train_final.loc[train_final['month'] == 3].groupby(['item', 'isSunday'])['supply(kg)'].mean()

item  isSunday
BC    0            2316.326707
      1               0.000000
CB    0           48795.203670
      1               0.000000
CR    0           13303.179554
      1               0.000000
RD    0           66363.303899
      1               0.000000
TG    0            8630.705440
      1               0.000000
Name: supply(kg), dtype: float64

In [65]:
# Mean 'supply(kg)' per (item, week), computed from the March rows of the
# training data only.
supply_mean = (
    train_final.loc[train_final['month'] == 3]
    .groupby(['item', 'week'])['supply(kg)']
    .mean()
    .reset_index()
)

In [66]:
# Attach the per-(item, week) mean supply from supply_mean to every test row.
# Left join: test rows without a matching (item, week) get NaN.
merged_df = test_final.merge(supply_mean, on=['item', 'week'], how='left')

# Give the test frame the same columns, in the same order, as train_final,
# minus 'price(원/kg)' and '기간'.
new_column_order = train_final.columns.drop(['price(원/kg)', '기간'])

# Take an explicit copy: test_final is modified in place afterwards
# (fillna, new feature columns), and doing that on a bare column selection
# is what triggers pandas' SettingWithCopyWarning.
test_final = merged_df[new_column_order].copy()

In [67]:
# Replace missing values with 0 in both frames.
train_final = train_final.fillna(0)
test_final = test_final.fillna(0)

In [68]:
def _divide_or_nan(numerator, denominator):
    """Divide two Series element-wise, giving NaN where the denominator is 0."""
    # where() blanks the zero denominators to NaN, so x / 0 becomes NaN
    # instead of +/-inf (0 / 0 was already NaN).
    return numerator / denominator.where(denominator != 0)


def trade_feature(df):
    """Add derived trade columns to ``df`` in place and return it.

    Columns added:
        '무역 규모'      -- '수출 중량' + '수입 중량'
        '평균 수출 가격' -- '수출 금액' / '수출 중량'
        '평균 수입 가격' -- '수입 금액' / '수입 중량'
        '무역수지 비율'  -- ('수출 금액' - '수입 금액') / '무역수지'

    A ratio whose denominator is 0 is NaN rather than +/-inf. Infinite values
    would survive a later ``fillna(0)``, whereas NaN is replaced by it.

    Args:
        df: DataFrame containing the columns '수출 중량', '수출 금액',
            '수입 중량', '수입 금액' and '무역수지'.

    Returns:
        The same DataFrame object, with the four columns added.
    """
    # Trade volume
    df['무역 규모'] = df['수출 중량'] + df['수입 중량']
    df['평균 수출 가격'] = _divide_or_nan(df['수출 금액'], df['수출 중량'])
    df['평균 수입 가격'] = _divide_or_nan(df['수입 금액'], df['수입 중량'])
    df['무역수지 비율'] = _divide_or_nan(df['수출 금액'] - df['수입 금액'], df['무역수지'])
    return df

In [69]:
# Add the derived trade columns to both frames.
train_final, test_final = trade_feature(train_final), trade_feature(test_final)

In [70]:
# Replace the missing values left by the derived trade columns with 0.
train_final = train_final.fillna(0)
test_final = test_final.fillna(0)