# test train encoded csv 생성 코드 (11.10)_이혜승

## Import

In [1]:
import random
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings(action='ignore') 

## Fixed Random-Seed

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

## Load Data

In [7]:
train_df = pd.read_csv('../Desktop/competition/제주특산물가격예측/train.csv')
test_df = pd.read_csv('../Desktop/competition/제주특산물가격예측/test.csv')
international_trade_df = pd.read_csv('../Desktop/competition/제주특산물가격예측/international_trade.csv')
submission = pd.read_csv('../Desktop/competition/제주특산물가격예측/submission.csv')

In [9]:
import pandas as pd
from pytimekr import pytimekr 

def process_dataset(dataset):
    # timestamp 열을 datetime 객체로 변환
    dataset['timestamp'] = pd.to_datetime(dataset['timestamp'])

    # 연도, 월, 일, 주, 요일 열 추가
    dataset['year'] = dataset['timestamp'].dt.isocalendar().year
    dataset['month'] = dataset['timestamp'].dt.month
    dataset['day'] = dataset['timestamp'].dt.day
    dataset['week'] = dataset['timestamp'].dt.isocalendar().week
    dataset['weekday'] = dataset['timestamp'].dt.isocalendar().day

    # 주말 및 평일 여부를 나타내는 열 추가
    dataset['isWeekday'] = ((dataset['weekday'] >= 1) & (dataset['weekday'] <= 5)).astype(int)
    dataset['isSaturday'] = (dataset['weekday'] == 6).astype(int)
    dataset['isSunday'] = (dataset['weekday'] == 7).astype(int)

    # 더 이상 필요하지 않은 weekday 열 삭제
    dataset.drop('weekday', axis=1, inplace=True)
    
     # 공휴일 처리
    unique_years = dataset['year'].unique()
    year_for_holidays = []
    for i in unique_years:
        year_for_holidays.append(pytimekr.holidays(year=i))
    
    all_holidays = sum(year_for_holidays, [])

    dataset['holiday'] = 0
    holiday_rows = dataset[dataset['timestamp'].isin(all_holidays)]
    dataset.loc[holiday_rows.index, 'holiday'] = 1

    return dataset

In [13]:
train = process_dataset(train_df)
test = process_dataset(train_df)
display(train)
display(test)

Unnamed: 0,ID,timestamp,item,corporation,location,supply(kg),price(원/kg),year,month,day,week,isWeekday,isSaturday,isSunday,holiday
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0,2019,1,1,1,1,0,0,1
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0,2019,1,2,1,1,0,0,0
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0,2019,1,3,1,1,0,0,0
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0,2019,1,4,1,1,0,0,0
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0,2019,1,5,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,RD,F,J,452440.0,468.0,2023,2,27,9,1,0,0,0
59393,RD_F_J_20230228,2023-02-28,RD,F,J,421980.0,531.0,2023,2,28,9,1,0,0,0
59394,RD_F_J_20230301,2023-03-01,RD,F,J,382980.0,574.0,2023,3,1,9,1,0,0,1
59395,RD_F_J_20230302,2023-03-02,RD,F,J,477220.0,523.0,2023,3,2,9,1,0,0,0


Unnamed: 0,ID,timestamp,item,corporation,location,supply(kg),price(원/kg),year,month,day,week,isWeekday,isSaturday,isSunday,holiday
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0,2019,1,1,1,1,0,0,1
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0,2019,1,2,1,1,0,0,0
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0,2019,1,3,1,1,0,0,0
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0,2019,1,4,1,1,0,0,0
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0,2019,1,5,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,RD,F,J,452440.0,468.0,2023,2,27,9,1,0,0,0
59393,RD_F_J_20230228,2023-02-28,RD,F,J,421980.0,531.0,2023,2,28,9,1,0,0,0
59394,RD_F_J_20230301,2023-03-01,RD,F,J,382980.0,574.0,2023,3,1,9,1,0,0,1
59395,RD_F_J_20230302,2023-03-02,RD,F,J,477220.0,523.0,2023,3,2,9,1,0,0,0


In [18]:
international_trade_df.head()

Unnamed: 0,기간,품목명,수출 중량,수출 금액,수입 중량,수입 금액,무역수지,year,month
0,2019-01,토마토(신선한 것이나 냉장한 것으로 한정한다),356571,990,0,0,990,2019,1
1,2019-01,양파,821330,222,4003206,1118,-896,2019,1
2,2019-01,쪽파,60,1,93405,128,-127,2019,1
3,2019-01,꽃양배추와 브로콜리(broccoli),160,1,638913,563,-562,2019,1
4,2019-01,방울다다기 양배추,0,0,7580,38,-38,2019,1


In [17]:
international_trade_df['year'] = international_trade_df['기간'].apply(lambda x : int(x[0:4]))
international_trade_df['month'] = international_trade_df['기간'].apply(lambda x : int(x[5:7]))

In [48]:
# Define the replacements
replacements = {
    '양배추': 'CB',
    '감귤': 'TG',
    '당근': 'CR',
    '꽃양배추와 브로콜리(broccoli)': 'BC'
}

for original, replacement in replacements.items():
    international_trade_df['품목명'] = international_trade_df['품목명'].replace(original, replacement)

# Filter the dataframe to only keep rows with the new abbreviations
filtered_tradedata = international_trade_df[international_trade_df['품목명'].isin(replacements.values())]


In [50]:
filtered_tradedata.tail()

Unnamed: 0,기간,품목명,수출 중량,수출 금액,수입 중량,수입 금액,무역수지,year,month
1236,2023-01,TG,81509,269,0,0,269,2023,1
1248,2023-02,BC,24,0,332640,352,-352,2023,2
1250,2023-02,CB,13188,13,377456,104,-91,2023,2
1253,2023-02,CR,22510,20,9260020,3758,-3737,2023,2
1264,2023-02,TG,6895,34,27765,98,-64,2023,2


In [51]:
# Merge the international trade data with the train and test datasets
train_merged = train.merge(filtered_tradedata, how='left', left_on=['year', 'month', 'item'], right_on=['year', 'month', '품목명'])

# Drop the 'Item_Code' column as it is redundant after merging
# train_merged = train.drop(columns=['품목명'])

# Check the first few rows of the merged train dataset and for any missing values introduced by the merge
train_merged = train_merged.drop(columns=['기간'])
# train_merged = train_merged.fillna(0)
train_merged.tail()

Unnamed: 0,ID,timestamp,item,corporation,location,supply(kg),price(원/kg),year,month,day,...,isWeekday,isSaturday,isSunday,holiday,품목명,수출 중량,수출 금액,수입 중량,수입 금액,무역수지
59392,RD_F_J_20230227,2023-02-27,RD,F,J,452440.0,468.0,2023,2,27,...,1,0,0,0,,,,,,
59393,RD_F_J_20230228,2023-02-28,RD,F,J,421980.0,531.0,2023,2,28,...,1,0,0,0,,,,,,
59394,RD_F_J_20230301,2023-03-01,RD,F,J,382980.0,574.0,2023,3,1,...,1,0,0,1,,,,,,
59395,RD_F_J_20230302,2023-03-02,RD,F,J,477220.0,523.0,2023,3,2,...,1,0,0,0,,,,,,
59396,RD_F_J_20230303,2023-03-03,RD,F,J,427520.0,529.0,2023,3,3,...,1,0,0,0,,,,,,


In [45]:
import numpy as np

In [61]:

recent_trade_data = filtered_tradedata[
    filtered_tradedata['year'].isin([2019,2020,2021,2022])
]

# monthly_averages_recent = recent_trade_data.groupby(['month', '품목명'])
aggregated_trade_data = recent_trade_data.groupby(['year', 'month', '품목명']).mean().reset_index()
aggregated_trade_data
# .mean().reset_index()

# test_merged_with_recent_averages = test.merge(monthly_averages_recent, how='left', left_on=['month', 'item'], right_on=['month', '품목명'])

# test_merged_with_recent_averages = test_merged_with_recent_averages.drop(columns=['품목명'])

# test_merged_with_recent_averages =test_merged_with_recent_averages.drop(columns=['year_y'])
# test_merged =test_merged_with_recent_averages

# test_merged.head()


TypeError: agg function failed [how->mean,dtype->object]

In [60]:
recent_trade_data.to_csv("../Desktop/competition/제주특산물가격예측/recent_trade_data.csv")
train_merged.to_csv("../Desktop/competition/제주특산물가격예측/train_merged.csv")
test.to_csv("../Desktop/competition/제주특산물가격예측/test_to.csv")

In [30]:
recent_trade_data

Unnamed: 0,기간,품목명,수출 중량,수출 금액,수입 중량,수입 금액,무역수지,year,month
3,2019-01,BC,160,1,638913,563,-562,2019,1
5,2019-01,CB,184650,94,395802,90,4,2019,1
8,2019-01,CR,23150,22,7466150,2955,-2934,2019,1
17,2019-01,TG,58368,172,0,0,172,2019,1
28,2019-02,BC,780,1,396870,399,-398,2019,2
...,...,...,...,...,...,...,...,...,...
1187,2022-11,TG,1382242,1406,0,0,1406,2022,11
1197,2022-12,BC,3516,9,380160,380,-372,2022,12
1199,2022-12,CB,133572,84,299384,87,-3,2022,12
1202,2022-12,CR,21020,16,11060210,3674,-3658,2022,12


In [53]:
# 카테고리 변수 원핫인코딩
# One-hot encoding of categorical variables in both train and test datasets
train_encoded = pd.get_dummies(train_merged, columns=['corporation', 'location'])
test_encoded = pd.get_dummies(test_merged, columns=[ 'corporation', 'location'])

# Check the first few rows of train_encoded to confirm the changes
train_encoded.head()

Unnamed: 0,ID,timestamp,item,supply(kg),price(원/kg),year,month,day,weekday,public_holiday,...,수입 금액,무역수지,corporation_A,corporation_B,corporation_C,corporation_D,corporation_E,corporation_F,location_J,location_S
0,TG_A_J_20190101,2019-01-01,TG,0.0,0.0,2019,1,1,1,1,...,0.0,172.0,1,0,0,0,0,0,1,0
1,TG_A_J_20190102,2019-01-02,TG,0.0,0.0,2019,1,2,2,0,...,0.0,172.0,1,0,0,0,0,0,1,0
2,TG_A_J_20190103,2019-01-03,TG,60601.0,1728.0,2019,1,3,3,0,...,0.0,172.0,1,0,0,0,0,0,1,0
3,TG_A_J_20190104,2019-01-04,TG,25000.0,1408.0,2019,1,4,4,0,...,0.0,172.0,1,0,0,0,0,0,1,0
4,TG_A_J_20190105,2019-01-05,TG,32352.0,1250.0,2019,1,5,5,0,...,0.0,172.0,1,0,0,0,0,0,1,0


In [54]:
test_encoded = test_encoded.rename(columns={'year_x': 'year'})
test_encoded.head()

Unnamed: 0,ID,timestamp,item,year,month,day,weekday,public_holiday,수출 중량,수출 금액,...,수입 금액,무역수지,corporation_A,corporation_B,corporation_C,corporation_D,corporation_E,corporation_F,location_J,location_S
0,TG_A_J_20230304,2023-03-04,TG,2023,3,4,5,0,12674.5,59.0,...,23.0,36.0,1,0,0,0,0,0,1,0
1,TG_A_J_20230305,2023-03-05,TG,2023,3,5,6,0,12674.5,59.0,...,23.0,36.0,1,0,0,0,0,0,1,0
2,TG_A_J_20230306,2023-03-06,TG,2023,3,6,0,0,12674.5,59.0,...,23.0,36.0,1,0,0,0,0,0,1,0
3,TG_A_J_20230307,2023-03-07,TG,2023,3,7,1,0,12674.5,59.0,...,23.0,36.0,1,0,0,0,0,0,1,0
4,TG_A_J_20230308,2023-03-08,TG,2023,3,8,2,0,12674.5,59.0,...,23.0,36.0,1,0,0,0,0,0,1,0


In [55]:
test_encoded.to_csv('C:/Users/hyeseung/Desktop/4-2/Kuggle/제주특산물경진대회/test1110.csv', index=False)
train_encoded.to_csv('C:/Users/hyeseung/Desktop/4-2/Kuggle/제주특산물경진대회/train1110.csv', index=False)