In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error

from scipy.stats import uniform, randint


import random
import os

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

In [2]:
# 데이터 호출
train = pd.read_csv("./코드_정리/new_ohed_train_1125.csv")

In [3]:
train = train[train["timestamp"].map(lambda x : not x.startswith("2023"))]

In [4]:
test = train[train["timestamp"].map(lambda x : x.startswith("2022-11") or x.startswith("2022-12"))]

In [5]:
train.head()

Unnamed: 0,ID,timestamp,BC,CB,CR,RD,TG,A,B,C,...,Sunday,Thursday,Tuesday,Wednesday,supply_sum,avg_supply,avg_price,division,before_price,group
0,BC_A_J_20190101,2019-01-01,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,36832.0,0.0,0.0,0.0,0,BC_A_J
1,BC_A_J_20190102,2019-01-02,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,44896.0,0.0,0.0,0.0,0,BC_A_J
2,BC_A_J_20190103,2019-01-03,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,51520.0,6776.0,2537.2,333.696957,0,BC_A_J
3,BC_A_J_20190104,2019-01-04,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,52256.0,12100.0,2811.8,651.078919,2559,BC_A_J
4,BC_A_J_20190105,2019-01-05,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,52232.0,10827.2,2252.4,466.901234,2425,BC_A_J


In [6]:
test.head()

Unnamed: 0,ID,timestamp,BC,CB,CR,RD,TG,A,B,C,...,Sunday,Thursday,Tuesday,Wednesday,supply_sum,avg_supply,avg_price,division,before_price,group
1400,BC_A_J_20221101,2022-11-01,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,13032.0,1158.0,1977.0,175.672652,3761,BC_A_J
1401,BC_A_J_20221102,2022-11-02,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,13768.0,1098.0,3083.5,245.909573,3010,BC_A_J
1402,BC_A_J_20221103,2022-11-03,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,15336.0,1268.0,3057.75,252.818662,2987,BC_A_J
1403,BC_A_J_20221104,2022-11-04,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,17104.0,1482.0,3574.0,309.674228,2947,BC_A_J
1404,BC_A_J_20221105,2022-11-05,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,18688.0,1860.0,3866.75,384.854185,3265,BC_A_J


In [7]:
# 데이터 shape 확인
train.shape, test.shape

((56979, 30), (2379, 30))

In [8]:
# test의 group을 확인하여 필요한 값 확인
need_id = test["group"].unique()

In [9]:
need_id

array(['BC_A_J', 'BC_A_S', 'BC_B_J', 'BC_B_S', 'BC_C_J', 'BC_C_S',
       'BC_D_J', 'BC_E_J', 'BC_E_S', 'CB_A_J', 'CB_A_S', 'CB_D_J',
       'CB_E_J', 'CB_F_J', 'CR_A_J', 'CR_B_J', 'CR_C_J', 'CR_D_J',
       'CR_D_S', 'CR_E_J', 'CR_E_S', 'RD_A_J', 'RD_A_S', 'RD_C_S',
       'RD_D_J', 'RD_D_S', 'RD_E_J', 'RD_E_S', 'RD_F_J', 'TG_A_J',
       'TG_A_S', 'TG_B_J', 'TG_B_S', 'TG_C_J', 'TG_C_S', 'TG_D_J',
       'TG_D_S', 'TG_E_J', 'TG_E_S'], dtype=object)

In [10]:
train = train.set_index("group")
test = test.set_index("group")

In [11]:
items = ['BC', 'CB', 'CR', 'RD', 'TG']
cors = ['A', 'B', 'C', 'D', 'E', 'F']
locs = ["J", "S"]

In [12]:
# 아이디별로 데이터 나누기
train_list = []
test_list = []

for item in items :
    for cor in cors :
        for loc in locs :
            group = f"{item}_{cor}_{loc}"
            
            # train의 index에 존재하는 조합의 group으로만 코드 진행
            if group in train.index :
            
                locals()[f"train_{group}"] = train.loc[group]
                locals()[f"test_{group}"] = test.loc[group]
                
                train_list.append(f"train_{group}")
                test_list.append(f"test_{group}")
                
            else :
                print(f"{group} was passed")

BC_D_S was passed
BC_F_J was passed
BC_F_S was passed
CB_B_J was passed
CB_B_S was passed
CB_C_J was passed
CB_C_S was passed
CB_D_S was passed
CB_E_S was passed
CB_F_S was passed
CR_A_S was passed
CR_B_S was passed
CR_C_S was passed
CR_F_J was passed
CR_F_S was passed
RD_B_J was passed
RD_B_S was passed
RD_C_J was passed
RD_F_S was passed
TG_F_J was passed
TG_F_S was passed


In [13]:
# 리스트의 요소 확인
test_list[0], train_list[0]

('test_BC_A_J', 'train_BC_A_J')

In [14]:
# 리스트의 길이 확인
len(test_list), len(train_list)

(39, 39)

In [15]:
# ExtraTree 학습 코드

# while문을 조절할 r정의
r = 1

# 결과 받을 리스트 생성
mean_score_list = []
mean_rmse_list = []

# r 이 5미만일 때까지 반복
while r != 5 :
    
    # 결과값 받을 빈 데이터 프레임 생성
    answers_df = pd.DataFrame()

    # 결과 받을 리스트, 딕셔너리 생성
    score_list = []
    rmse_dict = {}
    
    for i in range(39) :

        # locals함수를 이용하여 리스트에서 i번째 test, train 변수 호출

        test = locals()[f"{test_list[i]}"] 
        test = test.reset_index()
        test = test.set_index("ID")
        test = test.drop("group", axis = 1)
        test = test.drop("timestamp", axis = 1)
     
        train = locals()[f"{train_list[i]}"]
        train = train.reset_index()
        train = train.set_index("ID")
        train = train.drop("group", axis = 1)
        train = train.drop("timestamp", axis = 1)
     
        # 사용중인 데이터 출력
        print(f"test_list : {test_list[i]}, train_list : {train_list[i]}, i : {i}, r : {r}")
        
        # 데이터 분할        
        x = train.drop(["price"], axis = 1)
        y = train["price"]

        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1 ,random_state=seed_everything(42), shuffle=False)
        x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.1, random_state=seed_everything(42), shuffle=False)
                  
        # 랜덤서치 파라미터 설정 
        params = {"min_impurity_decrease" : uniform(0.0001 , 0.001),
          "max_depth" : randint(5 , 70),
          "min_samples_split" : randint(2 , 50),
          "min_samples_leaf" : randint(1,  50)}

        et = RandomizedSearchCV(ExtraTreesRegressor(random_state = seed_everything(42)), 
                                params, n_iter = 100, n_jobs = -1, random_state = seed_everything(42))
         

        # 모델 훈련 및 score 저장
        et.fit(x_train, y_train)

        mean_test_socre = np.max(et.cv_results_["mean_test_score"])
        score_list.append(mean_test_socre)
                                
        # 모델의 best_params, mean_test_score 출력
        print(f"\t {r}-model's Best prams : {et.best_params_}")
        print(f"\t mean_test_socre : {mean_test_socre}")
                        
        # x_val에 대한 예측 실행
        predict = et.predict(x_val)
        
        # rmse 추출
        mse = mean_squared_error(y_val, predict)
        rmse = mse** (1/2)
        
        # rmse 출력
        print(f"\t rmse : {rmse}")
        print()
        
        # rmse를 딕셔너리의 형태로 저장
        rmse_dict[f"{train_list[i]}"] = rmse
        
        # 테스트 데이터로 predict 진행 및 결과를 데이터프레임으로 저장
        test = test.drop("price", axis = 1)
                                
        answer = et.predict(test)
        answer = pd.DataFrame(answer, columns = ["answer"], index = test.index)

        answers_df = pd.concat((answers_df, answer), axis = 0)
    
    # r번째 데이터프레임을 csv파일로 저장
    answers_df.to_csv(f"./코드_정리/predict_ExtraTreetRegressor_new_data_1205{r}.csv", encoding="utf-8")
    
    # r번째 score 및 rmse를 리스트에 저장
    mean_score = np.mean(score_list) 
    mean_score_list.append(mean_score)
    mean_rmse_list.append(rmse_dict)
    
    # r을 1 증가시켜 while문 조절
    r += 1                         

test_list : test_BC_A_J, train_list : train_BC_A_J, i : 0, r : 1
	 1-model's Best prams : {'max_depth': 26, 'min_impurity_decrease': 0.0008300393165618186, 'min_samples_leaf': 3, 'min_samples_split': 13}
	 mean_test_socre : 0.9123691862334526
	 rmse : 193.14534291503557

test_list : test_BC_A_S, train_list : train_BC_A_S, i : 1, r : 1
	 1-model's Best prams : {'max_depth': 28, 'min_impurity_decrease': 0.0007508884729488529, 'min_samples_leaf': 2, 'min_samples_split': 25}
	 mean_test_socre : 0.847786013762495
	 rmse : 380.9522220775695

test_list : test_BC_B_J, train_list : train_BC_B_J, i : 2, r : 1
	 1-model's Best prams : {'max_depth': 26, 'min_impurity_decrease': 0.0008300393165618186, 'min_samples_leaf': 3, 'min_samples_split': 13}
	 mean_test_socre : 0.8350397790648489
	 rmse : 498.7872445338647

test_list : test_BC_B_S, train_list : train_BC_B_S, i : 3, r : 1
	 1-model's Best prams : {'max_depth': 26, 'min_impurity_decrease': 0.0008300393165618186, 'min_samples_leaf': 3, 'min_sam

	 1-model's Best prams : {'max_depth': 23, 'min_impurity_decrease': 0.0008259556788702394, 'min_samples_leaf': 3, 'min_samples_split': 21}
	 mean_test_socre : 0.8859656166643702
	 rmse : 1089.3298184752214

test_list : test_TG_B_J, train_list : train_TG_B_J, i : 31, r : 1
	 1-model's Best prams : {'max_depth': 36, 'min_impurity_decrease': 0.000652893089071328, 'min_samples_leaf': 4, 'min_samples_split': 12}
	 mean_test_socre : 0.8051653723093262
	 rmse : 2409.750993740213

test_list : test_TG_B_S, train_list : train_TG_B_S, i : 32, r : 1
	 1-model's Best prams : {'max_depth': 36, 'min_impurity_decrease': 0.000652893089071328, 'min_samples_leaf': 4, 'min_samples_split': 12}
	 mean_test_socre : 0.899797169264027
	 rmse : 1051.5918492787516

test_list : test_TG_C_J, train_list : train_TG_C_J, i : 33, r : 1
	 1-model's Best prams : {'max_depth': 26, 'min_impurity_decrease': 0.0008300393165618186, 'min_samples_leaf': 3, 'min_samples_split': 13}
	 mean_test_socre : 0.8290576579441173
	 rmse 

	 2-model's Best prams : {'max_depth': 28, 'min_impurity_decrease': 0.0007508884729488529, 'min_samples_leaf': 2, 'min_samples_split': 25}
	 mean_test_socre : 0.8059690714467832
	 rmse : 84.20505537404779

test_list : test_RD_C_S, train_list : train_RD_C_S, i : 23, r : 2
	 2-model's Best prams : {'max_depth': 5, 'min_impurity_decrease': 0.0002732943200708458, 'min_samples_leaf': 1, 'min_samples_split': 48}
	 mean_test_socre : 0.9731388503512515
	 rmse : 0.0

test_list : test_RD_D_J, train_list : train_RD_D_J, i : 24, r : 2
	 2-model's Best prams : {'max_depth': 28, 'min_impurity_decrease': 0.0007508884729488529, 'min_samples_leaf': 2, 'min_samples_split': 25}
	 mean_test_socre : 0.874424170273004
	 rmse : 74.93826170156981

test_list : test_RD_D_S, train_list : train_RD_D_S, i : 25, r : 2
	 2-model's Best prams : {'max_depth': 28, 'min_impurity_decrease': 0.0007508884729488529, 'min_samples_leaf': 2, 'min_samples_split': 25}
	 mean_test_socre : 0.8553900436859913
	 rmse : 112.264293206

	 3-model's Best prams : {'max_depth': 28, 'min_impurity_decrease': 0.0007508884729488529, 'min_samples_leaf': 2, 'min_samples_split': 25}
	 mean_test_socre : 0.8172135247175205
	 rmse : 1313.9450955974337

test_list : test_CR_B_J, train_list : train_CR_B_J, i : 15, r : 3
	 3-model's Best prams : {'max_depth': 28, 'min_impurity_decrease': 0.0007508884729488529, 'min_samples_leaf': 2, 'min_samples_split': 25}
	 mean_test_socre : 0.8788823294200663
	 rmse : 24.12241357923809

test_list : test_CR_C_J, train_list : train_CR_C_J, i : 16, r : 3
	 3-model's Best prams : {'max_depth': 59, 'min_impurity_decrease': 0.0010832308858067883, 'min_samples_leaf': 3, 'min_samples_split': 38}
	 mean_test_socre : 0.8535654902254437
	 rmse : 68.93383077533703

test_list : test_CR_D_J, train_list : train_CR_D_J, i : 17, r : 3
	 3-model's Best prams : {'max_depth': 28, 'min_impurity_decrease': 0.0007508884729488529, 'min_samples_leaf': 2, 'min_samples_split': 25}
	 mean_test_socre : 0.887940055892971
	 rmse

	 4-model's Best prams : {'max_depth': 36, 'min_impurity_decrease': 0.000951136671516857, 'min_samples_leaf': 3, 'min_samples_split': 19}
	 mean_test_socre : 0.8660447279571333
	 rmse : 590.16146893888

test_list : test_BC_E_J, train_list : train_BC_E_J, i : 7, r : 4
	 4-model's Best prams : {'max_depth': 26, 'min_impurity_decrease': 0.0008300393165618186, 'min_samples_leaf': 3, 'min_samples_split': 13}
	 mean_test_socre : 0.8790795745402905
	 rmse : 295.57979744340156

test_list : test_BC_E_S, train_list : train_BC_E_S, i : 8, r : 4
	 4-model's Best prams : {'max_depth': 26, 'min_impurity_decrease': 0.0008300393165618186, 'min_samples_leaf': 3, 'min_samples_split': 13}
	 mean_test_socre : 0.8649852025802112
	 rmse : 460.4902295499161

test_list : test_CB_A_J, train_list : train_CB_A_J, i : 9, r : 4
	 4-model's Best prams : {'max_depth': 36, 'min_impurity_decrease': 0.000652893089071328, 'min_samples_leaf': 4, 'min_samples_split': 12}
	 mean_test_socre : 0.5316709498541666
	 rmse : 49.

	 4-model's Best prams : {'max_depth': 26, 'min_impurity_decrease': 0.0008300393165618186, 'min_samples_leaf': 3, 'min_samples_split': 13}
	 mean_test_socre : 0.6619211979020573
	 rmse : 2143.243234514279

test_list : test_TG_E_S, train_list : train_TG_E_S, i : 38, r : 4
	 4-model's Best prams : {'max_depth': 36, 'min_impurity_decrease': 0.000951136671516857, 'min_samples_leaf': 3, 'min_samples_split': 19}
	 mean_test_socre : 0.8023956464700523
	 rmse : 1193.9833245210527



In [16]:
# mean_rmse_list 확인
mean_rmse_list

[{'train_BC_A_J': 193.14534291503557,
  'train_BC_A_S': 380.9522220775695,
  'train_BC_B_J': 498.7872445338647,
  'train_BC_B_S': 103.9674222595631,
  'train_BC_C_J': 558.2059905403723,
  'train_BC_C_S': 0.12066047159554663,
  'train_BC_D_J': 596.7463599159821,
  'train_BC_E_J': 282.7482072906707,
  'train_BC_E_S': 450.9658497678171,
  'train_CB_A_J': 59.89167741615253,
  'train_CB_A_S': 0.12164402433694374,
  'train_CB_D_J': 108.30745918907795,
  'train_CB_E_J': 179.17992623808277,
  'train_CB_F_J': 155.48238532496504,
  'train_CR_A_J': 1313.9450955974337,
  'train_CR_B_J': 24.12241357923809,
  'train_CR_C_J': 89.38278488773139,
  'train_CR_D_J': 56.31164544986941,
  'train_CR_D_S': 1.684546475344392,
  'train_CR_E_J': 67.30382980217121,
  'train_CR_E_S': 0.0,
  'train_RD_A_J': 656.9626966126767,
  'train_RD_A_S': 84.20505537404779,
  'train_RD_C_S': 0.03686239014293692,
  'train_RD_D_J': 74.7371079780897,
  'train_RD_D_S': 112.26429320693104,
  'train_RD_E_J': 172.98635429743294,
  '

In [17]:
# 아이템별로 확인하기 위해 items 확인
items

['BC', 'CB', 'CR', 'RD', 'TG']

In [18]:
# 아이템별로 rmse 확인
rmse_dict_ = {}
for i in range(len(mean_rmse_list)) :
    data = mean_rmse_list[i]
    locals()[f"mean_rmse_list_{i}"] = pd.DataFrame(data.items(), columns = ["group", "RMSE"])
    locals()[f"mean_rmse_list_{i}"]["group"] = locals()[f"mean_rmse_list_{i}"]["group"].map(lambda x : x[6:])
    for item in items :
        locals()[f"mean_rmse_list_{i}_{item}"] = locals()[f"mean_rmse_list_{i}"][locals()[f"mean_rmse_list_{i}"]["group"].map(lambda x : x.startswith(item))]
        
        rmse_dict_[item] = locals()[f"mean_rmse_list_{i}_{item}"]["RMSE"].mean()

In [19]:
# rmse_dict_ 확인
rmse_dict_

{'BC': 344.63967253659735,
 'CB': 99.27588092669056,
 'CR': 219.2060857802264,
 'RD': 165.6072427851816,
 'TG': 1784.9847234421638}

In [20]:
# rmse_dict_에 대한 데이터프레임 생성
rmse_dict_df = pd.DataFrame(rmse_dict_.items(), columns = ["group", "RMSE"])

In [21]:
rmse_dict_df["model"] = "ExtraTree"

In [22]:
# 최종 데이터프레임 확인
rmse_dict_df

Unnamed: 0,group,RMSE,model
0,BC,344.639673,ExtraTree
1,CB,99.275881,ExtraTree
2,CR,219.206086,ExtraTree
3,RD,165.607243,ExtraTree
4,TG,1784.984723,ExtraTree
