In [8]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd

## 데이터 전처리 

- DropColumns: 사용하지 않는 칼럼 제거
- DropRows: "중품" 등급은 사용할 데이터에서 제거
- ConvertDate: 날짜를 string에서 datetime 클래스로 변경
- DateTimeFeatures: ConvertDate에서 변환한 datetime 값을 year, month, day, weekday, is_weekend 피처로 분리

In [9]:
# Load the raw dataset, then report its dimensions and column labels.
data_raw = pd.read_csv("데이터.csv")
print(data_raw.shape, data_raw.columns, sep="\n")

(394, 22)
Index(['DATE', '요일', '품목', '품종', '거래단위', '등급', '평균가격', '전일_x', '조회일자', '지역(시)',
       '지역(군)', '평균 기온(°C)', '최고 기온(°C)', '최저 기온(°C)', '평균 강수량(mm)',
       '평균 일조시간(hr)', '평균 습도(%)', '평균 적설량(cm)', '평균 순간최대풍속(m/s)', '품목명',
       '총반입량', '전일_y'],
      dtype='object')


In [10]:
# Drop Column
# Columns removed by the DropColumns step. Each name is listed exactly once
# (the original list repeated "거래단위").
columns_to_drop = ['품목', '품종', '거래단위', '등급', '전일_x', "조회일자", "지역(시)", "지역(군)", "품목명", "전일_y"]

class DropColumns(BaseEstimator, TransformerMixin):
    """Transformer that removes a fixed set of columns from a DataFrame.

    Parameters
    ----------
    columns_to_drop : list
        Column labels to remove. Labels absent from the input are skipped
        because the drop is performed with ``errors='ignore'``.
    """

    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` without the configured columns."""
        unwanted = self.columns_to_drop
        return X.drop(columns=unwanted, errors='ignore')


# Instance used as the 'drop_columns' pipeline step.
drop_columns_transformer = DropColumns(columns_to_drop=columns_to_drop)


In [11]:
#Drop Rows
class DropRows(BaseEstimator, TransformerMixin):
    """Transformer that filters out rows where one column equals a given value.

    Parameters
    ----------
    column_name : str
        Column whose values are compared.
    value_to_drop : object
        Rows where ``column_name`` equals this value are removed.
    """

    def __init__(self, column_name, value_to_drop):
        self.column_name = column_name
        self.value_to_drop = value_to_drop

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return the rows of ``X`` whose ``column_name`` differs from ``value_to_drop``.

        Raises
        ------
        ValueError
            If ``X`` is not a pandas DataFrame.
        """
        # Guard first so the filtering below can assume DataFrame indexing.
        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input should be a pandas DataFrame.")
        keep_mask = X[self.column_name] != self.value_to_drop
        return X[keep_mask]


# Instance used as the 'drop_rows' pipeline step: removes rows graded "중품".
drop_rows_transformer = DropRows(column_name="등급", value_to_drop="중품")

In [12]:
# Custom transformer to convert 'DATE' to datetime
class ConvertDate(BaseEstimator, TransformerMixin):
    """Transformer that parses one column into pandas datetime values.

    Parameters
    ----------
    date_column : str
        Name of the column to parse with ``pd.to_datetime``.
    """

    def __init__(self, date_column):
        self.date_column = date_column

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``date_column`` converted to datetime.

        Values that cannot be parsed become ``NaT`` (``errors='coerce'``).
        The caller's DataFrame is left untouched.
        """
        # Work on a copy: assigning into the argument would silently rewrite
        # the frame passed to the pipeline (e.g. the raw data) as a side effect.
        X = X.copy()
        X[self.date_column] = pd.to_datetime(X[self.date_column], errors='coerce')
        return X


# Instance used as the 'convert_date' pipeline step.
convert_date_transformer = ConvertDate(date_column="DATE")

In [14]:
class DateTimeFeatures(BaseEstimator, TransformerMixin):
    """Transformer that replaces a datetime column with calendar features.

    Parameters
    ----------
    date_column : str
        Name of a datetime-typed column (accessed through ``.dt``).
    """

    def __init__(self, date_column):
        self.date_column = date_column

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return a new frame: date features first, then the remaining columns.

        The generated columns are ``year``, ``month``, ``day``, ``weekday``
        and ``is_weekend`` (weekday >= 5); ``date_column`` itself is removed.
        """
        # X is only read below; drop() and concat() both build new objects,
        # so the caller's frame is never modified.
        stamps = X[self.date_column].dt
        weekday = stamps.weekday

        date_features = pd.DataFrame({
            'year': stamps.year,
            'month': stamps.month,
            'day': stamps.day,
            'weekday': weekday,
            'is_weekend': weekday >= 5,
        })

        remainder = X.drop(columns=[self.date_column])
        return pd.concat([date_features, remainder], axis=1)


# Ready-made instance configured for the 'DATE' column.
date_features_transformer = DateTimeFeatures(date_column="DATE")

In [15]:
# Pipeline
# Step order matters: the date steps need the 'DATE' column, and rows are
# filtered on '등급' before the column-drop step removes that column.
pipeline = Pipeline(steps=[
    ('convert_date', convert_date_transformer),
    ('date_features', date_features_transformer),  # reuse the instance built above
    ('drop_rows', drop_rows_transformer),
    ('drop_columns', drop_columns_transformer),
])

# Final feature/target columns, in the order used downstream.
column_name = ['year','month', 'day','weekday', 'is_weekend', '평균가격', '평균 기온(°C)', '최고 기온(°C)', '최저 기온(°C)', '평균 강수량(mm)', '평균 일조시간(hr)', '평균 습도(%)', '평균 적설량(cm)', '평균 순간최대풍속(m/s)', '총반입량']

# Select by label rather than pd.DataFrame(..., columns=...): the constructor
# reindexes silently and would turn a missing or misspelled name into an
# all-NaN column, whereas label selection raises KeyError.
transformed_df = pipeline.fit_transform(data_raw)[column_name].reset_index(drop=True)

print(transformed_df.head(10), transformed_df.shape)

   year  month  day  weekday  is_weekend    평균가격  평균 기온(°C)  최고 기온(°C)  \
0  2024      8    1        3       False  30,133       29.9       33.2   
1  2024      7   31        2       False  29,888       29.4       33.3   
2  2024      7   30        1       False  29,965       28.6       31.9   
3  2024      7   29        0       False  30,094       27.4       27.8   
4  2024      7   26        4       False  30,152       28.7       32.7   
5  2024      7   25        3       False  30,871       28.8       32.2   
6  2024      7   24        2       False  31,421       28.1       30.9   
7  2024      7   23        1       False  31,544       26.4       27.3   
8  2024      7   22        0       False  31,396       26.6       27.9   
9  2024      7   19        4       False  30,342       27.0       30.8   

   최저 기온(°C)  평균 강수량(mm)  평균 일조시간(hr)  평균 습도(%)  평균 적설량(cm)  평균 순간최대풍속(m/s)  \
0       27.3         0.3          3.6      75.5         0.0             7.1   
1       27.0         0.0   

In [16]:
# Splitting
from sklearn.model_selection import train_test_split

# The target column holds strings with thousands separators (e.g. "30,133"),
# so the commas are stripped before casting to float.
y = (
    transformed_df["평균가격"]
    .str.replace(',', '')
    .astype(float)
)
# Everything except the target is used as a feature.
X = transformed_df.drop("평균가격", axis=1)

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

## 데이터 트레이닝

사용된 베이스 모델
- Random Forest Regressor
- Gradient Boosting Regressor
- XGB Regressor


In [17]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from scipy.stats import uniform, randint

# Define base models with hyperparameter search spaces
# Each entry is (name, unfitted estimator, parameter distributions).
# scipy's randint(low, high) excludes `high`.
# NOTE(review): scipy's uniform(loc, scale) spans [loc, loc + scale], so
# uniform(0.01, 0.1) draws learning rates from [0.01, 0.11] - confirm intended.
base_models = [
    (
        'rf',
        RandomForestRegressor(random_state=19),
        dict(n_estimators=randint(50, 300), max_depth=randint(3, 10)),
    ),
    (
        'gb',
        GradientBoostingRegressor(random_state=19),
        dict(n_estimators=randint(50, 300), learning_rate=uniform(0.01, 0.1)),
    ),
    (
        'xgb',
        XGBRegressor(random_state=19),
        dict(
            n_estimators=randint(50, 300),
            learning_rate=uniform(0.01, 0.1),
            max_depth=randint(3, 10),
        ),
    ),
]

# Base model optimization: run one randomized search per base model on the
# training split and keep the best estimator found for each.
optimized_base_models = []
for name, model, param_dist in base_models:
    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_dist,
        n_iter=20,
        cv=5,
        random_state=19,
    ).fit(X_train, y_train)  # fit() returns the fitted search object
    optimized_base_models.append((name, random_search.best_estimator_))

# Meta model: combines the base models' predictions.
meta_model = Ridge()

# Create and train the stacking model from the tuned base estimators.
stacking_model = StackingRegressor(
    cv=5,
    final_estimator=meta_model,
    estimators=optimized_base_models,
)
stacking_model.fit(X_train, y_train)

In [18]:
# Inference and evaluation on the held-out test split
y_pred = stacking_model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}", f"R-squared Score: {r2}", sep="\n")

Mean Squared Error: 2156941.854648771
R-squared Score: 0.7353248166575133
