<a href="https://colab.research.google.com/github/UnpackJungHo/XRSimulator_Osaka/blob/Learning_AI/XGBoost_Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from xgboost import XGBRegressor
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

class WeatherPredictor:
    def __init__(self):
        self.models = {}
        self.scalers = {}
        self.label_encoders = {}
        self.sequence_length = 12  # 24에서 12로 감소 (더 최근 데이터에 집중)

    def load_and_preprocess_data(self, training=True):
        """데이터 로드 및 전처리"""
        try:
            if training:
                # 학습용 데이터 (2020-2022)
                df_2020 = pd.read_excel('2020_weather.xlsx', dtype={'DateTime(YYYYMMDDHHMI)': str})
                df_2021 = pd.read_excel('2021_weather.xlsx', dtype={'DateTime(YYYYMMDDHHMI)': str})
                df_2022 = pd.read_excel('2022_weather.xlsx', dtype={'DateTime(YYYYMMDDHHMI)': str})
                print("학습 데이터 로드 완료 (2020-2022)")
                df = pd.concat([df_2020, df_2021, df_2022], ignore_index=True)
            else:
                # 검증용 데이터 (2023)
                df = pd.read_excel('2023_weather.xlsx', dtype={'DateTime(YYYYMMDDHHMI)': str})
                print("검증 데이터 로드 완료 (2023)")

            # 전처리 수행
            df = self._basic_preprocessing(df)
            df = self._create_advanced_features(df)
            df = self._handle_outliers(df)
            df = self._handle_missing_values(df)

            return df

        except Exception as e:
            print("데이터 로드 중 오류 발생:")
            print(e)
            raise e

    def _basic_preprocessing(self, df):
        """기본 전처리 작업"""
        try:
            # DateTime 변환
            df['DateTime(YYYYMMDDHHMI)'] = pd.to_datetime(df['DateTime(YYYYMMDDHHMI)'],
                                                        format='%Y%m%d%H%M')

            # 시간 관련 특성 추가
            df['hour'] = df['DateTime(YYYYMMDDHHMI)'].dt.hour
            df['month'] = df['DateTime(YYYYMMDDHHMI)'].dt.month
            df['day'] = df['DateTime(YYYYMMDDHHMI)'].dt.day
            df['dayofweek'] = df['DateTime(YYYYMMDDHHMI)'].dt.dayofweek
            df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)
            df['season'] = pd.cut(df['month'], bins=[0,3,6,9,12], labels=['winter','spring','summer','fall'])

            # 순환 시간 특성
            df['hour_sin'] = np.sin(2 * np.pi * df['hour']/24)
            df['hour_cos'] = np.cos(2 * np.pi * df['hour']/24)
            df['month_sin'] = np.sin(2 * np.pi * df['month']/12)
            df['month_cos'] = np.cos(2 * np.pi * df['month']/12)

            # 범주형 변수 인코딩
            categorical_columns = ['WW', 'CT', 'season']
            for col in categorical_columns:
                if df[col].dtype == 'object' or df[col].dtype.name == 'category':
                    if col not in self.label_encoders:
                        self.label_encoders[col] = LabelEncoder()
                        self.label_encoders[col].fit(df[col].astype(str))
                    else:
                        # 새로운 카테고리 처리
                        unique_labels = set(df[col].astype(str).unique())
                        known_labels = set(self.label_encoders[col].classes_)
                        new_labels = unique_labels - known_labels

                        if new_labels:
                            print(f"\n{col}에서 발견된 새로운 레이블: {new_labels}")
                            for new_label in new_labels:
                                df.loc[df[col].astype(str) == new_label, col] = self.label_encoders[col].classes_[0]

                    df[col] = self.label_encoders[col].transform(df[col].astype(str))

            return df

        except Exception as e:
            print("기본 전처리 중 오류 발생:")
            print(e)
            raise e

    def _create_advanced_features(self, df):
        """고급 특성 생성"""
        # 시계열 특성
        for col in ['TA', 'WS', 'HM', 'RN', 'WD']:
            # 이동평균 (더 짧은 시간 윈도우 추가)
            df[f'{col}_MA1'] = df[col].rolling(window=1).mean()
            df[f'{col}_MA2'] = df[col].rolling(window=2).mean()
            df[f'{col}_MA3'] = df[col].rolling(window=3).mean()
            df[f'{col}_MA6'] = df[col].rolling(window=6).mean()

            # 변화율 (더 세밀한 변화 감지)
            df[f'{col}_change'] = df[col].diff()
            df[f'{col}_change_rate'] = df[col].pct_change()
            df[f'{col}_change_acc'] = df[col].diff().diff()  # 변화 가속도

            # 시간별 통계
            df[f'{col}_hour_mean'] = df.groupby('hour')[col].transform('mean')
            df[f'{col}_hour_std'] = df.groupby('hour')[col].transform('std')

            # 시간대별 특성
            morning_mask = (df['hour'] >= 6) & (df['hour'] < 12)
            afternoon_mask = (df['hour'] >= 12) & (df['hour'] < 18)
            evening_mask = (df['hour'] >= 18) & (df['hour'] < 24)
            night_mask = (df['hour'] < 6)

            df[f'{col}_morning_avg'] = df[col][morning_mask].mean()
            df[f'{col}_afternoon_avg'] = df[col][afternoon_mask].mean()
            df[f'{col}_evening_avg'] = df[col][evening_mask].mean()
            df[f'{col}_night_avg'] = df[col][night_mask].mean()

            # 일교차 관련 특성
            df[f'{col}_daily_range'] = df.groupby(df['DateTime(YYYYMMDDHHMI)'].dt.date)[col].transform(lambda x: x.max() - x.min())
            df[f'{col}_daily_max'] = df.groupby(df['DateTime(YYYYMMDDHHMI)'].dt.date)[col].transform('max')
            df[f'{col}_daily_min'] = df.groupby(df['DateTime(YYYYMMDDHHMI)'].dt.date)[col].transform('min')

            # 계절성 특성
            df[f'{col}_season_mean'] = df.groupby('season')[col].transform('mean')
            df[f'{col}_season_std'] = df.groupby('season')[col].transform('std')

        return df

    def _handle_outliers(self, df):
        """이상치 처리"""
        numerical_columns = ['TA', 'WS', 'HM', 'RN']
        for col in numerical_columns:
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            df[col] = df[col].clip(lower_bound, upper_bound)
        return df

    def _handle_missing_values(self, df):
        """결측치 처리"""
        numerical_columns = ['TA', 'WS', 'HM', 'RN']
        for col in numerical_columns:
            # 시간대별 평균으로 결측치 처리
            df[col] = df.groupby('hour')[col].transform(
                lambda x: x.fillna(x.mean())
            )
        df.fillna(df.mean(), inplace=True)
        return df

    def create_sequences(self, data, target_col):
        """시계열 시퀀스를 특성으로 변환"""
        X, y = [], []
        for i in range(len(data) - self.sequence_length):
            sequence = data.iloc[i:i + self.sequence_length].values.flatten()
            X.append(sequence)
            y.append(data[target_col].iloc[i + self.sequence_length])
        return np.array(X), np.array(y)

    def train_models(self, df):
        """XGBoost 모델 학습"""
        target_columns = ['TA', 'RN', 'WS', 'HM', 'WD']
        feature_columns = [col for col in df.columns
                         if col not in ['DateTime(YYYYMMDDHHMI)'] + target_columns]

        for target in target_columns:
            print(f"\n{target} 모델 학습 중...")

            # 데이터 준비
            data = df[feature_columns + [target]].copy()
            data = data.replace([np.inf, -np.inf], np.nan)
            data = data.fillna(data.mean())

            # 데이터 스케일링
            scaler = MinMaxScaler()
            scaled_data = scaler.fit_transform(data)
            scaled_df = pd.DataFrame(scaled_data, columns=data.columns)
            self.scalers[target] = scaler

            # 시퀀스 생성
            X, y = self.create_sequences(scaled_df, target)

            # 학습/검증 분할
            train_size = int(len(X) * 0.8)
            X_train, X_test = X[:train_size], X[train_size:]
            y_train, y_test = y[:train_size], y[train_size:]

            # XGBoost 모델 생성 및 학습
            model = XGBRegressor(
                n_estimators=200,
                max_depth=8,
                learning_rate=0.05,
                min_child_weight=3,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42,
                n_jobs=-1,
                eval_metric='rmse',
                early_stopping_rounds=20
            )

            # eval_set 설정
            eval_set = [(X_test, y_test)]

            # 모델 학습
            model.fit(
                X_train,
                y_train,
                eval_set=eval_set,
                verbose=True
            )

            self.models[target] = model

            # 성능 평가
            y_pred = model.predict(X_test)
            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)

            print(f"{target} 모델 성능:")
            print(f"MSE: {mse:.4f}")
            print(f"R2 Score: {r2:.4f}")

    def predict_and_evaluate(self, input_time, validation_data):
        """예측 수행 및 평가"""
        input_datetime = pd.to_datetime(input_time, format='%Y/%m/%d/%H:%M')

        # 입력 시퀀스 준비
        past_data = validation_data[
            validation_data['DateTime(YYYYMMDDHHMI)'] <= input_datetime
        ].tail(self.sequence_length)

        if len(past_data) < self.sequence_length:
            print(f"예측을 위해 최소 {self.sequence_length}시간의 데이터가 필요합니다.")
            return None

        predictions = []
        current_sequence = past_data.copy()

        # 예측할 변수들
        target_columns = ['TA', 'RN', 'WS', 'HM', 'WD']
        feature_columns = [col for col in current_sequence.columns
                        if col not in ['DateTime(YYYYMMDDHHMI)'] + target_columns]

        for hour in range(6):
            next_time = input_datetime + timedelta(hours=hour)

            hour_prediction = {
                'DateTime': next_time
            }

            # 각 타겟 변수에 대한 예측
            for target in target_columns:
                # 입력 데이터 준비
                data = current_sequence[feature_columns + [target]].copy()
                data = data.replace([np.inf, -np.inf], np.nan)
                data = data.fillna(data.mean())

                # 스케일링
                scaler = self.scalers[target]
                scaled_sequence = scaler.transform(data)

                # 시퀀스를 1차원으로 평탄화
                X = scaled_sequence.flatten().reshape(1, -1)

                # 예측
                scaled_pred = self.models[target].predict(X)[0]

                # 역스케일링을 위한 더미 데이터 생성
                dummy_data = np.zeros((1, data.shape[1]))
                dummy_data[0, -1] = scaled_pred
                pred = scaler.inverse_transform(dummy_data)[0, -1]

                # 시간대별 보정
                hour_of_day = (next_time.hour)
                if 6 <= hour_of_day < 12:  # 아침
                    pred = pred * 1.1  # 아침 시간대 가중치
                elif 12 <= hour_of_day < 18:  # 오후
                    pred = pred * 1.05  # 오후 시간대 가중치
                elif 18 <= hour_of_day < 24:  # 저녁
                    pred = pred * 0.95  # 저녁 시간대 가중치
                else:  # 새벽
                    pred = pred * 0.9  # 새벽 시간대 가중치

                hour_prediction[f'{target}_pred'] = pred

                # 실제값 찾기
                actual_row = validation_data[
                    validation_data['DateTime(YYYYMMDDHHMI)'] == next_time
                ]
                if not actual_row.empty:
                    hour_prediction[f'{target}_actual'] = actual_row[target].iloc[0]
                else:
                    hour_prediction[f'{target}_actual'] = None

            predictions.append(hour_prediction)

            # 시퀀스 업데이트 시 가중치 적용
            new_row = current_sequence.iloc[-1:].copy()
            for target in target_columns:
                if hour > 0:
                    weight = max(1 - (hour * 0.1), 0.5)  # 시간이 지날수록 가중치 감소
                    prev_actual = validation_data[
                        validation_data['DateTime(YYYYMMDDHHMI)'] == (next_time - timedelta(hours=1))
                    ][target].iloc[0] if not validation_data[
                        validation_data['DateTime(YYYYMMDDHHMI)'] == (next_time - timedelta(hours=1))
                    ].empty else hour_prediction[f'{target}_pred']
                    new_row[target] = (weight * prev_actual + (1-weight) * hour_prediction[f'{target}_pred'])
                else:
                    new_row[target] = hour_prediction[f'{target}_pred']

            current_sequence = pd.concat([current_sequence[1:], new_row])

        results_df = pd.DataFrame(predictions)

        # 예측 성능 평가
        print("\n예측 성능 평가:")
        for target in target_columns:
            mask = results_df[f'{target}_actual'].notna()
            if mask.any():
                mae = mean_absolute_error(
                    results_df[mask][f'{target}_actual'],
                    results_df[mask][f'{target}_pred']
                )
                mse = mean_squared_error(
                    results_df[mask][f'{target}_actual'],
                    results_df[mask][f'{target}_pred']
                )
                r2 = r2_score(
                    results_df[mask][f'{target}_actual'],
                    results_df[mask][f'{target}_pred']
                )

                print(f"\n{target} 예측 성능:")
                print(f"MAE: {mae:.4f}")
                print(f"MSE: {mse:.4f}")
                print(f"R2 Score: {r2:.4f}")

        return results_df

def main():
    # 예측기 인스턴스 생성
    predictor = WeatherPredictor()

    # 학습 데이터 로드 및 전처리
    print("학습 데이터 로드 및 전처리 중...")
    train_df = predictor.load_and_preprocess_data(training=True)

    # 모델 학습
    print("\n모델 학습 시작...")
    predictor.train_models(train_df)

    # 검증 데이터 로드
    print("\n검증 데이터 로드 중...")
    validation_df = predictor.load_and_preprocess_data(training=False)

    while True:
        # 사용자로부터 날짜 입력 받기
        input_time = input("\n예측할 날짜와 시간을 입력하세요 (형식: YYYY/MM/DD/HH:MM, 종료는 'q'): ")

        if input_time.lower() == 'q':
            break

        try:
            # 예측 수행 및 평가
            results = predictor.predict_and_evaluate(input_time, validation_df)

            if results is not None:
                print("\n예측 결과:")
                pd.set_option('display.max_columns', None)
                print(results[['DateTime'] +
                            [col for col in results.columns if 'pred' in col or 'actual' in col]])

        except Exception as e:
            print(f"오류 발생: {str(e)}")
            print("올바른 형식으로 다시 입력해주세요.")

if __name__ == "__main__":
    main()

학습 데이터 로드 및 전처리 중...
학습 데이터 로드 완료 (2020-2022)

모델 학습 시작...

TA 모델 학습 중...
[0]	validation_0-rmse:0.20381
[1]	validation_0-rmse:0.19374
[2]	validation_0-rmse:0.18416
[3]	validation_0-rmse:0.17506
[4]	validation_0-rmse:0.16636
[5]	validation_0-rmse:0.15814
[6]	validation_0-rmse:0.15031
[7]	validation_0-rmse:0.14288
[8]	validation_0-rmse:0.13584
[9]	validation_0-rmse:0.12917
[10]	validation_0-rmse:0.12280
[11]	validation_0-rmse:0.11673
[12]	validation_0-rmse:0.11100
[13]	validation_0-rmse:0.10554
[14]	validation_0-rmse:0.10034
[15]	validation_0-rmse:0.09539
[16]	validation_0-rmse:0.09068
[17]	validation_0-rmse:0.08623
[18]	validation_0-rmse:0.08202
[19]	validation_0-rmse:0.07800
[20]	validation_0-rmse:0.07418
[21]	validation_0-rmse:0.07057
[22]	validation_0-rmse:0.06713
[23]	validation_0-rmse:0.06387
[24]	validation_0-rmse:0.06078
[25]	validation_0-rmse:0.05783
[26]	validation_0-rmse:0.05500
[27]	validation_0-rmse:0.05234
[28]	validation_0-rmse:0.04983
[29]	validation_0-rmse:0.04743
[30]	v

KeyboardInterrupt: Interrupted by user