<a href="https://colab.research.google.com/github/UnpackJungHo/XRSimulator_Osaka/blob/Learning_AI/LSTM_TEST_250110.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import os
import math

# scikit-learn
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.ensemble import RandomForestRegressor

import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

import joblib

# ---------------------------------------------
# 1. 직접 계산하는 평가 지표 (MSE, RMSE, MAE)
# ---------------------------------------------
def manual_mse(y_true, y_pred):
    """
    MSE = 평균 제곱 오차
    """
    diff = y_true - y_pred
    return np.mean(diff ** 2)

def manual_rmse(y_true, y_pred):
    """
    RMSE = sqrt(MSE)
    """
    return math.sqrt(manual_mse(y_true, y_pred))

def manual_mae(y_true, y_pred):
    """
    MAE = 평균 절대 오차
    """
    diff = y_true - y_pred
    return np.mean(np.abs(diff))

# ---------------------------------------------
# 2. 데이터 불러오기 및 전처리
# ---------------------------------------------
def load_and_preprocess_data_multiple(folder_path='/content/drive/MyDrive/DataSet', start_year=2014, end_year=2023):
    """
    2014~2023년의 XLSX 파일들을 순차적으로 불러와 하나로 합친다.
    - 온도(TA)는 -9도도 정상 값으로 간주하므로, TA 이외 컬럼만 -9 -> NaN + interpolate
    - STN 컬럼 등 불필요시 제거
    """
    all_dfs = []
    for year in range(start_year, end_year + 1):
        file_name = f"{year}_weather.xlsx"
        file_path = os.path.join(folder_path, file_name)

        if os.path.exists(file_path):
            df_temp = pd.read_excel(file_path)
            # DateTime 인덱스로 설정
            df_temp['DateTime'] = pd.to_datetime(df_temp['DateTime(YYYYMMDDHHMI)'], format='%Y%m%d%H%M')
            df_temp.set_index('DateTime', inplace=True)

            # TA 제외 컬럼만 -9 -> NaN
            if 'TA' in df_temp.columns:
                non_ta_cols = [col for col in df_temp.columns if col != 'TA']
                df_excl_ta = df_temp[non_ta_cols].copy()
                df_excl_ta.replace(-9, np.nan, inplace=True)
                df_excl_ta = df_excl_ta.infer_objects(copy=False)
                df_excl_ta.interpolate(method='time', inplace=True)
                df_temp[non_ta_cols] = df_excl_ta[non_ta_cols]
            else:
                # TA 자체가 없으면 전체 처리
                df_temp.replace(-9, np.nan, inplace=True)
                df_temp = df_temp.infer_objects(copy=False)
                df_temp.interpolate(method='time', inplace=True)

            # STN 등 불필요 컬럼 제거
            if 'STN' in df_temp.columns:
                df_temp.drop(columns=['STN'], inplace=True)

            all_dfs.append(df_temp)
        else:
            print(f"File not found: {file_path}")

    if len(all_dfs) == 0:
        raise ValueError("No data files found in the specified range.")

    df_merged = pd.concat(all_dfs)
    df_merged.sort_index(inplace=True)
    return df_merged

# ---------------------------------------------
# 3. 랜덤 포레스트 (TimeSeriesSplit + GridSearchCV)
# ---------------------------------------------
def train_random_forest_tscv(df):
    """
    1) 1시간 뒤 온도(TA_next1) 예측.
    2) TimeSeriesSplit(n_splits=3) 로 시계열 교차검증.
    3) GridSearchCV로 하이퍼파라미터 탐색.
    4) 최적 모델로 전체 데이터 재학습 -> 최종 MSE/RMSE/MAE 출력.
    """
    # 1시간 뒤 온도를 타깃으로 생성
    df['TA_next1'] = df['TA'].shift(-1)
    df_rf = df.dropna(subset=['TA', 'TA_next1'])

    # 사용 Features
    features = ['TA', 'RN', 'SD_TOT', 'CA_TOT', 'WD', 'WS', 'HM']
    X = df_rf[features].values
    y = df_rf['TA_next1'].values

    # 시계열 분할
    tscv = TimeSeriesSplit(n_splits=3)

    # 파라미터 그리드
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
    }

    # GridSearchCV 설정
    rf = RandomForestRegressor(random_state=42)
    grid_search = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',  # -MSE
        cv=tscv,
        n_jobs=-1,
        verbose=2
    )

    # 하이퍼파라미터 탐색
    grid_search.fit(X, y)

    print("\n=== TimeSeriesSplit + GridSearchCV Result ===")
    print("Best Params:", grid_search.best_params_)
    print("Best Score (MSE):", -grid_search.best_score_)  # neg_mean_squared_error -> - 붙이면 실제 MSE

    # 최적 모델로 전체 데이터 재학습 (Optional but common)
    best_rf = grid_search.best_estimator_
    best_rf.fit(X, y)

    # 전체 데이터로 예측 -> 최종 성능
    y_pred_all = best_rf.predict(X)
    mse_final = manual_mse(y, y_pred_all)
    rmse_final = manual_rmse(y, y_pred_all)
    mae_final = manual_mae(y, y_pred_all)

    print("\n[RandomForest] Final Training on Entire Set")
    print(f"  MSE  : {mse_final:.4f}")
    print(f"  RMSE : {rmse_final:.4f}")
    print(f"  MAE  : {mae_final:.4f}\n")

    return best_rf

# ---------------------------------------------
# 4. LSTM 모델 학습 (기존 방식)
# ---------------------------------------------
def create_lstm_dataset(dataset, target, seq_length=24):
    Xs, ys = [], []
    for i in range(len(dataset) - seq_length):
        Xs.append(dataset.iloc[i:i+seq_length].values)
        ys.append(target.iloc[i+seq_length])
    return np.array(Xs), np.array(ys)

def train_lstm(df, seq_length=24):
    """
    1) TA, WS, HM -> 24시간 과거 -> 다음 시점 온도 예측
    2) train:val = 80:20 분할
    3) MSE/RMSE/MAE 출력
    """
    lstm_features = ['TA', 'WS', 'HM']
    df_lstm = df.dropna(subset=lstm_features)

    X_lstm, y_lstm = create_lstm_dataset(df_lstm[lstm_features], df_lstm['TA'], seq_length)

    split_idx = int(len(X_lstm) * 0.8)
    X_train, X_val = X_lstm[:split_idx], X_lstm[split_idx:]
    y_train, y_val = y_lstm[:split_idx], y_lstm[split_idx:]

    model = Sequential()
    model.add(LSTM(128, input_shape=(seq_length, len(lstm_features)), return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(64, return_sequences=False))
    model.add(Dropout(0.2))
    model.add(Dense(1))

    model.compile(optimizer='adam', loss='mse')
    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    model.fit(X_train, y_train,
              epochs=50,
              batch_size=32,
              validation_data=(X_val, y_val),
              callbacks=[early_stop],
              verbose=1)

    # 검증 세트 예측
    y_pred_val = model.predict(X_val).flatten()

    mse_lstm = manual_mse(y_val, y_pred_val)
    rmse_lstm = manual_rmse(y_val, y_pred_val)
    mae_lstm = manual_mae(y_val, y_pred_val)

    print("\n[LSTM] Validation Metrics")
    print(f"  MSE  : {mse_lstm:.4f}")
    print(f"  RMSE : {rmse_lstm:.4f}")
    print(f"  MAE  : {mae_lstm:.4f}\n")

    return model

# ---------------------------------------------
# 5. 전체 실행 흐름
# ---------------------------------------------
if __name__ == '__main__':
    # 1) 2014~2023년 데이터 불러오기
    train_df = load_and_preprocess_data_multiple(
        folder_path='/content/drive/MyDrive/DataSet',
        start_year=2014,
        end_year=2023
    )

    # 2) 랜덤 포레스트 (TimeSeriesSplit + GridSearchCV)
    rf_model = train_random_forest_tscv(train_df)

    # 3) LSTM 모델 학습 (기존 방식)
    lstm_model = train_lstm(train_df, seq_length=24)

    # 4) 모델 저장
    joblib.dump(rf_model, '/content/drive/MyDrive/DataSet/random_forest_model.pkl')
    lstm_model.save('/content/drive/MyDrive/DataSet/lstm_model.keras')

    print("Model training complete and saved.")


  df_excl_ta.interpolate(method='time', inplace=True)
  df_excl_ta.interpolate(method='time', inplace=True)
  df_excl_ta.interpolate(method='time', inplace=True)
  df_excl_ta.interpolate(method='time', inplace=True)
  df_excl_ta.interpolate(method='time', inplace=True)
  df_excl_ta.interpolate(method='time', inplace=True)
  df_excl_ta.interpolate(method='time', inplace=True)
  df_excl_ta.interpolate(method='time', inplace=True)
  df_excl_ta.interpolate(method='time', inplace=True)


Fitting 3 folds for each of 36 candidates, totalling 108 fits


  df_excl_ta.interpolate(method='time', inplace=True)



=== TimeSeriesSplit + GridSearchCV Result ===
Best Params: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Best Score (MSE): 1.5451779797043317

[RandomForest] Final Training on Entire Set
  MSE  : 0.9362
  RMSE : 0.9676
  MAE  : 0.6404

Epoch 1/50


  super().__init__(**kwargs)


[1m2194/2194[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 8ms/step - loss: 32.0701 - val_loss: 0.5539
Epoch 2/50
[1m2194/2194[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 8ms/step - loss: 2.8135 - val_loss: 0.3847
Epoch 3/50
[1m2194/2194[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 8ms/step - loss: 2.8987 - val_loss: 0.4622
Epoch 4/50
[1m2194/2194[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 8ms/step - loss: 2.3876 - val_loss: 0.3303
Epoch 5/50
[1m2194/2194[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 8ms/step - loss: 1.8617 - val_loss: 0.3815
Epoch 6/50
[1m 941/2194[0m [32m━━━━━━━━[0m[37m━━━━━━━━━━━━[0m [1m7s[0m 6ms/step - loss: 1.5408

In [27]:
   # 학습 완료 모델 저장
    # (1) 랜덤 포레스트 저장
joblib.dump(rf_model, 'random_forest_model.pkl')
    # (2) LSTM 저장 (h5 파일 등)
lstm_model.save('lstm_model.keras')

print("Model training complete and saved.")


Model training complete and saved.


In [None]:
import pandas as pd
import numpy as np
import os

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

import tensorflow as tf
from tensorflow.keras.models import load_model

# ---------------------------------------------
# 1. 평가지표 수동 계산 (MSE, MAE, R², RMSE 추가)
# ---------------------------------------------
def manual_mse(y_true, y_pred):
    """
    MSE = mean( (y_true - y_pred)^2 )
    """
    diff = y_true - y_pred
    return np.mean(diff**2)

def manual_rmse(y_true, y_pred):
    """
    RMSE = sqrt(MSE)
    """
    mse = manual_mse(y_true, y_pred)
    return math.sqrt(mse)

def manual_mae(y_true, y_pred):
    """
    MAE = mean( |y_true - y_pred| )
    """
    diff = y_true - y_pred
    return np.mean(np.abs(diff))

def manual_r2(y_true, y_pred):
    """
    R^2 = 1 - sum( (y_true - y_pred)^2 ) / sum( (y_true - y_mean)^2 )
    """
    diff_pred = y_true - y_pred
    diff_mean = y_true - np.mean(y_true)
    ss_res = np.sum(diff_pred**2)
    ss_tot = np.sum(diff_mean**2)
    return 1 - (ss_res / ss_tot)

# ---------------------------------------------
# 1. LSTM 입력 데이터셋 생성
# ---------------------------------------------
def create_lstm_dataset(dataset, target, seq_length=24):
    Xs, ys = [], []
    for i in range(len(dataset) - seq_length):
        Xs.append(dataset.iloc[i:i+seq_length].values)
        ys.append(target.iloc[i+seq_length])
    return np.array(Xs), np.array(ys)

# ---------------------------------------------
# 2. 데이터 불러오기 및 전처리
# ---------------------------------------------
def load_and_preprocess_single(file_path='/content/drive/MyDrive/DataSet/2024_weather.xlsx'):
    df = pd.read_excel(file_path)
    df['DateTime'] = pd.to_datetime(df['DateTime(YYYYMMDDHHMI)'], format='%Y%m%d%H%M')
    df.set_index('DateTime', inplace=True)

    if 'TA' in df.columns:
        non_ta_cols = [col for col in df.columns if col != 'TA']
        df_excl_ta = df[non_ta_cols].copy()

        df_excl_ta.replace(-9, np.nan, inplace=True)
        df_excl_ta = df_excl_ta.infer_objects(copy=False)
        df_excl_ta.interpolate(method='time', inplace=True)

        df[non_ta_cols] = df_excl_ta[non_ta_cols]
    else:
        df.replace(-9, np.nan, inplace=True)
        df = df.infer_objects(copy=False)
        df.interpolate(method='time', inplace=True)

    if 'STN' in df.columns:
        df.drop(columns=['STN'], inplace=True)

    df.sort_index(inplace=True)
    return df

# ---------------------------------------------
# 3. 6시간 예측 및 비교
# ---------------------------------------------
def predict_and_compare_6hours(df, model, user_date_str, seq_length=24, horizon=6):
    user_date = pd.to_datetime(user_date_str, format='%Y/%m/%d %H:%M', errors='coerce')
    if pd.isna(user_date):
        raise ValueError("입력한 날짜/시간이 올바르지 않습니다.")

    lstm_features = ['TA', 'WS', 'HM']
    start_time = user_date - pd.Timedelta(hours=seq_length)
    recent_data = df.loc[start_time:user_date].copy()

    recent_data = recent_data.infer_objects(copy=False)
    recent_data.interpolate(method='time', inplace=True)

    input_seq = recent_data[lstm_features].values[-seq_length:]

    prediction_result = []
    for hour_ahead in range(horizon):
        X_input = np.array([input_seq])
        ta_pred = model.predict(X_input, verbose=0)[0, 0]

        pred_time = user_date + pd.Timedelta(hours=hour_ahead)
        ta_actual = df.loc[pred_time, 'TA'] if pred_time in df.index else np.nan

        prediction_result.append([
            pred_time.strftime('%Y-%m-%d %H:%M'),
            ta_pred,
            ta_actual
        ])

        new_row = input_seq[-1].copy()
        new_row[0] = ta_pred
        input_seq = np.vstack([input_seq[1:], new_row])

    result_df = pd.DataFrame(prediction_result, columns=['DateTime', 'Predicted_TA', 'Actual_TA'])
    result_df['Absolute_Error'] = (result_df['Predicted_TA'] - result_df['Actual_TA']).abs()

    return result_df

# ---------------------------------------------
# 4. 메인 실행
# ---------------------------------------------
if __name__ == '__main__':
    # 미리 학습된 모델 불러오기
    rf_model = joblib.load('/content/drive/MyDrive/DataSet/random_forest_model.pkl')
    lstm_model = load_model('/content/drive/MyDrive/DataSet/lstm_model.keras')

    # 2024년 데이터 불러오기
    test_df_2024 = load_and_preprocess_single('/content/drive/MyDrive/DataSet/2024_weather.xlsx')

    # 사용자 입력 안내
    print("날짜와 시간을 입력하세요. (예: 2024/03/12 00:00)")
    print("종료하려면 q 를 입력하세요.")

    while True:
        user_input_time = input("Input date/time (YYYY/MM/DD HH:MM): ")

        if user_input_time.lower() == 'q':
            break

        try:
            result_6h = predict_and_compare_6hours(
                test_df_2024,
                lstm_model,
                user_input_time,
                seq_length=24,
                horizon=6
            )

            # 결과 출력
            print("\n=== 6시간 예측 결과 비교 ===")
            print(result_6h.to_string(index=False))

            # --- 평가지표 계산 ---
            df_valid = result_6h.dropna(subset=['Actual_TA'])
            if len(df_valid) > 0:
                y_true = df_valid['Actual_TA'].values
                y_pred = df_valid['Predicted_TA'].values

                mse_val = manual_mse(y_true, y_pred)
                rmse_val = manual_rmse(y_true, y_pred)
                mae_val = manual_mae(y_true, y_pred)
                r2_val = manual_r2(y_true, y_pred)

                print("\n[직접 구현한 평가지표]")
                print(f"MSE : {mse_val:.4f}")
                print(f"RMSE: {rmse_val:.4f}")
                print(f"MAE : {mae_val:.4f}")
                print(f"R^2 : {r2_val:.4f}")

        except Exception as e:
            print("입력 오류 또는 예측 실패. 다시 입력해 주세요.")
