<a href="https://colab.research.google.com/github/UnpackJungHo/XRSimulator_Osaka/blob/Learning_AI/LSTM_TEST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# (필요하다면) joblib 등을 통해 RandomForest 모델 저장
import joblib

# ---------------------------------------------
# 1. 여러 파일(2014~2023) 불러와서 전처리 후 병합
# ---------------------------------------------
def load_and_preprocess_data_multiple(folder_path='.', start_year=2014, end_year=2023):
    """Load the yearly ``{year}_weather.xlsx`` workbooks and merge them.

    For every year from ``start_year`` to ``end_year`` (inclusive) the file
    ``{year}_weather.xlsx`` is looked up inside ``folder_path``. Missing
    files are reported on stdout and skipped. Each file that exists is
    indexed by the timestamp parsed from its ``DateTime(YYYYMMDDHHMI)``
    column, has every ``-9`` replaced by NaN and filled by time-based
    interpolation, and loses its ``STN`` column when present.

    Returns:
        The concatenation of all yearly frames, sorted by datetime index.

    Raises:
        ValueError: If none of the expected files exists.
    """
    yearly_frames = []
    for year in range(start_year, end_year + 1):
        workbook = os.path.join(folder_path, f"{year}_weather.xlsx")
        if not os.path.exists(workbook):
            print(f"File not found: {workbook}")
            continue

        frame = pd.read_excel(workbook)

        # Parse the compact timestamp column and use it as the index.
        frame['DateTime'] = pd.to_datetime(
            frame['DateTime(YYYYMMDDHHMI)'], format='%Y%m%d%H%M'
        )
        frame = frame.set_index('DateTime')

        # -9 is treated as the missing-value marker in every column, then
        # the gaps are interpolated along the time axis.
        # NOTE(review): a genuine reading of exactly -9 is also blanked here.
        frame = frame.replace(-9, np.nan)
        frame = frame.infer_objects(copy=False)  # avoids the pandas FutureWarning
        frame = frame.interpolate(method='time')

        # Drop the column that is not needed downstream.
        frame = frame.drop(columns=['STN'], errors='ignore')

        yearly_frames.append(frame)

    if not yearly_frames:
        raise ValueError("No data files found in the specified range.")

    return pd.concat(yearly_frames).sort_index()

# ---------------------------------------------
# 2. 랜덤 포레스트 학습 (예: TA_next1 예측)
# ---------------------------------------------
def train_random_forest(df):
    """Fit a random forest that predicts the next row's ``TA`` value.

    The target ``TA_next1`` is ``TA`` shifted up by one row, so each sample
    pairs the current row's features with the following row's temperature
    (one hour ahead, assuming the rows are hourly -- TODO confirm). The
    target is built on a copy: the caller's ``df`` is not modified.

    The split is chronological (``shuffle=False``): the first 80% of the
    rows are used for fitting and the last 20% for validation. The
    validation MSE is printed.

    Args:
        df: Frame containing at least the columns TA, RN, SD_TOT, CA_TOT,
            WD, WS and HM.

    Returns:
        The fitted ``RandomForestRegressor``.
    """
    features = ['TA', 'RN', 'SD_TOT', 'CA_TOT', 'WD', 'WS', 'HM']

    # ``assign`` returns a new frame, so the helper column never leaks into
    # the DataFrame owned by the caller.
    df_rf = df.assign(TA_next1=df['TA'].shift(-1)).dropna(subset=['TA', 'TA_next1'])

    X = df_rf[features]
    y = df_rf['TA_next1']

    # Keep time order so validation data is strictly later than training data.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, shuffle=False
    )

    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)

    y_pred_rf = rf_model.predict(X_val)
    mse_rf = mean_squared_error(y_val, y_pred_rf)
    print("[RandomForest] Validation MSE:", mse_rf)

    return rf_model

# ---------------------------------------------
# 3. LSTM 학습을 위한 시계열 데이터셋 구성
# ---------------------------------------------
def create_lstm_dataset(dataset, target, seq_length=24):
    """Slice a feature table into overlapping windows with next-step labels.

    Window ``i`` holds rows ``i .. i + seq_length - 1`` of ``dataset`` and
    its label is ``target`` at position ``i + seq_length``.

    Args:
        dataset: Feature table (DataFrame or array-like), one row per step.
        target: Label sequence aligned row-for-row with ``dataset``.
        seq_length: Number of consecutive rows per window.

    Returns:
        ``(X, y)`` where ``X`` has shape
        ``(n_windows, seq_length, n_features)`` and ``y`` has ``n_windows``
        entries. When ``dataset`` has no more than ``seq_length`` rows,
        ``X`` is an empty array that still carries the trailing dimensions.

    Raises:
        IndexError: If ``target`` is shorter than ``dataset``.
    """
    # Convert once up front instead of slicing through pandas per window.
    features = np.asarray(dataset)
    labels = np.asarray(target)

    n_windows = max(len(features) - seq_length, 0)
    if n_windows == 0:
        empty_X = np.empty((0, seq_length) + features.shape[1:], dtype=features.dtype)
        return empty_X, labels[:0].copy()

    if len(labels) < len(features):
        raise IndexError("target must be at least as long as dataset")

    X = np.array([features[i:i + seq_length] for i in range(n_windows)])
    y = labels[seq_length:seq_length + n_windows].copy()
    return X, y

# ---------------------------------------------
# 4. LSTM 모델 학습
# ---------------------------------------------
def train_lstm(df, seq_length=24):
    """Train a two-layer LSTM that predicts the next ``TA`` value.

    Rows with a missing TA, WS or HM are dropped, the remaining rows are cut
    into windows of ``seq_length`` steps by ``create_lstm_dataset``, and the
    windows are split chronologically into 80% training / 20% validation.
    Training uses Adam with an MSE loss, runs for at most 50 epochs and stops
    early when ``val_loss`` has not improved for 5 epochs (the best weights
    are restored). The validation MSE is printed.

    Args:
        df: Frame containing at least the columns TA, WS and HM.
        seq_length: Number of past steps fed to the network per sample.

    Returns:
        The trained Keras model.
    """
    lstm_features = ['TA', 'WS', 'HM']
    clean = df.dropna(subset=lstm_features)

    X_lstm, y_lstm = create_lstm_dataset(clean[lstm_features], clean['TA'], seq_length)

    # Chronological split: the earliest 80% of the windows are for training.
    split_idx = int(len(X_lstm) * 0.8)
    X_train, y_train = X_lstm[:split_idx], y_lstm[:split_idx]
    X_val, y_val = X_lstm[split_idx:], y_lstm[split_idx:]

    # Stacked LSTM (64 -> 32 units) with dropout, ending in one regression unit.
    model = Sequential([
        LSTM(64, input_shape=(seq_length, len(lstm_features)), return_sequences=True),
        Dropout(0.2),
        LSTM(32, return_sequences=False),
        Dropout(0.2),
        Dense(1),
    ])
    model.compile(optimizer='adam', loss='mse')

    early_stop = EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True,
    )

    model.fit(
        X_train,
        y_train,
        epochs=50,
        batch_size=32,
        validation_data=(X_val, y_val),
        callbacks=[early_stop],
        verbose=1,
    )

    y_pred_val = model.predict(X_val)
    mse_lstm = mean_squared_error(y_val, y_pred_val)
    print("[LSTM] Validation MSE:", mse_lstm)

    return model

# ---------------------------------------------
# 5. 학습 실행 (메인)
# ---------------------------------------------
if __name__ == '__main__':
    # Load the 2014-2023 training data from the current directory.
    train_df = load_and_preprocess_data_multiple(folder_path='.', start_year=2014, end_year=2023)

    # Train the random forest.
    rf_model = train_random_forest(train_df)

    # Train the LSTM on 24-step input windows.
    lstm_model = train_lstm(train_df, seq_length=24)



In [18]:
# Save the trained models.
# (1) Save the random forest.
joblib.dump(rf_model, 'random_forest_model.pkl')
# (2) Save the LSTM model (written as a .keras file, not .h5).
lstm_model.save('lstm_model.keras')

print("Model training complete and saved.")


Model training complete and saved.


In [None]:
import pandas as pd
import numpy as np
import os

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

import tensorflow as tf
from tensorflow.keras.models import load_model

def create_lstm_dataset(dataset, target, seq_length=24):
    """Slice a feature table into overlapping windows with next-step labels.

    Window ``i`` holds rows ``i .. i + seq_length - 1`` of ``dataset`` and
    its label is ``target`` at position ``i + seq_length``.

    Args:
        dataset: Feature table (DataFrame or array-like), one row per step.
        target: Label sequence aligned row-for-row with ``dataset``.
        seq_length: Number of consecutive rows per window.

    Returns:
        ``(X, y)`` where ``X`` has shape
        ``(n_windows, seq_length, n_features)`` and ``y`` has ``n_windows``
        entries. When ``dataset`` has no more than ``seq_length`` rows,
        ``X`` is an empty array that still carries the trailing dimensions.

    Raises:
        IndexError: If ``target`` is shorter than ``dataset``.
    """
    # Convert once up front instead of slicing through pandas per window.
    features = np.asarray(dataset)
    labels = np.asarray(target)

    n_windows = max(len(features) - seq_length, 0)
    if n_windows == 0:
        empty_X = np.empty((0, seq_length) + features.shape[1:], dtype=features.dtype)
        return empty_X, labels[:0].copy()

    if len(labels) < len(features):
        raise IndexError("target must be at least as long as dataset")

    X = np.array([features[i:i + seq_length] for i in range(n_windows)])
    y = labels[seq_length:seq_length + n_windows].copy()
    return X, y

def load_and_preprocess_single(file_path='2024_weather.xlsx'):
    """Read one weather workbook and return it cleaned and time-indexed.

    The ``DateTime(YYYYMMDDHHMI)`` column is parsed into a ``DateTime``
    index, every ``-9`` is replaced by NaN and filled by time-based
    interpolation, the ``STN`` column is removed when present, and the rows
    are sorted by timestamp.

    Args:
        file_path: Path of the Excel file to read.

    Returns:
        The preprocessed DataFrame.
    """
    raw = pd.read_excel(file_path)
    timestamps = pd.to_datetime(raw['DateTime(YYYYMMDDHHMI)'], format='%Y%m%d%H%M')

    cleaned = (
        raw.assign(DateTime=timestamps)
        .set_index('DateTime')
        .replace(-9, np.nan)         # -9 is treated as the missing-value marker
        .infer_objects(copy=False)
        .interpolate(method='time')  # fill gaps along the time axis
    )

    if 'STN' in cleaned.columns:
        cleaned = cleaned.drop(columns=['STN'])

    return cleaned.sort_index()

def predict_and_compare_6hours(df, model, user_date_str, seq_length=24, horizon=6):
    """Forecast ``TA`` recursively from a given time and compare with observations.

    The input window is the last ``seq_length`` rows strictly before the
    requested time, so the first forecast targets the requested time itself
    and never sees the value it is scored against. Each later step feeds the
    previous forecast back in as ``TA`` and reuses the most recent ``WS`` and
    ``HM`` values unchanged.

    Args:
        df: Preprocessed frame with a datetime index and TA, WS, HM columns.
        model: Object whose ``predict`` takes an array of shape
            ``(1, seq_length, 3)`` and returns an array indexable as ``[0, 0]``.
        user_date_str: First forecast time, formatted ``'YYYY/MM/DD HH:MM'``.
        seq_length: Number of past rows fed to the model (default 24).
        horizon: Number of hourly steps to forecast (default 6).

    Returns:
        DataFrame with columns ``DateTime``, ``Predicted_TA``, ``Actual_TA``
        and ``Absolute_Error``; ``Actual_TA`` is NaN where ``df`` has no row
        for the forecast time.

    Raises:
        ValueError: If the date string cannot be parsed, a required column
            is missing, or fewer than ``seq_length`` rows precede the
            requested time.
    """
    user_date = pd.to_datetime(user_date_str, format='%Y/%m/%d %H:%M', errors='coerce')
    if pd.isna(user_date):
        raise ValueError(f"입력한 날짜/시간이 올바르지 않습니다: {user_date_str}")

    lstm_features = ['TA', 'WS', 'HM']
    if any(col not in df.columns for col in lstm_features):
        raise ValueError("필요한 컬럼(TA, WS, HM)이 데이터에 존재하지 않습니다.")

    # Half-open window [user_date - seq_length hours, user_date): the row at
    # user_date is the first forecast target and must not be part of the input.
    start_time = user_date - pd.Timedelta(hours=seq_length)
    in_window = (df.index >= start_time) & (df.index < user_date)
    recent_data = df.loc[in_window, lstm_features]

    if len(recent_data) < seq_length:
        raise ValueError(
            f"예측에 필요한 과거 데이터가 부족합니다: "
            f"{seq_length}개 필요, {len(recent_data)}개 존재"
        )

    recent_data = recent_data.infer_objects().interpolate(method='time')

    # Force a float array so writing a forecast into the window can never be
    # truncated by an integer dtype.
    input_seq = recent_data.to_numpy(dtype=float)[-seq_length:]

    prediction_result = []
    for hour_ahead in range(horizon):
        X_input = np.array([input_seq])
        ta_pred = model.predict(X_input)[0, 0]

        pred_time = user_date + pd.Timedelta(hours=hour_ahead)
        ta_actual = df.loc[pred_time, 'TA'] if pred_time in df.index else np.nan

        prediction_result.append([
            pred_time.strftime('%Y-%m-%d %H:%M'),
            ta_pred,
            ta_actual,
        ])

        # Slide the window one step: drop the oldest row and append a copy of
        # the newest row with TA (feature index 0) replaced by the forecast.
        next_row = input_seq[-1].copy()
        next_row[0] = ta_pred
        input_seq = np.vstack([input_seq[1:], next_row])

    result_df = pd.DataFrame(
        prediction_result, columns=['DateTime', 'Predicted_TA', 'Actual_TA']
    )
    result_df['Absolute_Error'] = (result_df['Predicted_TA'] - result_df['Actual_TA']).abs()

    return result_df

if __name__ == '__main__':
    # Load the previously trained models.
    # NOTE: joblib.load unpickles the file; only load model files you trust.
    rf_model = joblib.load('random_forest_model.pkl')  # loaded for optional use
    lstm_model = load_model('lstm_model.keras')

    # Load the 2024 data used for evaluation.
    test_df_2024 = load_and_preprocess_single('2024_weather.xlsx')

    print("날짜와 시간을 입력하세요. (예: 2024/03/12 00:00)")
    print("종료하려면 q 를 입력하세요.")

    while True:
        # Strip surrounding whitespace so inputs such as "q " or
        # " 2024/03/12 00:00" are handled as intended.
        user_input_time = input("Input date/time (YYYY/MM/DD HH:MM): ").strip()

        # Entering 'q' ends the loop.
        if user_input_time.lower() == 'q':
            print("종료합니다.")
            break

        try:
            result_6h = predict_and_compare_6hours(
                test_df_2024,
                lstm_model,
                user_input_time,
                seq_length=24,
                horizon=6
            )
            print("\n=== 6시간 예측 결과 비교 ===")
            print(result_6h)

            # Compute MSE / MAE / R^2 only over rows that have an observation.
            df_valid = result_6h.dropna(subset=['Actual_TA'])
            if len(df_valid) > 0:
                y_true = df_valid['Actual_TA'].values
                y_pred = df_valid['Predicted_TA'].values

                mse_val = mean_squared_error(y_true, y_pred)
                mae_val = mean_absolute_error(y_true, y_pred)
                # R^2 is undefined for fewer than two samples; report NaN
                # instead of triggering scikit-learn's undefined-metric warning.
                r2_val = r2_score(y_true, y_pred) if len(df_valid) >= 2 else float('nan')

                print("\n[평가지표]")
                print(f"MSE: {mse_val:.4f}")
                print(f"MAE: {mae_val:.4f}")
                print(f"R^2 : {r2_val:.4f}")
            else:
                print("\n실제 관측값이 없어 평가지표를 계산할 수 없습니다.")

        except Exception as e:
            # Top-level boundary of the interactive loop: report and re-prompt.
            print(f"오류: {e}")

        print("------------------------------------------------------")
