<a href="https://colab.research.google.com/github/UnpackJungHo/XRSimulator_Osaka/blob/Learning_AI/LSTM_TEST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# (필요하다면) joblib 등을 통해 RandomForest 모델 저장
import joblib

# ---------------------------------------------
# 1. 여러 파일(2014~2023) 불러와서 전처리 후 병합
# ---------------------------------------------
def load_and_preprocess_data_multiple(folder_path='.', start_year=2014, end_year=2023):
    """Load, clean and merge the yearly ``{year}_weather.xlsx`` files.

    For every year in ``start_year..end_year`` (inclusive) the file
    ``<folder_path>/<year>_weather.xlsx`` is read if it exists; missing files
    are reported with a printed message and skipped.

    Cleaning applied to each file:
      * the ``DateTime(YYYYMMDDHHMI)`` column is parsed into a ``DateTime``
        index (the original column is kept),
      * rows are sorted chronologically,
      * every ``-9`` is treated as a missing value and filled by
        time-weighted interpolation,
      * the ``STN`` column is dropped when present.

    Args:
        folder_path: Directory that contains the yearly Excel files.
        start_year: First year to load (inclusive).
        end_year: Last year to load (inclusive).

    Returns:
        A single DataFrame with all loaded years, sorted by its DateTime index.

    Raises:
        ValueError: If none of the expected files exists.
    """
    all_dfs = []
    for year in range(start_year, end_year + 1):
        file_path = os.path.join(folder_path, f"{year}_weather.xlsx")
        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            continue

        df_temp = pd.read_excel(file_path)
        df_temp['DateTime'] = pd.to_datetime(
            df_temp['DateTime(YYYYMMDDHHMI)'], format='%Y%m%d%H%M'
        )
        # Sort BEFORE interpolating: method='time' uses the index values as
        # the x-axis, so the rows have to be in chronological order.
        df_temp = df_temp.set_index('DateTime').sort_index()

        # -9 marks a missing observation; infer_objects() restores numeric
        # dtypes after the replacement (avoids a pandas FutureWarning).
        # NOTE(review): a genuine reading of exactly -9 is also treated as
        # missing by this rule - confirm that is acceptable for TA.
        df_temp = df_temp.replace(-9, np.nan).infer_objects(copy=False)
        df_temp = df_temp.interpolate(method='time')

        if 'STN' in df_temp.columns:
            df_temp = df_temp.drop(columns=['STN'])

        all_dfs.append(df_temp)

    if not all_dfs:
        raise ValueError("No data files found in the specified range.")

    return pd.concat(all_dfs).sort_index()

# ---------------------------------------------
# 2. 랜덤 포레스트 학습 (예: TA_next1 예측)
# ---------------------------------------------
def train_random_forest(df):
    """Fit a random forest that predicts the next row's ``TA`` value.

    The target ``TA_next1`` is ``TA`` shifted up by one row (one hour ahead
    if the rows are hourly - the function itself does not check the
    spacing). Rows where either ``TA`` or the target is missing are excluded.
    The last 20 % of the rows, in chronological order (no shuffling), are
    held out for validation and the validation MSE is printed.

    Unlike the previous version, the input frame is left untouched: no
    ``TA_next1`` column is added to the caller's DataFrame.

    Args:
        df: DataFrame containing the columns ``TA``, ``RN``, ``SD_TOT``,
            ``CA_TOT``, ``WD``, ``WS`` and ``HM``.

    Returns:
        The fitted ``RandomForestRegressor``.
    """
    features = ['TA', 'RN', 'SD_TOT', 'CA_TOT', 'WD', 'WS', 'HM']

    # Build the target as a separate Series instead of writing a new column
    # into `df`, so the caller's frame is not mutated.
    target = df['TA'].shift(-1).rename('TA_next1')
    usable = df['TA'].notna() & target.notna()

    X = df.loc[usable, features]
    y = target[usable]

    # shuffle=False keeps the time order: train on the past, validate on the
    # most recent 20 %.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, shuffle=False
    )

    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)

    y_pred_rf = rf_model.predict(X_val)
    mse_rf = mean_squared_error(y_val, y_pred_rf)
    print("[RandomForest] Validation MSE:", mse_rf)

    return rf_model

# ---------------------------------------------
# 3. LSTM 학습을 위한 시계열 데이터셋 구성
# ---------------------------------------------
def create_lstm_dataset(dataset, target, seq_length=24):
    """Cut a feature frame into sliding windows paired with next-step labels.

    Window ``k`` consists of rows ``k`` .. ``k + seq_length - 1`` of
    ``dataset``; its label is the value of ``target`` at position
    ``k + seq_length``.

    Args:
        dataset: DataFrame of input features, indexed positionally.
        target: Series aligned with ``dataset`` that supplies the labels.
        seq_length: Number of consecutive rows in each window.

    Returns:
        ``(windows, labels)`` as NumPy arrays. Both are empty when
        ``dataset`` has no more than ``seq_length`` rows.
    """
    n_windows = len(dataset) - seq_length
    windows = [
        dataset.iloc[start:start + seq_length].values
        for start in range(n_windows)
    ]
    labels = [target.iloc[start + seq_length] for start in range(n_windows)]
    return np.array(windows), np.array(labels)

# ---------------------------------------------
# 4. LSTM 모델 학습
# ---------------------------------------------
def train_lstm(df, seq_length=24):
    """Train a two-layer LSTM that predicts the next ``TA`` value.

    Each sample is a window of ``seq_length`` consecutive rows of the
    features ``TA``, ``WS`` and ``HM``; the label is ``TA`` of the row right
    after the window. Rows with a missing feature are dropped first. The
    first 80 % of the windows are used for training and the remaining 20 %
    for validation (chronological split). Training stops early when
    ``val_loss`` has not improved for 5 epochs, and the best weights are
    restored. The validation MSE is printed.

    Args:
        df: DataFrame containing the columns ``TA``, ``WS`` and ``HM``.
        seq_length: Number of rows in each input window.

    Returns:
        The trained Keras ``Sequential`` model.

    Raises:
        ValueError: If ``df`` has too few usable rows to build even one
            window of length ``seq_length``.
    """
    lstm_features = ['TA', 'WS', 'HM']
    df_lstm = df.dropna(subset=lstm_features)

    X_lstm, y_lstm = create_lstm_dataset(
        df_lstm[lstm_features],
        df_lstm['TA'],
        seq_length
    )

    # Fail with a clear message instead of an obscure shape error inside Keras.
    if len(X_lstm) == 0:
        raise ValueError("Not enough data to build LSTM sequences for the given seq_length.")

    split_idx = int(len(X_lstm) * 0.8)
    X_train, X_val = X_lstm[:split_idx], X_lstm[split_idx:]
    y_train, y_val = y_lstm[:split_idx], y_lstm[split_idx:]

    model = Sequential()
    # Declare the input shape with an explicit Input object; passing
    # `input_shape=` to the first layer makes recent Keras emit a warning.
    model.add(tf.keras.Input(shape=(seq_length, len(lstm_features))))
    model.add(LSTM(64, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(32, return_sequences=False))
    model.add(Dropout(0.2))
    model.add(Dense(1))

    model.compile(optimizer='adam', loss='mse')

    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    model.fit(
        X_train, y_train,
        epochs=50,
        batch_size=32,
        validation_data=(X_val, y_val),
        callbacks=[early_stop],
        verbose=1
    )

    y_pred_val = model.predict(X_val)
    mse_lstm = mean_squared_error(y_val, y_pred_val)
    print("[LSTM] Validation MSE:", mse_lstm)

    return model

# ---------------------------------------------
# 5. 학습 실행 (메인)
# ---------------------------------------------
if __name__ == '__main__':
    # Load the 2014-2023 training data
    train_df = load_and_preprocess_data_multiple(folder_path='.', start_year=2014, end_year=2023)

    # Train the random forest
    rf_model = train_random_forest(train_df)

    # Train the LSTM
    lstm_model = train_lstm(train_df, seq_length=24)

    # Save the trained models
    # (1) Save the random forest
    joblib.dump(rf_model, 'random_forest_model.pkl')
    # (2) Save the LSTM (as an h5 file)
    lstm_model.save('lstm_model.h5')

    print("Model training complete and saved.")


  df_temp.interpolate(method='time', inplace=True)
  df_temp.interpolate(method='time', inplace=True)
  df_temp.interpolate(method='time', inplace=True)
  df_temp.interpolate(method='time', inplace=True)
  df_temp.interpolate(method='time', inplace=True)
  df_temp.interpolate(method='time', inplace=True)
  df_temp.interpolate(method='time', inplace=True)
  df_temp.interpolate(method='time', inplace=True)
  df_temp.interpolate(method='time', inplace=True)
  df_temp.interpolate(method='time', inplace=True)
  df.interpolate(method='time', inplace=True)


[RandomForest] Validation MSE: 0.8557719662493466


  super().__init__(**kwargs)


Epoch 1/50
[1m2194/2194[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 8ms/step - loss: 54.5089 - val_loss: 1.6446
Epoch 2/50
[1m2194/2194[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 7ms/step - loss: 4.0651 - val_loss: 0.6474
Epoch 3/50
[1m2194/2194[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 8ms/step - loss: 3.6810 - val_loss: 0.4737
Epoch 4/50
[1m2194/2194[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 8ms/step - loss: 3.8827 - val_loss: 0.4685
Epoch 5/50


KeyboardInterrupt: 

In [None]:
import pandas as pd
import numpy as np
import os

from sklearn.metrics import mean_squared_error
import joblib

import tensorflow as tf
from tensorflow.keras.models import load_model

# 여기서도 create_lstm_dataset 등 몇 개의 함수가 필요하므로
# 동일 함수 코드를 복사하거나, 별도의 common 모듈로 만들어 import할 수 있다.
# 여기서는 간단히 복사/붙여넣기 예시.

def create_lstm_dataset(dataset, target, seq_length=24):
    """Build (window, next-value) training pairs from a feature frame.

    For every start position ``begin`` the window is
    ``dataset.iloc[begin:begin + seq_length]`` and the matching label is
    ``target.iloc[begin + seq_length]``.

    Args:
        dataset: DataFrame holding the input features.
        target: Series aligned with ``dataset`` from which labels are taken.
        seq_length: Length of each window in rows.

    Returns:
        Two NumPy arrays ``(windows, labels)``; both are empty if ``dataset``
        has ``seq_length`` rows or fewer.
    """
    pairs = [
        (dataset.iloc[begin:begin + seq_length].values, target.iloc[begin + seq_length])
        for begin in range(len(dataset) - seq_length)
    ]
    windows = [window for window, _ in pairs]
    labels = [label for _, label in pairs]
    return np.array(windows), np.array(labels)

# ---------------------------------------------
# 테스트용 데이터(2024) 불러오기
# ---------------------------------------------
def load_and_preprocess_single(file_path='2024_weather.xlsx'):
    """Load one weather Excel file and apply the standard cleaning steps.

    Steps:
      * parse ``DateTime(YYYYMMDDHHMI)`` into a ``DateTime`` index (the
        original column is kept),
      * sort the rows chronologically,
      * treat every ``-9`` as a missing value and fill it by time-weighted
        interpolation,
      * drop the ``STN`` column when present.

    Args:
        file_path: Path of the Excel file to read.

    Returns:
        The cleaned DataFrame, sorted by its DateTime index.
    """
    df = pd.read_excel(file_path)
    df['DateTime'] = pd.to_datetime(df['DateTime(YYYYMMDDHHMI)'], format='%Y%m%d%H%M')
    # Sort BEFORE interpolating: method='time' uses the index values as the
    # x-axis, so the rows have to be in chronological order.
    df = df.set_index('DateTime').sort_index()

    # -9 marks a missing observation; infer_objects() restores numeric dtypes
    # after the replacement (avoids a pandas FutureWarning).
    # NOTE(review): a genuine reading of exactly -9 is also treated as
    # missing by this rule - confirm that is acceptable for TA.
    df = df.replace(-9, np.nan).infer_objects(copy=False)
    df = df.interpolate(method='time')

    if 'STN' in df.columns:
        df = df.drop(columns=['STN'])

    return df

# ---------------------------------------------
# 테스트(2024) 데이터에 대한 예측 및 실제 비교
# ---------------------------------------------
def evaluate_on_test_data(df_test, lstm_model, seq_length=24):
    """Score the LSTM on a held-out frame and pair predictions with actuals.

    Rows missing any of ``TA``, ``WS`` or ``HM`` are dropped, the remainder
    is cut into windows of ``seq_length`` rows, and the model predicts the
    ``TA`` value that follows each window. The resulting MSE is printed.

    Args:
        df_test: DataFrame with at least the columns ``TA``, ``WS``, ``HM``.
        lstm_model: Object exposing ``predict`` for the windowed input.
        seq_length: Number of rows per input window.

    Returns:
        ``(compare_df, mse_test)`` where ``compare_df`` has the columns
        ``Actual_TA`` and ``Predicted_TA`` indexed by the timestamp of each
        predicted row.

    Raises:
        ValueError: If no window of length ``seq_length`` can be built.
    """
    feature_cols = ['TA', 'WS', 'HM']
    usable = df_test.dropna(subset=feature_cols).copy()

    windows, actual = create_lstm_dataset(usable[feature_cols], usable['TA'], seq_length)
    if not len(windows):
        raise ValueError("Not enough data in test set for the given seq_length.")

    predicted = lstm_model.predict(windows)
    mse_test = mean_squared_error(actual, predicted)
    print("[LSTM] Test MSE on 2024:", mse_test)

    # The first seq_length rows only serve as history, so the labels start at
    # position seq_length of the cleaned frame.
    compare_df = pd.DataFrame(
        {'Actual_TA': actual, 'Predicted_TA': predicted.flatten()},
        index=usable.index[seq_length:],
    )
    return compare_df, mse_test

# ---------------------------------------------
# 특정 날짜로 24시간 예측
# ---------------------------------------------
def predict_for_date(df, model, target_date_str='2025/02/23', seq_length=24, horizon=24):
    """Recursively forecast ``TA`` hour by hour, starting at a target time.

    The model input is the last ``seq_length`` rows of ``TA``, ``WS`` and
    ``HM`` observed strictly before the target timestamp, so the first
    forecast is for the target timestamp itself. After each step the
    predicted ``TA`` is appended to the window (``WS`` and ``HM`` are carried
    over from the last row) and the oldest row is dropped.

    Forecast timestamps advance in one-hour steps, which assumes hourly rows
    in ``df``.

    Args:
        df: DataFrame with a DateTime index and the columns ``TA``, ``WS``,
            ``HM``.
        model: Object whose ``predict`` accepts an array of shape
            ``(1, seq_length, 3)`` and returns a 2-D array holding the
            prediction at ``[0, 0]``.
        target_date_str: Timestamp of the first forecast, in any format
            ``pandas.to_datetime`` understands.
        seq_length: Number of history rows fed to the model.
        horizon: Number of hourly steps to forecast.

    Returns:
        DataFrame with one row per forecast step, in the same column layout
        as the source files; columns that are not predicted hold ``-9``.

    Raises:
        ValueError: If fewer than ``seq_length`` rows are available in the
            ``seq_length`` hours before the target timestamp.
    """
    target_date = pd.to_datetime(target_date_str)
    lstm_features = ['TA', 'WS', 'HM']

    start_time = target_date - pd.Timedelta(hours=seq_length)
    # Half-open window [start_time, target_date): the observation AT
    # target_date must not be part of the input, otherwise the first forecast
    # would really be for target_date + 1h while being labelled target_date.
    in_window = (df.index >= start_time) & (df.index < target_date)
    recent_data = df.loc[in_window, lstm_features].sort_index()

    if len(recent_data) < seq_length:
        raise ValueError(
            f"Need {seq_length} rows of history before {target_date}, "
            f"found {len(recent_data)}."
        )

    recent_data = recent_data.infer_objects().interpolate(method='time')

    # Force a float window so that writing a prediction into it is never
    # truncated when the source columns happen to be integer-typed.
    input_seq = recent_data.to_numpy(dtype=float)[-seq_length:]

    prediction_result = []
    for hour in range(horizon):
        X_input = np.array([input_seq])
        ta_pred = model.predict(X_input)[0, 0]

        pred_time = target_date + pd.Timedelta(hours=hour)
        prediction_result.append([
            pred_time.strftime('%Y%m%d%H%M'),
            108,   # STN (assumed station id)
            ta_pred,
            -9, -9, -9, -9, -9, -9
        ])

        # Slide the window: reuse the last WS/HM, replace TA by the forecast.
        new_row = input_seq[-1].copy()
        new_row[0] = ta_pred  # TA
        input_seq = np.vstack([input_seq[1:], new_row])

    columns = [
        'DateTime(YYYYMMDDHHMI)',
        'STN',
        'TA',
        'RN',
        'SD_TOT',
        'CA_TOT',
        'WD',
        'WS',
        'HM'
    ]
    result_df = pd.DataFrame(prediction_result, columns=columns)
    return result_df

# ---------------------------------------------
# 메인 실행: 모델 불러오기 -> 평가 & 예측
# ---------------------------------------------
if __name__ == '__main__':
    # 1) Load the saved models
    rf_model = joblib.load('random_forest_model.pkl')  # (if needed)
    # compile=False: the LSTM is only used for inference here, so its
    # training configuration (loss/optimizer) does not need to be restored.
    # This also sidesteps deserialization problems that some Keras versions
    # have with a string loss name stored in a legacy HDF5 file.
    lstm_model = load_model('lstm_model.h5', compile=False)

    # 2) Load the 2024 data
    test_df_2024 = load_and_preprocess_single('2024_weather.xlsx')

    # 3) Compare LSTM predictions with the actual values
    compare_df_2024, mse_2024 = evaluate_on_test_data(test_df_2024, lstm_model, seq_length=24)
    print(compare_df_2024.head(10))  # preview only the first rows

    # 4) (Optional) 24-hour forecast for a specific date
    pred_sample = predict_for_date(test_df_2024, lstm_model, '2024/01/05', seq_length=24)
    print(pred_sample)
