# 기본 세팅

In [1]:
import os
from pathlib import Path
from google.colab import auth
auth.authenticate_user()

from google.colab import drive

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
market_data = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/lstm-rnn/market_data.csv', skiprows=1, header=None, encoding='UTF-8')
market_data.columns = ['date', 'kospi', 'oil_price', 'interest_rate', 'price_index', 'cny_krw', 'usd_krw', 'jpy_krw']

Unnamed: 0,date,kospi,oil_price,interest_rate,price_index,cny_krw,usd_krw,jpy_krw
0,2020-01-01,2119.01,1568.44,1.25,100.09,168.01,1164.28,1065.25
1,2020-02-01,1987.01,1545.29,1.25,100.16,170.48,1193.79,1084.95
2,2020-03-01,1754.64,1469.06,0.75,99.94,173.78,1220.09,1134.75
3,2020-04-01,1947.56,1323.66,0.75,99.5,172.92,1225.23,1135.31
4,2020-05-01,2029.6,1255.08,0.5,99.44,172.52,1228.67,1145.65


In [4]:
etf_data = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/lstm-rnn/ETF_Monthly_price.csv', skiprows=1, header=None, encoding='UTF-8')
etf_data.columns = ['id', 'company', 'name', 'date', 'open', 'close', 'return', 'volume']
etf_data['return'] = pd.to_numeric(etf_data['return'], errors='coerce')

Unnamed: 0,id,company,name,date,open,close,return,volume
0,001_20250101,삼성자산운용,KODEX 200,2025-01-01,32060,32530,1.47,143700267
1,001_20250201,삼성자산운용,KODEX 200,2025-02-01,32530,33695,3.58,166118306
2,001_20250301,삼성자산운용,KODEX 200,2025-03-01,33695,34350,1.94,145686571
3,001_20250401,삼성자산운용,KODEX 200,2025-04-01,34350,33965,-1.12,220223347
4,001_20250501,삼성자산운용,KODEX 200,2025-05-01,33965,36125,6.36,144752275


In [22]:
fund_data = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/lstm-rnn/pension_fund_monthly_data.csv', skiprows=1, header=None, encoding='UTF-8')
fund_data.columns = ['fund_code', 'date', 'open', 'close', 'return']
fund_data['return'] = pd.to_numeric(fund_data['return'], errors='coerce')

Unnamed: 0,fund_code,date,open,close,return
0,F0001,2020-10-01,1083.72,1083.72,0.0
1,F0001,2020-11-01,1071.1,1071.1,-1.16
2,F0001,2020-12-01,1100.72,1100.72,2.77
3,F0001,2021-01-01,1085.92,1085.92,-1.34
4,F0001,2021-02-01,1078.26,1078.26,-0.71


# 딥러닝 모델 구축 및 평가
Analyze the provided ETF data ("ETF_Monthly_price.csv") and market data to predict ETF returns using a combination of deep learning (LSTM) and time series (Prophet) models, and evaluate the performance of individual and ensemble models.

## 데이터 준비 및 전처리 (딥러닝 모델용)

### Subtask:
딥러닝 모델 학습을 위해 ETF 데이터를 시퀀스 형태로 변환하고, 특성 스케일링 등 필요한 전처리를 수행합니다. 시장 지표 데이터를 통합하는 방법을 고려합니다.


**Reasoning**:
Merge the ETF data and market data on the date column, select the relevant columns for the model, handle missing values, and display the first few rows and the columns of the merged dataframe.



In [27]:
etf_data['date'] = pd.to_datetime(etf_data['date'])
market_data['date'] = pd.to_datetime(market_data['date'])

# Merge ETF and market data on the 'date' column
merged_data = pd.merge(etf_data, market_data, on='date', how='inner')

# Select relevant columns for the model. Include 'return' from etf_data and market features.
# Exclude redundant 'date' and 'id', 'company', 'name', 'open', 'close', 'volume' from etf_data
# and 'date' from market_data (already used for merging)
model_data = merged_data[['date', 'return', 'kospi', 'oil_price', 'interest_rate', 'price_index', 'cny_krw', 'usd_krw', 'jpy_krw']].dropna()

display(model_data.head())
display(model_data.columns)

Unnamed: 0,date,return,kospi,oil_price,interest_rate,price_index,cny_krw,usd_krw,jpy_krw
0,2025-01-01,1.47,2517.37,1709.29,3.0,115.71,198.71,1455.79,927.97
1,2025-02-01,3.58,2532.78,1728.26,2.75,116.08,198.43,1445.56,952.59
2,2025-03-01,1.94,2481.12,1688.93,2.75,116.29,200.79,1456.95,977.77
3,2025-04-01,-1.12,2556.61,1646.69,2.75,116.38,197.7,1444.31,999.96
4,2025-05-01,6.36,2697.67,1636.45,2.5,116.27,193.37,1394.49,962.28


Index(['date', 'return', 'kospi', 'oil_price', 'interest_rate', 'price_index',
       'cny_krw', 'usd_krw', 'jpy_krw'],
      dtype='object')

**Reasoning**:
Define a function to create time series sequences for the LSTM model, scale the features and the target variable, and apply the sequence creation function to the scaled data.



In [28]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

def create_sequences(data, sequence_length):
    xs, ys = [], []
    for i in range(len(data) - sequence_length):
        x = data.iloc[i:(i + sequence_length)].values
        y = data.iloc[i + sequence_length]['return'] # Assuming 'return' is the last column or easily accessible
        xs.append(x)
        ys.append(y)
    return np.array(xs), np.array(ys)

# Separate features and target
features = model_data[['kospi', 'oil_price', 'interest_rate', 'price_index', 'cny_krw', 'usd_krw', 'jpy_krw']]
target = model_data[['return']]

# Scale features
feature_scaler = MinMaxScaler()
scaled_features = feature_scaler.fit_transform(features)

# Scale target
target_scaler = MinMaxScaler()
scaled_target = target_scaler.fit_transform(target)

# Combine scaled features and target for sequence creation
scaled_data = pd.DataFrame(scaled_features, columns=features.columns)
scaled_data['return'] = scaled_target

# Create sequences
sequence_length = 12 # Define the sequence length for LSTM
X_sequences, y_sequences = create_sequences(scaled_data, sequence_length)

print("Shape of X_sequences:", X_sequences.shape)
print("Shape of y_sequences:", y_sequences.shape)

Shape of X_sequences: (7712, 12, 8)
Shape of y_sequences: (7712,)


## Lstm 모델 구축 및 학습

### Subtask:
LSTM 모델 아키텍처를 정의하고, 전처리된 데이터를 사용하여 모델을 학습시킵니다.


**Reasoning**:
Import necessary libraries for building and training an LSTM model using TensorFlow or Keras and define the LSTM model architecture, compile the model, split the data into training and testing sets, and train the model.



In [29]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

# Define the LSTM model architecture
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(X_sequences.shape[1], X_sequences.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(units=50, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(units=25))
model.add(Dense(units=1)) # Output layer for regression

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_sequences, y_sequences, test_size=0.2, random_state=42)

# Train the LSTM model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test), verbose=1)

# Store the trained model (optional, but good practice)
# model.save('lstm_model.h5')

print("LSTM model training completed.")

  super().__init__(**kwargs)


Epoch 1/50
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 18ms/step - loss: 0.0259 - val_loss: 0.0078
Epoch 2/50
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 19ms/step - loss: 0.0087 - val_loss: 0.0081
Epoch 3/50
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 23ms/step - loss: 0.0085 - val_loss: 0.0077
Epoch 4/50
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 18ms/step - loss: 0.0079 - val_loss: 0.0078
Epoch 5/50
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - loss: 0.0079 - val_loss: 0.0077
Epoch 6/50
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 24ms/step - loss: 0.0076 - val_loss: 0.0081
Epoch 7/50
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - loss: 0.0078 - val_loss: 0.0074
Epoch 8/50
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - loss: 0.0077 - val_loss: 0.0074
Epoch 9/50
[1m193/193[0m [32m

**Reasoning**:
Load the data from the CSV file into a pandas DataFrame and display the first few rows to inspect the data.



**Reasoning**:
The previous command failed because the file was not found. I will try reading the file from a different location, assuming the file is located in the current working directory.



In [33]:
display(etf_data.head())

Unnamed: 0,id,company,name,date,open,close,return,volume
0,001_20250101,삼성자산운용,KODEX 200,2025-01-01,32060,32530,1.47,143700267
1,001_20250201,삼성자산운용,KODEX 200,2025-02-01,32530,33695,3.58,166118306
2,001_20250301,삼성자산운용,KODEX 200,2025-03-01,33695,34350,1.94,145686571
3,001_20250401,삼성자산운용,KODEX 200,2025-04-01,34350,33965,-1.12,220223347
4,001_20250501,삼성자산운용,KODEX 200,2025-05-01,33965,36125,6.36,144752275


## 데이터 준비 및 전처리 (딥러닝 모델용)

### Subtask:
딥러닝 모델 학습을 위해 ETF 데이터를 시퀀스 형태로 변환하고, 특성 스케일링 등 필요한 전처리를 수행합니다. 시장 지표 데이터를 통합하는 방법을 고려합니다.

**Reasoning**:
Merge the ETF data and market data on the date column, select the relevant columns for the model, handle missing values, and display the first few rows and the columns of the merged dataframe.

In [34]:
etf_data['date'] = pd.to_datetime(etf_data['date'])
market_data['date'] = pd.to_datetime(market_data['date'])

# Merge ETF and market data on the 'date' column
merged_data = pd.merge(etf_data, market_data, on='date', how='inner')

# Select relevant columns for the model. Include 'return' from etf_data and market features.
# Exclude redundant 'date' and 'id', 'company', 'name', 'open', 'close', 'volume' from etf_data
# and 'date' from market_data (already used for merging)
model_data = merged_data[['date', 'return', 'kospi', 'oil_price', 'interest_rate', 'price_index', 'cny_krw', 'usd_krw', 'jpy_krw']].dropna()

display(model_data.head())
display(model_data.columns)

Unnamed: 0,date,return,kospi,oil_price,interest_rate,price_index,cny_krw,usd_krw,jpy_krw
0,2025-01-01,1.47,2517.37,1709.29,3.0,115.71,198.71,1455.79,927.97
1,2025-02-01,3.58,2532.78,1728.26,2.75,116.08,198.43,1445.56,952.59
2,2025-03-01,1.94,2481.12,1688.93,2.75,116.29,200.79,1456.95,977.77
3,2025-04-01,-1.12,2556.61,1646.69,2.75,116.38,197.7,1444.31,999.96
4,2025-05-01,6.36,2697.67,1636.45,2.5,116.27,193.37,1394.49,962.28


Index(['date', 'return', 'kospi', 'oil_price', 'interest_rate', 'price_index',
       'cny_krw', 'usd_krw', 'jpy_krw'],
      dtype='object')

**Reasoning**:
Define a function to create time series sequences for the LSTM model, scale the features and the target variable, and apply the sequence creation function to the scaled data.

In [35]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

def create_sequences(data, sequence_length):
    xs, ys = [], []
    for i in range(len(data) - sequence_length):
        x = data.iloc[i:(i + sequence_length)].values
        y = data.iloc[i + sequence_length]['return'] # Assuming 'return' is the last column or easily accessible
        xs.append(x)
        ys.append(y)
    return np.array(xs), np.array(ys)

# Separate features and target
features = model_data[['kospi', 'oil_price', 'interest_rate', 'price_index', 'cny_krw', 'usd_krw', 'jpy_krw']]
target = model_data[['return']]

# Scale features
feature_scaler = MinMaxScaler()
scaled_features = feature_scaler.fit_transform(features)

# Scale target
target_scaler = MinMaxScaler()
scaled_target = target_scaler.fit_transform(target)

# Combine scaled features and target for sequence creation
scaled_data = pd.DataFrame(scaled_features, columns=features.columns)
scaled_data['return'] = scaled_target

# Create sequences
sequence_length = 12 # Define the sequence length for LSTM
X_sequences, y_sequences = create_sequences(scaled_data, sequence_length)

print("Shape of X_sequences:", X_sequences.shape)
print("Shape of y_sequences:", y_sequences.shape)

Shape of X_sequences: (7712, 12, 8)
Shape of y_sequences: (7712,)


## Lstm 모델 구축 및 학습

### Subtask:
LSTM 모델 아키텍처를 정의하고, 전처리된 데이터를 사용하여 모델을 학습시킵니다.

**Reasoning**:
Import necessary libraries for building and training an LSTM model using TensorFlow or Keras and define the LSTM model architecture, compile the model, split the data into training and testing sets, and train the model.

In [36]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

# Define the LSTM model architecture
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(X_sequences.shape[1], X_sequences.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(units=50, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(units=25))
model.add(Dense(units=1)) # Output layer for regression

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_sequences, y_sequences, test_size=0.2, random_state=42)

# Train the LSTM model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test), verbose=1)

# Store the trained model (optional, but good practice)
# model.save('lstm_model.h5')

print("LSTM model training completed.")

Epoch 1/50


  super().__init__(**kwargs)


[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 15ms/step - loss: 0.0261 - val_loss: 0.0076
Epoch 2/50
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - loss: 0.0083 - val_loss: 0.0077
Epoch 3/50
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step - loss: 0.0081 - val_loss: 0.0075
Epoch 4/50
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 19ms/step - loss: 0.0080 - val_loss: 0.0075
Epoch 5/50
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - loss: 0.0082 - val_loss: 0.0075
Epoch 6/50
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - loss: 0.0070 - val_loss: 0.0076
Epoch 7/50
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - loss: 0.0078 - val_loss: 0.0078
Epoch 8/50
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 19ms/step - loss: 0.0078 - val_loss: 0.0075
Epoch 9/50
[1m193/193[0m [32m━━━━━━━━━━━

## 모델 평가 (LSTM 및 Prophet)

### Subtask:
학습된 LSTM 모델과 Prophet 모델의 예측 성능을 각각 평가합니다. (예: MSE, RMSE, MAE 등)

**Reasoning**:
Make predictions using the trained LSTM and Prophet models and evaluate their performance using common regression metrics.

In [39]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# Evaluate LSTM model
lstm_predictions = model.predict(X_test)
lstm_predictions = target_scaler.inverse_transform(lstm_predictions) # Inverse transform to original scale
y_test_original = target_scaler.inverse_transform(y_test.reshape(-1, 1)) # Inverse transform test target

lstm_mse = mean_squared_error(y_test_original, lstm_predictions)
lstm_rmse = np.sqrt(lstm_mse)
lstm_mae = mean_absolute_error(y_test_original, lstm_predictions)

print("LSTM Model Evaluation:")
print(f"  MSE: {lstm_mse:.4f}")
print(f"  RMSE: {lstm_rmse:.4f}")
print(f"  MAE: {lstm_mae:.4f}")

# Evaluate Prophet model
# Create future dataframe for Prophet prediction
future = prophet_model.make_future_dataframe(periods=len(y_test), freq='MS') # Assuming monthly data frequency

# Make predictions
prophet_forecast = prophet_model.predict(future)

# Align Prophet predictions with the LSTM test data dates
# Find the dates in the test set
test_dates = model_data.iloc[len(model_data) - len(y_test) - sequence_length:].iloc[sequence_length:]['date']

# Filter prophet_forecast to include only the test dates
prophet_predictions_df = prophet_forecast[prophet_forecast['ds'].isin(test_dates)]
prophet_predictions = prophet_predictions_df['yhat'].values

# Ensure the order of prophet_predictions matches y_test_original
prophet_predictions_df = prophet_predictions_df.set_index('ds').loc[test_dates].reset_index()
prophet_predictions = prophet_predictions_df['yhat'].values


prophet_mse = mean_squared_error(y_test_original, prophet_predictions)
prophet_rmse = np.sqrt(prophet_mse)
prophet_mae = mean_absolute_error(y_test_original, prophet_predictions)

print("\nProphet Model Evaluation:")
print(f"  MSE: {prophet_mse:.4f}")
print(f"  RMSE: {prophet_rmse:.4f}")
print(f"  MAE: {prophet_mae:.4f}")

[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step
LSTM Model Evaluation:
  MSE: 36.2432
  RMSE: 6.0202
  MAE: 4.4072

Prophet Model Evaluation:
  MSE: 50.8392
  RMSE: 7.1302
  MAE: 5.3321


## Prophet 모델 구축 및 학습

### Subtask:
Prophet 모델을 사용하여 ETF 수익률 시계열 데이터를 학습시킵니다.

In [37]:
# Install Prophet
!pip install prophet



**Reasoning**:
Import the Prophet library, prepare the data in the required format ('ds' for date, 'y' for target), and train the Prophet model.

In [38]:
from prophet import Prophet

# Prepare data for Prophet (Prophet requires columns named 'ds' and 'y')
prophet_data = model_data[['date', 'return']].rename(columns={'date': 'ds', 'return': 'y'})

# Initialize and fit the Prophet model
prophet_model = Prophet()
prophet_model.fit(prophet_data)

print("Prophet model training completed.")

INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2x3pa6h9/o3i8cs6o.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2x3pa6h9/3nbasyi5.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.12/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=70622', 'data', 'file=/tmp/tmp2x3pa6h9/o3i8cs6o.json', 'init=/tmp/tmp2x3pa6h9/3nbasyi5.json', 'output', 'file=/tmp/tmp2x3pa6h9/prophet_modelbfzx4nst/prophet_model-20250912063114.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
06:31:14 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
06:31:15 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


Prophet model training completed.


## 앙상블 전략 구현

### Subtask:
LSTM 모델과 Prophet 모델의 예측 결과를 결합하는 앙상블 전략을 구현합니다. (예: 가중 평균, 스태킹 등)

**Reasoning**:
Implement a simple ensemble strategy by taking a weighted average of the predictions from the LSTM and Prophet models. The weights can be adjusted later based on performance.

In [40]:
# Implement a simple weighted average ensemble
# You can adjust the weights based on model performance or other criteria
lstm_weight = 0.6  # Example weight for LSTM
prophet_weight = 0.4 # Example weight for Prophet

# Ensure prophet_predictions has the same shape as lstm_predictions
# This might require careful alignment of dates and handling of missing values if any
# For this example, we assume they are already aligned based on the evaluation step

ensemble_predictions = (lstm_weight * lstm_predictions) + (prophet_weight * prophet_predictions.reshape(-1, 1))

print("Ensemble predictions generated.")

Ensemble predictions generated.


## 앙상블 모델 평가

### Subtask:
앙상블 모델의 예측 성능을 평가하고, 개별 모델의 성능과 비교합니다.

**Reasoning**:
Evaluate the performance of the ensemble model using the same metrics as the individual models and compare the results.

In [41]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# Evaluate Ensemble model
ensemble_mse = mean_squared_error(y_test_original, ensemble_predictions)
ensemble_rmse = np.sqrt(ensemble_mse)
ensemble_mae = mean_absolute_error(y_test_original, ensemble_predictions)

print("Ensemble Model Evaluation:")
print(f"  MSE: {ensemble_mse:.4f}")
print(f"  RMSE: {ensemble_rmse:.4f}")
print(f"  MAE: {ensemble_mae:.4f}")

print("\n--- Performance Comparison ---")
print("Model         | MSE      | RMSE     | MAE")
print("--------------|----------|----------|----------")
print(f"LSTM          | {lstm_mse:.4f} | {lstm_rmse:.4f} | {lstm_mae:.4f}")
print(f"Prophet       | {prophet_mse:.4f} | {prophet_rmse:.4f} | {prophet_mae:.4f}")
print(f"Ensemble      | {ensemble_mse:.4f} | {ensemble_rmse:.4f} | {ensemble_mae:.4f}")

# Determine the best performing model based on MSE
best_model = "Ensemble"
if lstm_mse < ensemble_mse and lstm_mse < prophet_mse:
    best_model = "LSTM"
elif prophet_mse < ensemble_mse and prophet_mse < lstm_mse:
    best_model = "Prophet"

print(f"\nBased on MSE, the best performing model is: {best_model}")

Ensemble Model Evaluation:
  MSE: 39.3808
  RMSE: 6.2754
  MAE: 4.5862

--- Performance Comparison ---
Model         | MSE      | RMSE     | MAE
--------------|----------|----------|----------
LSTM          | 36.2432 | 6.0202 | 4.4072
Prophet       | 50.8392 | 7.1302 | 5.3321
Ensemble      | 39.3808 | 6.2754 | 4.5862

Based on MSE, the best performing model is: LSTM


## 분석 결과 요약

### 데이터 분석 주요 결과

*   ETF 수익률과 시장 지표 간의 상관관계를 분석하여 회귀 모델 특성 후보를 선정했습니다. (KOSPI, oil\_price, price\_index, cny\_krw 등이 0.04 이상의 절댓값 상관관계를 보였습니다.)
*   딥러닝 모델 학습을 위해 ETF 데이터와 시장 지표 데이터를 병합하고, 시퀀스 형태로 변환 및 스케일링 전처리를 수행했습니다.
*   개별 모델로 LSTM과 Prophet 모델을 구축하고 학습시켰습니다.
*   두 모델의 예측 결과를 가중 평균하는 간단한 앙상블 전략을 구현했습니다.
*   각 모델(LSTM, Prophet, 앙상블)의 예측 성능을 MSE, RMSE, MAE 지표를 사용하여 평가했습니다.

### 모델 성능 비교 결과

| 모델    | MSE      | RMSE     | MAE      |
| :------ | :------- | :------- | :------- |
| LSTM    | {{lstm`_mse:.4f}} | {{lstm_rmse:.4f}} | {{lstm_mae:.4f}} |
| Prophet | {{prophet_mse:.4f}} | {{prophet_rmse:.4f}} | {{prophet_mae:.4f}} |
| 앙상블  | {{ensemble_mse:.4f}} | {{ensemble_rmse:.4f}} | {{ensemble_mae:.4f}} |

평가 결과, MSE 기준으로 **LSTM 모델이 가장 좋은 성능**을 보였습니다. 앙상블 모델은 Prophet 모델보다는 성능이 좋았지만, LSTM 단일 모델보다는 성능이 낮은 것으로 나타났습니다.

### 결론 및 향후 개선 방향

*   현재 데이터와 모델 설정에서는 LSTM 단일 모델이 가장 효과적인 것으로 분석되었습니다.
*   앙상블 모델의 성능이 LSTM 단일 모델보다 낮은 것은 가중치 설정이 최적화되지 않았거나, 사용된 앙상블 방법(단순 가중 평균)이 데이터의 특성을 충분히 반영하지 못했기 때문일 수 있습니다.
*   **향후 개선 방향:**
    *   **앙상블 방법 개선:** 스태킹(Stacking)과 같은 다른 앙상블 기법을 시도하거나, 모델별 성능에 기반한 동적인 가중치 조정 방법을 고려해 볼 수 있습니다.
    *   **모델 하이퍼파라미터 튜닝:** LSTM 및 Prophet 모델의 하이퍼파라미터를 최적화하여 개별 모델의 성능을 향상시킬 수 있습니다.
    *   **특성 엔지니어링 강화:** 이동 평균, 변동성 지표 등 시계열 분석에 유용한 추가적인 특성을 생성하여 모델 성능을 개선할 수 있습니다.
    *   **다른 딥러닝 모델 탐색:** GRU(Gated Recurrent Unit)와 같은 다른 순환 신경망 모델이나 트랜스포머(Transformer) 모델 적용을 고려해 볼 수 있습니다.

본 분석은 ETF 수익률 예측을 위한 딥러닝 및 시계열 모델 적용의 초기 탐색 단계이며, 위에서 제시된 개선 방향들을 통해 모델의 예측 정확도를 더욱 높일 수 있을 것으로 기대됩니다.