In [None]:
# Mount Google Drive into the Colab filesystem at '/content/drive'.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd

# Load the datasets
# NOTE(review): Drive is mounted at '/content/drive', but these paths start with
# '/content/drivedata/' (no separator after 'drive') — confirm they point at the
# intended location before relying on this cell.
stock_data_path = '/content/drivedata/dow_1day_price.csv'
etf_data_path = '/content/drivedata/etf_1day_price.csv'

# Read the data
stock_data = pd.read_csv(stock_data_path)
etf_data = pd.read_csv(etf_data_path)

# Snapshot the first rows of the raw frames (taken before any re-indexing below)
stock_data_head = stock_data.head()
etf_data_head = etf_data.head()

# DataFrame.info() prints its report and returns None, so it is called purely
# for the printed output instead of being bound to a name.
stock_data.info()
etf_data.info()

# Ensure both datasets are indexed by a proper datetime column for alignment.
# Reassigning (rather than mutating in place) keeps each step's lineage explicit.
if 'Date' in stock_data.columns:
    stock_data = stock_data.assign(Date=pd.to_datetime(stock_data['Date'])).set_index('Date')
if 'Date' in etf_data.columns:
    etf_data = etf_data.assign(Date=pd.to_datetime(etf_data['Date'])).set_index('Date')

# Summarize key statistics
stock_summary = stock_data.describe()
etf_summary = etf_data.describe()

# Align the two datasets by date: inner join on the index, so only index values
# present in both frames are kept; clashing column names get the suffixes below.
merged_data = pd.merge(
    stock_data,
    etf_data,
    how='inner',
    left_index=True,
    right_index=True,
    suffixes=('_stock', '_etf'),
)

# Display basic statistics and aligned data
stock_data_head, etf_data_head, stock_summary, etf_summary, merged_data.head()


In [None]:
import matplotlib.pyplot as plt

# Keep each chart to a handful of series so individual lines stay readable
selected_stock_columns = stock_data.columns[:5]  # leading five stock columns
selected_etf_columns = etf_data.columns[:5]  # leading five ETF columns


def _plot_price_trends(frame, columns, label_prefix, title):
    """Draw one line per entry of `columns` from `frame` against its index.

    Each call opens a new 14x7 figure, labels every line as
    '<label_prefix>: <column>', and shows the figure immediately.
    """
    plt.figure(figsize=(14, 7))
    for column_name in columns:
        plt.plot(frame.index, frame[column_name], label=f'{label_prefix}: {column_name}')
    plt.title(title)
    plt.xlabel('Date')
    plt.ylabel('Price')
    plt.legend(loc='upper left')
    plt.grid(True)
    plt.show()


# Stock chart first, then the ETF chart
_plot_price_trends(stock_data, selected_stock_columns, 'Stock', 'Stock Price Trends')
_plot_price_trends(etf_data, selected_etf_columns, 'ETF', 'ETF Price Trends')


In [None]:
# Trend analysis (almost no correlation)
# Compare the leading column of each dataset one-to-one
specific_stock = stock_data.columns[0]
specific_etf = etf_data.columns[0]

# Overlay both series on a single set of axes
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(stock_data.index, stock_data[specific_stock], label=f'Stock: {specific_stock}')
ax.plot(etf_data.index, etf_data[specific_etf], label=f'ETF: {specific_etf}')
ax.set_title(f'Comparison of Stock ({specific_stock}) and ETF ({specific_etf}) Trends')
ax.set_xlabel('Date')
ax.set_ylabel('Price')
ax.legend(loc='upper left')
ax.grid(True)
plt.show()

# Correlation between the two selected series, as computed by Series.corr
stock_series = stock_data[specific_stock]
etf_series = etf_data[specific_etf]
correlation = stock_series.corr(etf_series)

correlation


In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR
from tensorflow.keras.layers import Dense, LSTM, GRU
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import set_random_seed

# Prepare data for modeling
# Use the specific stock as target and specific ETF as feature; rows where
# either series is missing are dropped.
data = pd.DataFrame({
    'ETF': etf_data[specific_etf],
    'Stock': stock_data[specific_stock]
}).dropna()

# Normalize data.
# The scaler's min/max are learned from the leading 80% of rows only, so the
# held-out tail never influences the scaling (fitting on all rows would leak
# test-period extremes into training). Held-out values may therefore fall
# slightly outside [0, 1], which is expected.
# NOTE(review): this fraction must stay in sync with the unshuffled
# test_size=0.2 split performed later in this cell; it also assumes the rows of
# `data` are in chronological order — confirm.
TRAIN_FRACTION = 0.8
n_fit_rows = int(len(data) * TRAIN_FRACTION)

scaler = MinMaxScaler()
scaler.fit(data.iloc[:n_fit_rows])
data_scaled = scaler.transform(data)

# Create features and labels for time-series forecasting (use ETF to predict Stock)
def create_sequences(data, time_steps=10):
    """Build sliding-window samples that use past ETF values to predict the stock.

    Sample ``i`` takes rows ``i .. i + time_steps - 1`` of column 0 as its
    features and row ``i + time_steps`` of column 1 as its target.

    Args:
        data: 2-D array-like; column 0 is the ETF feature, column 1 the stock target.
        time_steps: Number of consecutive column-0 values in each sample.

    Returns:
        Tuple ``(X, y)`` where ``X`` has shape ``(n, time_steps)`` and ``y`` has
        shape ``(n,)``, with ``n = max(len(data) - time_steps, 0)``. When there
        are too few rows for a single window, both arrays are empty but ``X``
        keeps its second dimension so callers can still read ``X.shape[1]``.
    """
    n_samples = len(data) - time_steps
    if n_samples <= 0:
        return np.empty((0, time_steps)), np.empty((0,))

    values = np.asarray(data)
    # Row i of window_index is [i, i + 1, ..., i + time_steps - 1].
    window_index = np.arange(n_samples)[:, None] + np.arange(time_steps)[None, :]
    X = values[:, 0][window_index]
    # Copy so the targets do not alias the caller's array.
    y = values[time_steps:, 1].copy()
    return X, y

# Window length: how many past ETF values feed each prediction
time_steps = 10
X, y = create_sequences(data_scaled, time_steps)

# Chronological hold-out: the last 20% of samples become the test set (no shuffling)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

# The recurrent layers expect (samples, time_steps, features); add a single feature axis
n_train_samples, n_train_steps = X_train.shape[0], X_train.shape[1]
n_test_samples, n_test_steps = X_test.shape[0], X_test.shape[1]
X_train_lstm = X_train.reshape(n_train_samples, n_train_steps, 1)
X_test_lstm = X_test.reshape(n_test_samples, n_test_steps, 1)

# Seed used before every neural-network fit so repeated runs start from the
# same weight initialisation and batch order.
# NOTE(review): some GPU kernels may still be non-deterministic — confirm if
# bit-for-bit reproducibility is required.
SEED = 42


def _fit_recurrent_model(layer_cls, features, targets):
    """Build, compile and train a one-layer recurrent regressor.

    Args:
        layer_cls: Recurrent layer class to use (LSTM or GRU).
        features: Training inputs shaped (samples, time_steps, features).
        targets: Training targets, one value per sample.

    Returns:
        The fitted Sequential model: a 50-unit recurrent layer followed by a
        single-unit Dense output, trained for 10 epochs with MSE loss.
    """
    set_random_seed(SEED)
    model = Sequential([
        layer_cls(50, activation='relu', input_shape=(features.shape[1], features.shape[2])),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse')
    model.fit(features, targets, epochs=10, batch_size=32, verbose=0)
    return model


### 1. SVR Model
# X_train / X_test are already 2-D (samples, time_steps), so they are passed as-is.
svr_model = SVR(kernel='rbf')
svr_model.fit(X_train, y_train)
y_pred_svr = svr_model.predict(X_test)

### 2. LSTM Model
lstm_model = _fit_recurrent_model(LSTM, X_train_lstm, y_train)
y_pred_lstm = lstm_model.predict(X_test_lstm)

### 3. GRU Model
gru_model = _fit_recurrent_model(GRU, X_train_lstm, y_train)
y_pred_gru = gru_model.predict(X_test_lstm)

def _stock_to_original_scale(scaled_stock):
    """Map scaled stock values back to the original scale.

    The scaler was fitted on two columns (ETF, Stock), so the values are paired
    with a zero-filled placeholder in the ETF slot and only the stock column of
    the inverse transform is returned.
    """
    flat_values = np.asarray(scaled_stock).flatten()
    padded = np.column_stack([np.zeros_like(flat_values), flat_values])
    return scaler.inverse_transform(padded)[:, 1]


# Bring the ground truth and every model's predictions back to the original scale
y_test_rescaled = _stock_to_original_scale(y_test)
y_pred_svr_rescaled = _stock_to_original_scale(y_pred_svr)
y_pred_lstm_rescaled = _stock_to_original_scale(y_pred_lstm)
y_pred_gru_rescaled = _stock_to_original_scale(y_pred_gru)

# Score each model against the same rescaled ground truth
rescaled_predictions = {
    "SVR": y_pred_svr_rescaled,
    "LSTM": y_pred_lstm_rescaled,
    "GRU": y_pred_gru_rescaled,
}
results = {
    model_name: {
        "MSE": mean_squared_error(y_test_rescaled, predicted),
        "MAE": mean_absolute_error(y_test_rescaled, predicted),
    }
    for model_name, predicted in rescaled_predictions.items()
}

results
