In [78]:
# 0. Import all libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import datetime as dt
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import tensorflow as tf


In [79]:
# 1. Import data
# Close prices + Other prices + Volumes
file_path = './data/SMCI.csv'
data_price = pd.read_csv(file_path)

# Interest rates
file_path_IR = './data/IR_daily.csv'
data_IR = pd.read_csv(file_path_IR)


In [80]:
# Data CLeaning

# Price data
# Remove dollar signs and convert to float for specified columns
columns_to_convert = ['Close/Last', 'Open', 'High', 'Low']
for column in columns_to_convert:
    data_price[column] = data_price[column].str.replace('$', '').str.replace(',', '').astype(float)



# # Interest Rate Data
# # 处理DPRIME列中的缺失数据
# for idx, value in data_IR['DPRIME'].items():
#     if value == '.':
#         # 寻找上一格和下一格的索引
#         if idx > 0:
#             prev_idx = idx - 1
#         else:
#             prev_idx = idx
#         
#         if idx < len(data_IR) - 1:
#             next_idx = idx + 1
#         else:
#             next_idx = idx
#         
#         # 计算平均值并填充
#         if data_IR.at[prev_idx, 'DPRIME'] != '.' and data_IR.at[next_idx, 'DPRIME'] != '.':
#             avg_value = (float(data_IR.at[prev_idx, 'DPRIME']) + float(data_IR.at[next_idx, 'DPRIME'])) / 2
#             formatted_avg_value = f'{avg_value:.2f}'
#             data_IR.at[idx, 'DPRIME'] = formatted_avg_value


In [81]:
# Prepare the complete dataset, called "merged_data"

# 1. Convert 'Date' columns to datetime
data_price['Date'] = pd.to_datetime(data_price['Date'])
data_IR['Date'] = pd.to_datetime(data_IR['DATE'])  # 确保 'DATE' 列名正确

# # 2. Merge based on 'Date' column
# merged_data = pd.merge(data_price, data_IR[['Date', 'DPRIME']], on='Date', how='inner')
# 
# # 3. Print the merged data to check alignment
# print("Merged Data:")
# print(merged_data.head())
# 
# # 4. Check for any missing data after merge
# missing_data = merged_data[merged_data.isnull().any(axis=1)]
# if not missing_data.empty:
#     print("There are rows with mismatched dates:")
#     print(missing_data)
# else:
#     print("All dates in the data are aligned correctly.")


In [82]:
merged_data = data_price

In [83]:
# Preparing Data
train_set = merged_data[(merged_data['Date'] >= '2014-06-24') & (merged_data['Date'] < '2023-06-24')].copy()
valid_set = merged_data[(merged_data['Date'] >= '2023-06-25') & (merged_data['Date'] < '2024-06-21')].copy()

In [84]:
# 2. Setup Variables
# 2.1 Time horizon set to be 9 years for training, 1 year for validation
# 2.2 time steps for LSTM
time_steps = 90     # tested [30, 60, 90, 120], 90 is the most efficient one

# 2.3 Feature selection
all_features = ['Close/Last', 'Volume', 'Open', 'High', 'Low', 'DPRIME']
selected_features = ['Close/Last', 'Volume']
# selected_features = ['Close/Last']
num_features = len(selected_features)

In [85]:
# 3. Data pre-processing
scaler = MinMaxScaler(feature_range=(0, 1))

# Normalize the 'Close/Last' column for both training and validation sets
train_data = scaler.fit_transform(train_set[selected_features].values)
valid_data = scaler.transform(valid_set[selected_features].values)

# Initialize lists to store training and validation data
# x may contain multiple features, while y is the target value that is being predicted, the close price
x_train, y_train = [], []
for i in range(time_steps, len(train_data)):
    x_train.append(train_data[i-time_steps:i, :])
    y_train.append(train_data[i, 0])  # Only keep the Close/Last column as label

x_valid, y_valid = [], []
for i in range(time_steps, len(valid_data)):
    x_valid.append(valid_data[i-time_steps:i, :])
    y_valid.append(valid_data[i, 0])  # Only keep the Close/Last column as label


# Convert lists to numpy arrays
x_train, y_train = np.array(x_train), np.array(y_train)
x_valid, y_valid = np.array(x_valid), np.array(y_valid)
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], len(selected_features)))
x_valid = np.reshape(x_valid, (x_valid.shape[0], x_valid.shape[1], len(selected_features)))

In [86]:
# 4. Build Model - function
def build_model():
    model = Sequential()
    model.add(LSTM(units=50, return_sequences=True, input_shape=(time_steps, num_features)))
    model.add(Dropout(0.2))
    model.add(LSTM(units=50, return_sequences=False))
    model.add(Dropout(0.2))
    model.add(Dense(units=1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

In [87]:
# 5. Training - Function
def train_model(model, x_train, y_train):
    model.fit(x_train, y_train, epochs=25, batch_size=32)

In [88]:
# 6. Make Prediction - Function

# @tf.function(reduce_retracing=True)
# def make_prediction(model, x_valid):
#     return model(x_valid, training=False)


def make_prediction(model, x_valid, scaler):
    predictions = model.predict(x_valid)
    predictions = scaler.inverse_transform(predictions)
    return predictions


In [89]:
# Execute step 4,5,6

# model = build_model()
# train_model(model, x_train, y_train)
# 
# # 将验证集数据转换为张量
# x_valid_tensor = tf.convert_to_tensor(x_valid, dtype=tf.float32)
# 
# # 预测
# valid_preds = make_prediction(model, x_valid_tensor)
# 
# # Ensure valid_preds has the correct shape
# valid_preds = np.squeeze(valid_preds, axis=-1)



model = build_model()
train_model(model, x_train, y_train)
valid_preds = make_prediction(model, x_valid, scaler)



Epoch 1/25


  super().__init__(**kwargs)


[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 27ms/step - loss: 0.0012
Epoch 2/25
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 27ms/step - loss: 2.1656e-04
Epoch 3/25
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 39ms/step - loss: 1.6724e-04
Epoch 4/25
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 39ms/step - loss: 1.3017e-04
Epoch 5/25
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 32ms/step - loss: 1.3707e-04
Epoch 6/25
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - loss: 1.0785e-04
Epoch 7/25
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 30ms/step - loss: 1.3073e-04
Epoch 8/25
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - loss: 1.3399e-04
Epoch 9/25
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 32ms/step - loss: 1.0414e-04
Epoch 10/25
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 38m

ValueError: non-broadcastable output operand with shape (159,1) doesn't match the broadcast shape (159,2)

In [None]:
print(valid_preds)

In [None]:
# 计算验证集上的RMSE
valid_rmse = np.sqrt(mean_squared_error(valid_set['Close/Last'].values[time_steps:], valid_preds))
print(f"LSTM RMSE on validation set: {valid_rmse}")

# 可视化LSTM结果
plt.figure(figsize=(14, 7))
plt.plot(train_set['Date'], train_set['Close/Last'], label='Training Data')
plt.plot(valid_set['Date'][time_steps:], valid_set['Close/Last'].values[time_steps:], label='Validation Data')
plt.plot(valid_set['Date'][time_steps:], valid_preds, label='Validation Predictions')
plt.xlabel('Date')
plt.ylabel('Stock Price')
plt.title('Stock Price Prediction with LSTM')
plt.legend()
plt.show()