# Part 1: Sentiment Analysis

In [1]:
import pandas as pd

# Load the CSV file
file_path = './data/NEWS_YAHOO_stock_prediction.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataframe
data.head()

Unnamed: 0.1,Unnamed: 0,ticker,Date,category,title,content,Open,High,Low,Close,Adj Close,Volume,label
0,0,AAPL,2020-01-27,opinion,Apple Set To Beat Q1 Earnings Estimates Tech ...,Technology giant Apple NASDAQ AAPL is set ...,77.514999,77.942497,76.220001,77.237503,75.793358,161940000,0
1,1,AAPL,2020-01-27,opinion,Tech Daily Intel Results Netflix Surge Appl...,The top stories in this digest are Intel s N...,77.514999,77.942497,76.220001,77.237503,75.793358,161940000,0
2,2,AAPL,2020-01-27,opinion,7 Monster Stock Market Predictions For The Wee...,S P 500 SPY \nThis week will be packed with e...,77.514999,77.942497,76.220001,77.237503,75.793358,161940000,0
3,3,AAPL,2020-01-27,opinion,Apple Earnings Preview 5G Launch Expanding S...,Reports Q1 2020 results on Tuesday Jan 28 ...,77.514999,77.942497,76.220001,77.237503,75.793358,161940000,0
4,4,AAPL,2020-01-27,opinion,Buy Surging Apple Microsoft Stock Before Qua...,On today s episode of Full Court Finance here ...,77.514999,77.942497,76.220001,77.237503,75.793358,161940000,0


In [2]:
data[['Date', 'category', 'content']].head(10)

Unnamed: 0,Date,category,content
0,2020-01-27,opinion,Technology giant Apple NASDAQ AAPL is set ...
1,2020-01-27,opinion,The top stories in this digest are Intel s N...
2,2020-01-27,opinion,S P 500 SPY \nThis week will be packed with e...
3,2020-01-27,opinion,Reports Q1 2020 results on Tuesday Jan 28 ...
4,2020-01-27,opinion,On today s episode of Full Court Finance here ...
5,2020-01-27,opinion,Monday January 27 2020The Zacks Research Dai...
6,2020-01-27,news,By Peter Nurse \nInvesting com European stoc...
7,2020-01-27,news,BTIG analyst Mark Palmer initiates coverage of...
8,2020-01-24,opinion,I got a great question recently from Barbara P...
9,2020-01-24,opinion,We get into the heart of the Q4 earnings seaso...


In [3]:
# Step 1: Remove unnecessary column
data.drop(columns=['Unnamed: 0'], inplace=True)

# Step 2: Remove duplicate texts
data.drop_duplicates(subset=['title', 'content'], inplace=True)

# Step 3: Remove rows with large amount of spaces or empty texts in 'title' and 'content'
data = data[~data['title'].str.isspace()]
data = data[~data['content'].str.isspace()]
data.dropna(subset=['title', 'content'], inplace=True)

# Check the dataframe after these preprocessing steps
data.info()

# Step 5: Check for invalid numeric data
numeric_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
data[numeric_columns].describe()


<class 'pandas.core.frame.DataFrame'>
Index: 15965 entries, 0 to 15974
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ticker     15965 non-null  object 
 1   Date       15965 non-null  object 
 2   category   15965 non-null  object 
 3   title      15965 non-null  object 
 4   content    15965 non-null  object 
 5   Open       15965 non-null  float64
 6   High       15965 non-null  float64
 7   Low        15965 non-null  float64
 8   Close      15965 non-null  float64
 9   Adj Close  15965 non-null  float64
 10  Volume     15965 non-null  int64  
 11  label      15965 non-null  int64  
dtypes: float64(5), int64(2), object(5)
memory usage: 1.6+ MB


Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,15965.0,15965.0,15965.0,15965.0,15965.0,15965.0
mean,40.583061,40.952148,40.241173,40.605005,38.739098,153646300.0
std,11.884583,11.980327,11.799389,11.89182,12.15832,109603300.0
min,13.856071,14.271429,13.753571,13.9475,12.084597,45448000.0
25%,31.522499,31.772499,31.264999,31.475,28.576729,95174000.0
50%,40.9375,41.432499,40.602501,41.0,39.263371,121150800.0
75%,47.125,47.424999,46.695,47.037498,45.263882,169126400.0
max,80.0625,80.832497,79.379997,79.807503,78.315315,1460852000.0


In [4]:
# (optional) set proxy
import subprocess
import os

result = subprocess.run('bash -c "source ~/clash_dir/set && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
output
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value

In [5]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

# Load the FinBERT model and tokenizer
checkpoint = 'yiyanghkust/finbert-tone'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

# Create a pipeline for sentiment analysis
# Do not truncate the original text
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=0)

2023-12-01 22:31:42.750261: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-01 22:31:42.795731: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-01 22:31:42.795753: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-01 22:31:42.797140: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-01 22:31:42.806481: I tensorflow/core/platform/cpu_feature_guar

In [6]:
# 滑动窗口方法
from collections import Counter
from concurrent.futures import ThreadPoolExecutor
# from tqdm.auto import tqdm
from tqdm import tqdm

# 根据token数量来切分
def sliding_window(text, max_len, overlap, tokenizer):
    """
    Args:
    text (str): the text to be split into chunks
    max_len (int): the maximum length of each chunk
    overlap (int): the number of overlapped tokens between chunks
    tokenizer: the tokenizer used to tokenize the text

    Returns:
    list of str: the list of text chunks
    """
    tokens = tokenizer.tokenize(text)
    chunks = []
    for i in range(0, len(tokens), max_len - overlap):
        chunk = tokens[i:i + max_len]
        chunk = tokenizer.convert_tokens_to_string(chunk)
        chunks.append(chunk)
    return chunks

def process_batch(texts, nlp, max_len, overlap, tokenizer):
    sentiments = []
    for text in texts:
        # Apply sliding window to the text
        text_chunks = sliding_window(text, max_len, overlap, tokenizer)

        # Apply sentiment analysis to each chunk
        chunk_sentiments = []
        chunk_weights = []
        for chunk in text_chunks:
            result = nlp(chunk)
            sentiment = result[0]['label']
            chunk_sentiments.append(sentiment)
            # Use the length of the chunk as the weight
            weight = len(chunk)
            chunk_weights.append(weight)

        # Combine the sentiments using weighted voting
        sentiment_counter = Counter()
        for sentiment, weight in zip(chunk_sentiments, chunk_weights):
            sentiment_counter[sentiment] += weight
        final_sentiment = sentiment_counter.most_common(1)[0][0]
        sentiments.append(final_sentiment)
    return sentiments

def apply_sentiment_analysis_parallel(df, nlp, tokenizer, text_column='content', max_len=500, overlap=50, num_workers=16, batch_size=10):
    """
    Apply sentiment analysis to a column in a dataframe using sliding window method.
    
    Args:
    df (pd.DataFrame): Dataframe containing the text data.
    nlp (pipeline): HuggingFace pipeline for sentiment analysis.
    tokenizer: the tokenizer used to tokenize the text
    text_column (str): Name of the column containing text data.
    max_len (int): The maximum length of each text chunk.
    overlap (int): The number of overlapped tokens between chunks.
    num_workers (int): The number of threads to use for parallel processing.
    batch_size (int): The number of texts to process in each batch.

    Returns:
    pd.DataFrame: Dataframe with a new column 'sentiment' containing the analysis results.
    """
    # Break the texts into batches
    text_batches = [df[text_column][i:i + batch_size] for i in range(0, len(df), batch_size)]

    sentiments = []
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = [executor.submit(process_batch, batch, nlp, max_len, overlap, tokenizer) for batch in text_batches]
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            try:
                sentiments.extend(future.result())
            except Exception as e:
                print(f"Error in processing text: {e}")
                sentiments.extend(['Error'] * batch_size)

    # Add the sentiments as a new column in the dataframe
    df['sentiment'] = sentiments
    return df

In [10]:
# Step 1: Apply sentiment analysis to the dataset
data = apply_sentiment_analysis_parallel(data, nlp, tokenizer)

# Step 2: Prepare data for the prediction model
# Here we'll assume the sentiment analysis has been applied and 'sentiment' column is added to the data

# We might want to convert sentiments to numerical values for model training
sentiment_mapping = {'Positive': 1, 'Neutral': 0, 'Negative': -1}
data['sentiment_numeric'] = data['sentiment'].map(sentiment_mapping)



In [None]:
# Example code to save the processed DataFrame to a CSV file
data.to_csv('./data/dataset_with_sentiment.csv', index=False)

In [None]:
# Adjust display settings for better visualization of samples
pd.set_option('display.max_colwidth', 200)  # Adjust the width to fit longer texts

# Display some random samples with formatted output
sample_data = data.sample(n=10)[['content', 'sentiment']]

# Print each sample in a more readable format
for index, row in sample_data.iterrows():
    print(f"Sample {index}:")
    print(f"Content: {row['content']}")
    print(f"Sentiment: {row['sentiment']}\n")


In [None]:
# Assuming 'data' is your DataFrame with 'sentiment' and 'label' columns
# Calculate the proportion of each sentiment category
sentiment_counts = data['sentiment'].value_counts(normalize=True) * 100

# Calculate the proportion of each label
label_counts = data['label'].value_counts(normalize=True) * 100

# Print the results
print("Sentiment Distribution (%):")
print(sentiment_counts)
print("\nLabel Distribution (%):")
print(label_counts)

# For additional insights, we can also look at the cross-tabulation of sentiment and label
crosstab = pd.crosstab(data['sentiment'], data['label'], normalize='index') * 100
print("\nCross-Tabulation of Sentiment and Label (%):")
print(crosstab)

In [None]:
# read for existed csv
import pandas as pd
data = pd.read_csv('./data/dataset_with_sentiment.csv')

# Convert the 'Date' column to datetime format and sort the dataframe by 'Date'
data['Date'] = pd.to_datetime(data['Date'])
data_sorted = data.sort_values(by='Date')

In [None]:
# 按 'Date' 和 'sentiment' 分组，然后计算每个类别的 category 为news和opinion的数量
category_news_per_day_sentiment = data_sorted[data_sorted['category'] == 'news'].groupby(['Date', 'sentiment']).size().unstack().fillna(0)
category_opinion_per_day_sentiment = data_sorted[data_sorted['category'] == 'opinion'].groupby(['Date', 'sentiment']).size().unstack().fillna(0)
# 分别计算news和opinion的total
category_news_total_per_day_sentiment = data_sorted[data_sorted['category'] == 'news'].groupby(['Date']).size()
category_opinion_total_per_day_sentiment = data_sorted[data_sorted['category'] == 'opinion'].groupby(['Date']).size()

data_sorted = data_sorted.set_index('Date')
data_sorted['P_news_pos'] = category_news_per_day_sentiment['Positive'].reindex(data_sorted.index) / category_news_total_per_day_sentiment.reindex(data_sorted.index)
data_sorted['P_news_neg'] = category_news_per_day_sentiment['Negative'].reindex(data_sorted.index) / category_news_total_per_day_sentiment.reindex(data_sorted.index)
data_sorted['P_op_pos'] = category_opinion_per_day_sentiment['Positive'].reindex(data_sorted.index) / category_opinion_total_per_day_sentiment.reindex(data_sorted.index)
data_sorted['P_op_neg'] = category_opinion_per_day_sentiment['Negative'].reindex(data_sorted.index) / category_opinion_total_per_day_sentiment.reindex(data_sorted.index)
data_sorted = data_sorted.reset_index()

In [None]:
daily_data = data_sorted.groupby('Date').last()

# Shift the 'Open' column to get the next day's opening price
daily_data['Next_Open'] = daily_data['Open'].shift(-1)

# Drop the last row as it will not have a 'Next_Open' value
daily_data = daily_data[:-1]

In [None]:
date_to_query = pd.to_datetime('2016-10-28')
daily_data.loc[(date_to_query)]

In [None]:
date_to_query = pd.to_datetime('2016-10-28')
data_sorted.loc[data_sorted['Date'] == date_to_query]

In [None]:
date_to_query = pd.to_datetime('2020-01-23')
daily_data.loc[(date_to_query)]

In [None]:
date_to_query = pd.to_datetime('2018-05-06')
category_to_query = 'news'
data_sorted.loc[(data_sorted['Date'] == date_to_query) & (data_sorted['category'] == category_to_query)]

In [None]:
daily_data['P_news_neg'].fillna(0, inplace=True)
daily_data['P_news_pos'].fillna(0, inplace=True)
daily_data['P_op_neg'].fillna(0, inplace=True)
daily_data['P_op_pos'].fillna(0, inplace=True)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the dataset into a Pandas DataFrame
historical_data = pd.read_csv('data/AAPL_Yahoo_Correct.csv')

# Convert the 'Date' column to datetime format
historical_data['Date'] = pd.to_datetime(historical_data['Date'])

# Plotting the 'Open' price against the 'Date'
plt.figure(figsize=(10, 5))
plt.plot(historical_data['Date'], historical_data['Open'], label='AAPL Open Price')
plt.xlabel('Date')
plt.ylabel('Open Price ($)')
plt.title('AAPL Stock Open Price Over Time')
plt.legend()
plt.show()


In [None]:
# 按照Date将historical_data的全部列和daily_data的这四个P_开头的列合并。如果出现有些天在daily_data中不存在，则四个P_开头的列在这一天都置为0。
daily_data_merged = pd.merge(historical_data, daily_data[['P_news_neg', 'P_news_pos', 'P_op_neg', 'P_op_pos']], left_on='Date', right_index=True, how='left')
daily_data_merged[['P_news_neg', 'P_news_pos', 'P_op_neg', 'P_op_pos']] = daily_data_merged[['P_news_neg', 'P_news_pos', 'P_op_neg', 'P_op_pos']].fillna(0)

In [None]:
plt.figure(figsize=(10, 5))
plt.plot(daily_data_merged['Date'], daily_data_merged['Open'], label='AAPL Open Price')
plt.xlabel('Date')
plt.ylabel('Open Price ($)')
plt.title('AAPL Stock Open Price Over Time')
plt.legend()
plt.show()

In [None]:
daily_data_merged.to_csv('./data/dataset_FinBERT.csv')

# Part 1.1: VADER Sentiment Analysis

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the dataset into a Pandas DataFrame
historical_data = pd.read_csv('data/AAPL_Yahoo_Correct.csv')

# Convert the 'Date' column to datetime format
historical_data['Date'] = pd.to_datetime(historical_data['Date'])

# Plotting the 'Open' price against the 'Date'
plt.figure(figsize=(10, 5))
plt.plot(historical_data['Date'], historical_data['Open'], label='AAPL Open Price')
plt.xlabel('Date')
plt.ylabel('Open Price ($)')
plt.title('AAPL Stock Open Price Over Time')
plt.legend()
plt.show()

In [None]:
# read VADER results from csv
# vader_daily_results = pd.read_csv('./data/VADER_results.csv')
# vader_daily_results = pd.read_csv('./data/combined_data_mean5.csv')
# vader_daily_results = pd.read_csv('./data/combined_data_mean_first512.csv')
vader_daily_results = pd.read_csv('./data/proportion_data.csv')

In [None]:
vader_daily_results['Date'] = pd.to_datetime(vader_daily_results['Date'])
vader_daily_results = vader_daily_results[['Date', 'news_neg', 'news_pos', 'opinion_neg', 'opinion_pos']]
vader_daily_results.columns = ['Date', 'P_news_neg', 'P_news_pos', 'P_op_neg', 'P_op_pos']
vader_daily_results.set_index('Date', inplace=True)

daily_data_merged = pd.merge(historical_data, vader_daily_results[['P_news_neg', 'P_news_pos', 'P_op_neg', 'P_op_pos']], left_on='Date', right_index=True, how='left')
daily_data_merged[['P_news_neg', 'P_news_pos', 'P_op_neg', 'P_op_pos']] = daily_data_merged[['P_news_neg', 'P_news_pos', 'P_op_neg', 'P_op_pos']].fillna(0)

In [None]:
daily_data_merged.head(10)

In [None]:
daily_data_merged.to_csv('./data/dataset_VADER.csv')

# Part 2: Stock price prediction

In [None]:
import pandas as pd
# FinBERT
daily_data_merged = pd.read_csv('./data/dataset_FinBERT.csv', index_col=0)
# VADER
# daily_data_merged = pd.read_csv('./data/dataset_VADER.csv', index_col=0)

- Showing sentiment analysis results

In [None]:
daily_sentiment_results = daily_data_merged[['Date', 'P_news_pos', 'P_news_neg', 'P_op_pos', 'P_op_neg']]
import seaborn as sns
import matplotlib.pyplot as plt

# Split the data into two parts: news sentiment analysis results and opinion sentiment analysis results
news_sentiment_results = daily_sentiment_results[['Date', 'P_news_pos', 'P_news_neg']]
opinion_sentiment_results = daily_sentiment_results[['Date', 'P_op_pos', 'P_op_neg']]

# Ensure 'Date' is in news_sentiment_results and opinion_sentiment_results
assert 'Date' in news_sentiment_results.columns
assert 'Date' in opinion_sentiment_results.columns

# Set 'Date' column as index
news_sentiment_results.set_index('Date', inplace=True)
opinion_sentiment_results.set_index('Date', inplace=True)

# Draw a heatmap for news sentiment analysis results
plt.figure(figsize=(10, 5))
plt.title('News Sentiment Analysis Results')
sns.heatmap(news_sentiment_results.tail(5), annot=True, cmap='YlGnBu', fmt=".3f")
plt.show()

# Draw a heatmap for opinion sentiment analysis results
plt.figure(figsize=(10, 5))
plt.title('Opinion Sentiment Analysis Results')
sns.heatmap(opinion_sentiment_results.tail(5), annot=True, cmap='YlGnBu', fmt=".3f")
plt.show()


In [None]:
# 选择特征和目标
# 保留原本的index，将'Date'列单独提取出来保存
date = daily_data_merged['Date']
date = pd.to_datetime(date)

features = daily_data_merged.drop(['Date'], axis=1)
# Open作为预测目标
target = daily_data_merged['Open']
features.tail()

In [None]:
features.head()

In [None]:
target.tail()

In [None]:
# normalization
from sklearn.preprocessing import MinMaxScaler

# Apply the MinMaxScaler to the features and target
scaler_features = MinMaxScaler()
scaler_target = MinMaxScaler()

# fit_transform根据数据计算缩放参数
scaled_features = scaler_features.fit_transform(features)
scaled_target = scaler_target.fit_transform(target.values.reshape(-1, 1))

# 保存缩放参数
import joblib
joblib.dump(scaler_features, './model/scaler_features.pkl')
joblib.dump(scaler_target, './model/scaler_target.pkl')

# Create new DataFrames with the scaled features and target
scaled_features_df = pd.DataFrame(scaled_features, columns=features.columns)
scaled_target_df = pd.DataFrame(scaled_target, columns=['Open'])

In [None]:
scaled_features.shape

In [None]:
scaled_target.shape

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import matplotlib.dates as mdates

In [None]:
def create_sequences(features, targets, seq_length):
    """
    Create sequences of specified length from time series data.

    Args:
    features (np.array): The feature data.
    targets (np.array): The target data.
    seq_length (int): The length of the sequence.

    Returns:
    np.array: Sequences of features.
    np.array: Corresponding targets for each sequence.
    """
    xs, ys = [], []
    for i in range(len(features) - seq_length):
        x = features[i:(i + seq_length)]
        y = targets[i + seq_length]
        xs.append(x)
        ys.append(y)
    return np.array(xs), np.array(ys)

In [None]:
# sequence length
seq_length = 25

# Create sequences
features_seq, target_seq = create_sequences(scaled_features, scaled_target, seq_length)


In [None]:
train_features, test_features, train_target, test_target = train_test_split(
    features_seq, target_seq, test_size=0.2, random_state=42
)

val_features, test_features, val_target, test_target = train_test_split(
    test_features, test_target, test_size=0.5, random_state=42
)

- 准备训练

In [None]:
# Convert sequences to Tensor
train_features = torch.tensor(train_features, dtype=torch.float32)
train_target = torch.tensor(train_target, dtype=torch.float32)

val_features = torch.tensor(val_features, dtype=torch.float32)
val_target = torch.tensor(val_target, dtype=torch.float32)

test_features = torch.tensor(test_features, dtype=torch.float32)
test_target = torch.tensor(test_target, dtype=torch.float32)

# 创建TensorDataset
train_dataset = TensorDataset(train_features, train_target)
val_dataset = TensorDataset(val_features, val_target)
test_dataset = TensorDataset(test_features, test_target)

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
# 使用普通的LSTM模型，不使用注意力机制
class SimpleLSTM(nn.Module):
    def __init__(self, input_dim, hidden_size, num_layers, output_dim, dropout=0.2):
        super(SimpleLSTM, self).__init__()
        self.hidden_size = hidden_size

        # LSTM层
        self.lstm = nn.LSTM(input_dim, hidden_size, num_layers, 
                            batch_first=True, dropout=dropout if num_layers > 1 else 0)
        
        # 全连接层
        self.fc = nn.Linear(hidden_size, output_dim)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        # 打印lstm_out的形状
        # print(lstm_out.shape)
        # 取最后一个时间步的输出
        output = self.fc(lstm_out[:, -1, :])
        return output

In [None]:
# # 使用注意力机制的LSTM
# class Attention(nn.Module):
#     def __init__(self, hidden_size):
#         super(Attention, self).__init__()
#         self.hidden_size = hidden_size
#         self.attn = nn.Linear(self.hidden_size, 1)

#     def forward(self, hidden, encoder_outputs):
#         attn_weights = torch.tanh(self.attn(encoder_outputs))
#         return torch.bmm(attn_weights.transpose(1, 2), encoder_outputs).squeeze(1)

# class AttentionLSTM(nn.Module):
#     def __init__(self, input_dim, hidden_size, num_layers, output_dim, dropout=0.2):
#         super(AttentionLSTM, self).__init__()
#         self.hidden_size = hidden_size

#         # LSTM层
#         self.lstm = nn.LSTM(input_dim, hidden_size, num_layers, 
#                             batch_first=True, dropout=dropout if num_layers > 1 else 0)
        
#         # 注意力层
#         self.attention = Attention(hidden_size)
        
#         # 全连接层
#         self.fc = nn.Linear(hidden_size, output_dim)

#     def forward(self, x):
#         lstm_out, _ = self.lstm(x)
#         attn_out = self.attention(lstm_out[:, -1, :], lstm_out)
#         output = self.fc(attn_out)
#         return output

In [None]:
# 超参数
input_dim = scaled_features_df.shape[1]  # 特征数量
hidden_size = 100  # 隐藏状态中的特征数量，可以调整
num_layers = 4    # 堆叠的LSTM层的数量
output_dim = 1    # 输出维度的数量（预测一个值）

# 使用SimpleLSTM
model = SimpleLSTM(input_dim, hidden_size, num_layers, output_dim, dropout=0.2)
# 使用AttentionLSTM
# model = AttentionLSTM(input_dim, hidden_size, num_layers, output_dim, dropout=0.2)

# 定义损失函数和优化器
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# 训练模型
num_epochs = 50
best_val_loss = float('inf')
train_loss_list = []
val_loss_list = []
for epoch in range(num_epochs):
    model.train()
    train_losses = []
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())

    model.eval()
    val_losses = []
    with torch.no_grad():
        for inputs, targets in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            val_losses.append(loss.item())

    train_loss = np.mean(train_losses)
    val_loss = np.mean(val_losses)

    # 保存最佳模型
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), './model/best_model.pth')
        print('best_model updated at epoch {}, best_val_loss : {:.4f}'.format(epoch+1, best_val_loss))
        
    # 每5轮打印一次train loss和val loss
    if epoch % 5 == 0:
        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')

    # 记录两个loss
    train_loss_list.append(train_loss)
    val_loss_list.append(val_loss)
    
# 在所有epochs结束后绘制损失图
plt.figure(figsize=(10, 6))
plt.plot(range(num_epochs), train_loss_list, label='Train Loss', color='blue')
plt.plot(range(num_epochs), val_loss_list, label='Validation Loss', color='red')
plt.title('Train Loss and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()


In [None]:
# 加载最佳模型
model.load_state_dict(torch.load('./model/best_model.pth'))

# 计算新的测试集的大小
test_size_new = int(len(features_seq) * 0.05)

# 按时间顺序划分新的测试集
test_features_new, test_target_new = features_seq[-test_size_new:], target_seq[-test_size_new:]

# 使用模型进行预测
model.eval()
with torch.no_grad():
    test_predictions_new = model(torch.tensor(test_features_new, dtype=torch.float32)).numpy()

# 反缩放预测值
test_predictions_new = scaler_target.inverse_transform(test_predictions_new).flatten()

# 反缩放真实目标值
test_target_new = scaler_target.inverse_transform(test_target_new.reshape(-1, 1)).flatten()

# 计算日期的总长度
total_length = len(date)

# 计算测试集的开始位置
test_start = total_length - test_size_new

# 计算新的测试集的结束位置
test_end = total_length

# 获取新的测试集的日期范围
test_date_new = date[test_start:test_end]

# Print the date range of the new test set
print("The date range of the new test set is from", test_date_new.iloc[0], "to", test_date_new.iloc[-1])

# Print the length of the new test set
print("The length is", len(test_target_new))

# 绘制实际股价和预测股价的对比图
plt.figure(figsize=(10, 6))
plt.plot(date[test_start:], test_target_new, label='Actual Prices', color='blue')
plt.plot(date[test_start:], test_predictions_new, label='Predicted Prices', color='red')

# 设置x轴的日期格式
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
plt.gca().xaxis.set_major_locator(mdates.DayLocator(interval=10))  # 设置日期间隔

plt.title('Predicted vs Actual Prices')
plt.xlabel('Date')
plt.ylabel('Price')
plt.legend()
plt.gcf().autofmt_xdate()  # 自动调整x轴日期标签的角度以提高可读性
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error

# 计算MSE
mse = mean_squared_error(test_target_new, test_predictions_new)
print('Test MSE: ', mse)
# 计算RMSE
rmse = np.sqrt(mse)
print('Test RMSE: ', rmse)

- 50个epoch RMSE测试

In [None]:
import pandas as pd

# 创建一个数据框来显示序列长度和对应的RMSE
seq_rmse_df = pd.DataFrame({
    'Seq_length': [10, 15, 20, 25, 30, 35, 40, 45, 50],
    'RMSE': [1.1130488933158105, 0.8456300018583505, 0.8641183126757913, 0.8094936259619585, 0.9580832258849209, 0.8409732222581049, 0.9262937021912081, 1.1064503987204368, 0.9005207820679706]
})

# 将RMSE保留两位小数
seq_rmse_df['RMSE'] = seq_rmse_df['RMSE'].round(3)
seq_rmse_df = seq_rmse_df.reset_index(drop=True)

import matplotlib.cm
from plottable import ColumnDefinition, Table

# 创建列定义
seq_length_col_def = ColumnDefinition('Seq_length', title='sequence length')
rmse_col_def = ColumnDefinition('RMSE', title='RMSE', cmap=matplotlib.cm.get_cmap('viridis'))

# 创建表格
table = Table(seq_rmse_df, column_definitions=[seq_length_col_def, rmse_col_def])

In [None]:
seq_lengths = [10, 15, 20, 25, 30, 35, 40, 45, 50]
rmse_results = [1.1130488933158105, 0.8456300018583505, 0.8641183126757913, 0.8094936259619585, 0.9580832258849209, 0.8409732222581049, 0.9262937021912081, 1.1064503987204368, 0.9005207820679706]

# 绘制RMSE结果图
plt.figure(figsize=(10, 6))
plt.plot(seq_lengths, rmse_results, label='RMSE Results', color='blue')

# 标出最小RMSE点
min_rmse_index = np.argmin(rmse_results)
plt.plot(seq_lengths[min_rmse_index], rmse_results[min_rmse_index], 'ro')
plt.text(seq_lengths[min_rmse_index], rmse_results[min_rmse_index], f'Min RMSE: {rmse_results[min_rmse_index]:.3f}', fontsize=12, ha='right')

plt.title('RMSE Results for Different Sequence Lengths')
plt.xlabel('Sequence Length')
plt.ylabel('RMSE')
plt.legend()
plt.show()


In [None]:
import torch
print(torch.__version__)

In [None]:
data = {
    'Method': ['Historical transaction information', 'FinBERT mixed', 'VADER mixed'],
    'RMSE': [0.9793, 0.7341, 0.8601]
}
df = pd.DataFrame(data)

# 设置表格样式
sns.set_theme(style="whitegrid")

# 创建一个新的figure
plt.figure(figsize=(10, 6))

# 使用seaborn的barplot函数绘制条形图
sns.barplot(x='Method', y='RMSE', data=df, palette='viridis')

# 设置标题和坐标轴标签
plt.title('RMSE Results for Different Methods')
plt.xlabel('Method')
plt.ylabel('RMSE')

# 显示图形
plt.show()