In [185]:
import pandas as pd
import numpy as np
import random
import torch
import os
def custom_set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
custom_set_seed(42)
df = pd.read_csv("combined_dataset.csv")
#df = df.drop(columns = "Unnamed: 0")
df = df.iloc[58:]
df['Date'] = pd.to_datetime(df['Unnamed: 0']).dt.date
df['Time'] = pd.to_datetime(df['Unnamed: 0']).dt.time
df = df.drop(columns = "Unnamed: 0")
df = df[df['Date'] != pd.to_datetime('2023-04-26').date()]
data_dict = {}
for date, group in df.groupby('Date'):
    data_dict[date] = group.drop(columns=['Date', 'Time']).to_numpy()

# Create a list to store the unique times
unique_times = df['Time'].unique()

# Create a 3D NumPy array
num_dates = len(data_dict)
list_copy = list(data_dict.keys())[:]
random.shuffle(list_copy)

train_dates = list_copy[:192]
train_dates.sort()

test_dates = list_copy[192:]
test_dates.sort()

train_dict={}
for date in train_dates:
    train_dict[date] = data_dict[date]
test_dict={}
for date in test_dates:
    test_dict[date] = data_dict[date]

num_times = len(unique_times)

num_features = len(df.columns) - 2  # Subtract 2 for Date and Time columns

train_set = np.zeros((len(train_dates), num_times, num_features))

date_index_mapping = {date: i for i, date in enumerate(train_dict.keys())}
time_index_mapping = {time: j for j, time in enumerate(unique_times)}

for date, time_values in train_dict.items():
        for time, values in zip(df[df['Date'] == date]['Time'], time_values):
            i = date_index_mapping[date]
            j = time_index_mapping[time]
            train_set[i, j, :] = values

test_set = np.zeros((len(test_dates), num_times, num_features))

date_index_mapping = {date: i for i, date in enumerate(test_dict.keys())}
time_index_mapping = {time: j for j, time in enumerate(unique_times)}

for date, time_values in test_dict.items():
        for time, values in zip(df[df['Date'] == date]['Time'], time_values):
            i = date_index_mapping[date]
            j = time_index_mapping[time]
            test_set[i, j, :] = values



In [186]:
import pandas as pd
daily = pd.read_csv("daily_dataset.csv")
daily['SENTIMENT_Neg'] = daily['SENTIMENT_Neg'].fillna(0)
daily['SENTIMENT_Neu'] = daily['SENTIMENT_Neu'].fillna(1)
daily['SENTIMENT_Pos'] = daily['SENTIMENT_Pos'].fillna(0)
daily = daily.iloc[1:]
daily['Date'] = pd.to_datetime(daily['Unnamed: 0']).dt.date
daily = daily.drop(columns = "Unnamed: 0")
train_daily = daily[daily['Date'].isin(train_dates)].drop(columns = "Date")
test_daily = daily[~daily['Date'].isin(train_dates)].drop(columns = "Date")


In [187]:
import pandas as pd
y = pd.read_csv("target_dataset.csv")
y['Date'] = pd.to_datetime(y['Unnamed: 0']).dt.date
y = y.drop(columns = "Unnamed: 0")
y_train = y[y['Date'].isin(train_dates)].drop(columns = "Date")
y_test = y[y['Date'].isin(test_dates)].drop(columns = "Date")
print(len(y_train))
print(len(y_test))

192
48


In [188]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Concatenate, MultiHeadAttention, LayerNormalization, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler

main_input = Input(shape=(79, 112))  
lstm_out = LSTM(64, return_sequences=True)(main_input)  # Using return_sequences=True for attention


attention_out = MultiHeadAttention(num_heads=4, key_dim=64)(lstm_out, lstm_out)


attention_out = LayerNormalization()(attention_out)
attention_out = Dropout(0.05)(attention_out)


attention_lstm_out = LSTM(32, return_sequences=False)(attention_out)


additional_input = Input(shape=(1, 13))  
additional_lstm_out = LSTM(32, return_sequences=False)(additional_input)


merged = Concatenate()([attention_lstm_out, additional_lstm_out])


dense_out = Dense(64, activation='relu')(merged)
output = Dense(1, activation='linear')(dense_out)  


model = Model(inputs=[main_input, additional_input], outputs=output)


model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')
train_daily_array = train_daily.to_numpy().reshape(192, 1, 13)
train_set = train_set.astype('float32')
train_daily_array = train_daily_array.astype('float32')
y_train = y_train.astype('float32')
scaler1 = StandardScaler()
scaler2 = StandardScaler()

train_set_scaled = scaler1.fit_transform(train_set.reshape(-1, train_set.shape[-1])).reshape(train_set.shape)
train_daily_scaled = scaler2.fit_transform(train_daily_array.reshape(-1, train_daily_array.shape[-1])).reshape(train_daily_array.shape)
# Train the model
model.fit([train_set, train_daily_array], y_train, epochs=1000, batch_size=16)




Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<keras.src.callbacks.History at 0x33018eda0>

In [191]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Reshape and convert test data types
test_daily_array = test_daily.to_numpy().reshape(48, 1, 13)
test_set_reshaped = test_set.reshape(-1, test_set.shape[-1])
test_daily_reshaped = test_daily_array.reshape(-1, test_daily_array.shape[-1])

# Apply the same scaler used on the training data
test_set_scaled = scaler1.transform(test_set_reshaped).reshape(test_set.shape)
test_daily_scaled = scaler2.transform(test_daily_reshaped).reshape(test_daily_array.shape)

# Convert back to original shape if necessary
test_daily_array_scaled = test_daily_scaled.reshape(48, 1, 13)
test_set_scaled = test_set_scaled.astype('float32')
test_daily_array_scaled = test_daily_array_scaled.astype('float32')
y_test = y_test.astype('float32')

# Make predictions with the model
predictions = model.predict([test_set_scaled, test_daily_array_scaled])

# Calculate metrics
mse = mean_squared_error(y_test, predictions)
r_squared = r2_score(y_test, predictions)

print("R-squared:", r_squared)
print("Mean Squared Error:", mse)

R-squared: -1358362.6981128608
Mean Squared Error: 0.0879561
