In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.preprocessing import MinMaxScaler
# Separate scaler for the target variable
target_scaler = MinMaxScaler()
feature_scaler = MinMaxScaler()

In [3]:
df = pd.read_csv("cleaned.csv")

In [4]:
date = df['Unnamed: 0']

In [5]:
date

0       1999-01-22
1       1999-01-23
2       1999-01-24
3       1999-01-25
4       1999-01-26
           ...    
9632    2025-06-06
9633    2025-06-07
9634    2025-06-08
9635    2025-06-09
9636    2025-06-10
Name: Unnamed: 0, Length: 9637, dtype: object

In [6]:
df = pd.read_csv("final_data.csv")

In [7]:
# Create lagged features for Close_diff_NVDA

lags_to_add = [1, 2, 3]  # Specify the lags to include

for lag in lags_to_add:
    df[f'Lag_Close_diff_{lag}'] = df['Close_NVDA'].shift(lag)

In [8]:
df = df.bfill()

In [9]:
df['Close_NVDA'] = target_scaler.fit_transform(df[['Close_NVDA']])  # Target variable
df[df.columns.difference(['Close_NVDA'])] = feature_scaler.fit_transform(df[df.columns.difference(['Close_NVDA'])])

In [10]:
df

Unnamed: 0.1,Unnamed: 0,Open_NVDA,Lag_Close_1,MA10_,EMA10_,MA50_,Lag_Close_3,Close_NVDA,Lag_Close_diff_1,Lag_Close_diff_2,Lag_Close_diff_3
0,0.000000,0.000053,0.000042,0.000034,0.000034,0.000039,0.000042,0.000042,0.000042,0.000042,0.000042
1,0.000104,0.000056,0.000042,0.000034,0.000039,0.000039,0.000042,0.000069,0.000042,0.000042,0.000042
2,0.000208,0.000056,0.000069,0.000034,0.000039,0.000039,0.000042,0.000069,0.000069,0.000042,0.000042
3,0.000311,0.000056,0.000069,0.000034,0.000039,0.000039,0.000042,0.000069,0.000069,0.000069,0.000042
4,0.000415,0.000066,0.000069,0.000034,0.000039,0.000039,0.000069,0.000047,0.000069,0.000069,0.000069
...,...,...,...,...,...,...,...,...,...,...,...
9632,0.999585,0.931327,0.936900,0.945295,0.953488,0.845836,0.945133,0.948480,0.936900,0.949819,0.945133
9633,0.999689,0.935772,0.948480,0.953075,0.959206,0.850295,0.949819,0.954572,0.948480,0.936900,0.949819
9634,0.999792,0.935772,0.954572,0.953075,0.959206,0.850295,0.936900,0.954572,0.954572,0.948480,0.936900
9635,0.999896,0.935772,0.954572,0.953075,0.959206,0.850295,0.948480,0.954572,0.954572,0.954572,0.948480


In [11]:
# Ensure the target column is defined
target_column = 'Close_NVDA'
sequence_length = 10

# Get the index of the target column
target_index = df.columns.get_loc(target_column)

In [12]:
# Prepare sequences
X, y = [], []

for i in range(len(df) - sequence_length):
    X.append(df.iloc[i:i + sequence_length].values)  # Input sequence
    y.append(df.iloc[i + sequence_length, target_index])  # Target value

# Convert to numpy arrays
X = np.array(X)
y = np.array(y)

In [13]:
X.shape, y.shape

((9627, 10, 11), (9627,))

In [14]:
# Convert the DataFrame to a NumPy array, excluding the index
data_values = df.values

In [15]:
# Initialize as Python lists
X_train, X_val, X_test = [], [], []
y_train, y_val, y_test = [], [], []

# Split data manually into train, validation, and test sets
train_size = int(len(data_values) * 0.7)
val_size = int(len(data_values) * 0.85)

for i in range(len(data_values) - sequence_length):
    if i + sequence_length <= train_size:
        # Add to training data
        X_train.append(data_values[i:i + sequence_length])  # Input sequence
        y_train.append(data_values[i + sequence_length, target_index])  # Target value
    elif i + sequence_length <= val_size:
        # Add to validation data
        X_val.append(data_values[i:i + sequence_length])
        y_val.append(data_values[i + sequence_length, target_index])
    else:
        # Add to test data
        X_test.append(data_values[i:i + sequence_length])
        y_test.append(data_values[i + sequence_length, target_index])

# Convert to NumPy arrays after appending
X_train, X_val, X_test = np.array(X_train), np.array(X_val), np.array(X_test)
y_train, y_val, y_test = np.array(y_train), np.array(y_val), np.array(y_test)

In [16]:
# Calculate the indices for splitting
train_size = int(len(date) * 0.7)
val_size = int(len(date) * 0.85)

# Split the date column into training, validation, and test sets
date_train = date[:train_size]
date_val = date[train_size:val_size]
date_test = date[val_size:]

# Show the splits for the date column
(date_train.head(), date_val.head(), date_test.head())

(0    1999-01-22
 1    1999-01-23
 2    1999-01-24
 3    1999-01-25
 4    1999-01-26
 Name: Unnamed: 0, dtype: object,
 6745    2017-07-11
 6746    2017-07-12
 6747    2017-07-13
 6748    2017-07-14
 6749    2017-07-15
 Name: Unnamed: 0, dtype: object,
 8191    2021-06-26
 8192    2021-06-27
 8193    2021-06-28
 8194    2021-06-29
 8195    2021-06-30
 Name: Unnamed: 0, dtype: object)

In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras import Input
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.optimizers import Adam

In [18]:
# Prepare the data for LSTM
sequence_length = 10
X, y = [], []

In [19]:
def create_lstm_model(units_layer1=64, units_layer2=32, dropout_rate=0.2, learning_rate=0.001):
    model = Sequential([
        Input(shape=(X_train.shape[1], X_train.shape[2])),
        LSTM(64, activation='tanh', input_shape=(X_train.shape[1], X_train.shape[2])),
        Dense(1)  # Single output
    ])
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
    return model

In [20]:
model = create_lstm_model(units_layer1=128, units_layer2=64, dropout_rate=0.2, learning_rate=0.001)
model.summary()

  super().__init__(**kwargs)


In [21]:
model.compile(optimizer='adam', loss='mean_squared_error')

In [22]:
model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=1, validation_data=(X_val, y_val))

Epoch 1/50
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 3.1278e-04 - val_loss: 3.3223e-05
Epoch 2/50
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 5.7496e-07 - val_loss: 1.7162e-05
Epoch 3/50
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 1.9061e-07 - val_loss: 1.2651e-05
Epoch 4/50
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 1.0406e-07 - val_loss: 9.5972e-06
Epoch 5/50
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 7.2768e-08 - val_loss: 8.1654e-06
Epoch 6/50
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 8.9059e-08 - val_loss: 7.1520e-06
Epoch 7/50
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 7.3375e-08 - val_loss: 1.2287e-05
Epoch 8/50
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 1.0561e-07

<keras.src.callbacks.history.History at 0x240373a2ed0>

In [23]:
# Unscale y_test and predictions using the target_scaler
y_test_unscaled = target_scaler.inverse_transform(y_test.reshape(-1, 1))
predictions_unscaled = target_scaler.inverse_transform(model.predict(X_test).reshape(-1, 1))

# Calculate MSE and MAE on unscaled data
mse = mean_squared_error(y_test_unscaled, predictions_unscaled)
mae = mean_absolute_error(y_test_unscaled, predictions_unscaled)

print(f"Unscaled Test MSE: {mse}")
print(f"Unscaled Test MAE: {mae}")

[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Unscaled Test MSE: 66.55397953777101
Unscaled Test MAE: 4.923286034079159
