In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
# Load csv
df = pd.read_csv('preprocessed_CAC40.csv', parse_dates=['Date'])

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,Name,Date,Open,Closing_Price,Daily_High,Daily_Low,Volume
0,0,Accor,2020-04-03,22.99,23.4,23.4,22.99,67
1,1,Accor,2020-04-02,23.91,22.99,23.91,22.99,250
2,2,Accor,2020-04-01,24.1,23.83,24.1,23.83,37
3,3,Accor,2020-03-31,25.04,25.0,25.24,24.99,336
4,4,Accor,2020-03-30,26.5,25.02,26.5,24.99,415


In [5]:
# Drop unnecessary columns
df1 = df.drop(['Unnamed: 0', 'Name'], axis=1)
df1.head()

Unnamed: 0,Date,Open,Closing_Price,Daily_High,Daily_Low,Volume
0,2020-04-03,22.99,23.4,23.4,22.99,67
1,2020-04-02,23.91,22.99,23.91,22.99,250
2,2020-04-01,24.1,23.83,24.1,23.83,37
3,2020-03-31,25.04,25.0,25.24,24.99,336
4,2020-03-30,26.5,25.02,26.5,24.99,415


In [6]:
# Convert date to datetime
df1['Date'] = pd.to_datetime(df1['Date'])

In [7]:
# Set date as index
df1 = df1.set_index('Date')

In [8]:
df1.isna().sum()

Open               230
Closing_Price        2
Daily_High         204
Daily_Low          204
Volume           20453
dtype: int64

In [10]:
from sklearn.impute import SimpleImputer

# Impute missing values with mean
imputer = SimpleImputer(strategy='mean')
df1[['Open', 'Closing_Price', 'Daily_High', 'Daily_Low']] = imputer.fit_transform(df1[['Open', 'Closing_Price', 'Daily_High', 'Daily_Low']])

In [11]:
import numpy as np

df1['Volume'] = df1['Volume'].apply(lambda x: np.nan if not isinstance(x, (int, float)) else x)

In [12]:
imputer = SimpleImputer(strategy='most_frequent')
df1['Volume'] = imputer.fit_transform(df1['Volume'].values.reshape(-1, 1))



In [13]:
from sklearn.preprocessing import MinMaxScaler
# Scale data
scaler = MinMaxScaler()
df1['Closing_Price'] = scaler.fit_transform(df1['Closing_Price'].values.reshape(-1, 1))

In [14]:
# Create features and labels
X = []
y = []
window_size = 30  # Lookback window size

In [15]:
for i in range(window_size, len(df1) - 3):
    X.append(df1['Closing_Price'].values[i - window_size:i])
    y.append(df1['Closing_Price'].values[i:i + 3])

In [16]:
import numpy as np
X = np.array(X)
y = np.array(y)

In [17]:
# Split data into train and test sets
split = int(0.8 * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

In [18]:
# Reshape data for LSTM
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
# Build and train LSTM model
model = Sequential()
model.add(LSTM(64, input_shape=(window_size, 1)))
model.add(Dense(3))  # Output shape: (None, 3) for 3-day prediction
model.compile(optimizer='adam', loss='mse')
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/100


  super().__init__(**kwargs)


[1m2441/2441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 11ms/step - loss: 5.3379e-04 - val_loss: 1.4993e-05
Epoch 2/100
[1m2441/2441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 11ms/step - loss: 6.5365e-05 - val_loss: 1.2456e-05
Epoch 3/100
[1m2441/2441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 11ms/step - loss: 8.7444e-05 - val_loss: 1.2791e-05
Epoch 4/100
[1m2441/2441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 11ms/step - loss: 5.2087e-05 - val_loss: 1.0042e-05
Epoch 5/100
[1m2441/2441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 11ms/step - loss: 5.9408e-05 - val_loss: 1.8872e-05
Epoch 6/100
[1m2441/2441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 11ms/step - loss: 4.5357e-05 - val_loss: 1.1298e-05
Epoch 7/100
[1m2441/2441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 11ms/step - loss: 3.1619e-05 - val_loss: 1.1383e-05
Epoch 8/100
[1m2441/2441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 

<keras.src.callbacks.history.History at 0x22287ea64e0>

In [21]:
# Make predictions
predictions = model.predict(X_test)
predictions = scaler.inverse_transform(predictions)

[1m611/611[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step


In [22]:
predictions

array([[101.32367  , 101.22325  , 101.55544  ],
       [100.30066  , 100.21634  , 100.560974 ],
       [ 98.45176  ,  98.33214  ,  98.66827  ],
       ...,
       [ 15.859193 ,  15.853813 ,  15.760165 ],
       [ 15.8467045,  15.834369 ,  15.737604 ],
       [ 15.857397 ,  15.843631 ,  15.7476225]], dtype=float32)

In [23]:
from sklearn.metrics import mean_squared_error
# Get the actual values from y_test
y_test = scaler.inverse_transform(y_test)

# Calculate the RMSE
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f"RMSE: {rmse}")

RMSE: 2.2754994435350335


In [24]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

mae = mean_absolute_error(y_test, predictions)
mape = mean_absolute_percentage_error(y_test, predictions)
print(f"MAE: {mae}")
print(f"MAPE: {mape}")

MAE: 0.9947485477818668
MAPE: 0.02742461649811966
