# 1. Install Required Libraries
Ensure you have the necessary libraries installed, including gensim, spacy, torch, and scikit-learn.

In [1]:
!pip install --upgrade pip
!pip install gensim spacy torch scikit-learn
!pip install kaggle

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting gensim
  Using cached gensim-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.2 kB)
Collecting spacy
  Using cached spacy-3.8.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting torch
  Using cached torch-2.7.1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.7.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Using cached numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Using cached scipy-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Using cached smart_open-7

# 2. Load and Preprocess the Dataset

- Download the stock market dataset.

In [2]:
# Kaggle Authentification
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Downloading the stock market dataset
!kaggle datasets download -d jacksoncrow/stock-market-dataset

# Unzip
# !unzip -q stock-market-dataset.zip -d stock_data
!unzip -o -q stock-market-dataset.zip -d stock_data # -o force l'ecrasement


Dataset URL: https://www.kaggle.com/datasets/jacksoncrow/stock-market-dataset
License(s): CC0-1.0
stock-market-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)


- Drop unnecessary columns and create a target column for the next day’s closing price.

To begin the analysis, we chose to work with a single CSV file from the dataset: AAPL.csv, which contains historical stock prices for Apple Inc.

In [7]:
!pip install pandas

Defaulting to user installation because normal site-packages is not writeable
Collecting pandas
  Using cached pandas-2.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
Using cached tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: tzdata, pandas
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [pandas]2m1/2[0m [pandas]
[1A[2KSuccessfully installed pandas-2.3.1 tzdata-2025.2


In [24]:
import pandas as pd

df = pd.read_csv("stock_data/stocks/AAPL.csv")
print(df.head())
print(df.info())

         Date      Open      High       Low     Close  Adj Close     Volume
0  1980-12-12  0.513393  0.515625  0.513393  0.513393   0.406782  117258400
1  1980-12-15  0.488839  0.488839  0.486607  0.486607   0.385558   43971200
2  1980-12-16  0.453125  0.453125  0.450893  0.450893   0.357260   26432000
3  1980-12-17  0.462054  0.464286  0.462054  0.462054   0.366103   21610400
4  1980-12-18  0.475446  0.477679  0.475446  0.475446   0.376715   18362400
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9909 entries, 0 to 9908
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       9909 non-null   object 
 1   Open       9909 non-null   float64
 2   High       9909 non-null   float64
 3   Low        9909 non-null   float64
 4   Close      9909 non-null   float64
 5   Adj Close  9909 non-null   float64
 6   Volume     9909 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 542.0+ KB
None


In [25]:
## Drop unnecessary columns
df = df.drop(columns=['Adj Close'])  # often close to 'Close', so not necessary

# Convert 'Date' to datetime 
df['Date'] = pd.to_datetime(df['Date'])

# Class the DataFrame by date 
df = df.sort_values('Date').reset_index(drop=True)

## Create a target column for the next day’s closing price
df['Target'] = df['Close'].shift(-1)

# Drop last line (NaN) 
df = df.dropna().reset_index(drop=True)

# Verification
print(df[['Date', 'Close', 'Target']].head())


        Date     Close    Target
0 1980-12-12  0.513393  0.486607
1 1980-12-15  0.486607  0.450893
2 1980-12-16  0.450893  0.462054
3 1980-12-17  0.462054  0.475446
4 1980-12-18  0.475446  0.504464


- Normalize the dataset using MinMaxScaler.

In [26]:
from sklearn.preprocessing import MinMaxScaler

# Select the columns
feature_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
target_col = 'Target'

# Initialize the scaler
scaler = MinMaxScaler()

# Apply the scaler on the features
df_scaled = df.copy()
df_scaled[feature_cols] = scaler.fit_transform(df[feature_cols])

# Print a part
print(df_scaled[feature_cols + [target_col]].head())


       Open      High       Low     Close    Volume    Target
0  0.000970  0.000967  0.000981  0.000969  0.063023  0.486607
1  0.000894  0.000886  0.000898  0.000887  0.023516  0.450893
2  0.000784  0.000777  0.000787  0.000778  0.014061  0.462054
3  0.000812  0.000811  0.000822  0.000812  0.011462  0.475446
4  0.000853  0.000852  0.000863  0.000853  0.009711  0.504464


# 3. Prepare the Dataset for Training
- Split the dataset into training, validation, and testing sets.

In [27]:
# Total size dataset
n = len(df_scaled)

# Percentage of split
train_size = int(n * 0.7)
val_size = int(n * 0.15)
test_size = n - train_size - val_size  # the rest

# Split of the data
train_df = df_scaled.iloc[:train_size]
val_df = df_scaled.iloc[train_size:train_size + val_size]
test_df = df_scaled.iloc[train_size + val_size:]

# Verification
print("Train size :", len(train_df))
print("Validation size :", len(val_df))
print("Test size :", len(test_df))


Train size : 6935
Validation size : 1486
Test size : 1487


- Create a custom PyTorch Dataset class to handle the data.

In [28]:
import torch
from torch.utils.data import Dataset

class StockDataset(Dataset):
    def __init__(self, data, feature_cols, target_col, sequence_length=30):
        self.sequence_length = sequence_length
        self.features = data[feature_cols].values.astype('float32')
        self.targets = data[target_col].values.astype('float32')

    def __len__(self):
        # nombre of possible sequence
        return len(self.features) - self.sequence_length

    def __getitem__(self, idx):
        # X sequence : 30 days from idx
        X = self.features[idx:idx + self.sequence_length]

        # y = target right after the sequence
        y = self.targets[idx + self.sequence_length]

        return torch.tensor(X), torch.tensor(y)


In [29]:
feature_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
target_col = 'Target'

train_dataset = StockDataset(train_df, feature_cols, target_col, sequence_length=30)
val_dataset = StockDataset(val_df, feature_cols, target_col, sequence_length=30)
test_dataset = StockDataset(test_df, feature_cols, target_col, sequence_length=30)

# Example
x, y = train_dataset[0]
print("X shape:", x.shape) 
print("y:", y)


X shape: torch.Size([30, 5])
y: tensor(0.5536)


In [30]:
from torch.utils.data import DataLoader

# Parameters
batch_size = 64

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [31]:
# Tester un batch
for x_batch, y_batch in train_loader:
    print("X batch shape:", x_batch.shape)  # (batch_size, sequence_length, nb_features)
    print("y batch shape:", y_batch.shape)  # (batch_size,)
    break  # we display the first batch


X batch shape: torch.Size([64, 30, 5])
y batch shape: torch.Size([64])


# 4. Define the LSTM Model
- Create an LSTM model using PyTorch.
- Define the model architecture, including GRU layers, dropout, and a dense layer.

In [32]:
import torch.nn as nn

class LSTMModel(nn.Module):
    def __init__(self, input_size=5, hidden_size=64, num_layers=2, dropout=0.2):
        super(LSTMModel, self).__init__()
        
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # LSTM layer
        self.lstm = nn.LSTM(input_size=input_size, 
                            hidden_size=hidden_size, 
                            num_layers=num_layers, 
                            batch_first=True,
                            dropout=dropout)

        # Fully connected output layer
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        # x: (batch_size, sequence_length, input_size)
        batch_size = x.size(0)

        # Initial hidden & cell state (num_layers, batch_size, hidden_size)
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(x.device)

        # LSTM output
        out, _ = self.lstm(x, (h0, c0))  # out: (batch, seq_len, hidden)

        # On prend uniquement la dernière sortie de la séquence (t = -1)
        out = out[:, -1, :]  # (batch_size, hidden_size)

        # Prediction
        out = self.fc(out)  # (batch_size, 1)
        return out.squeeze()  # (batch_size,)


In [33]:
model = LSTMModel(input_size=5, hidden_size=64, num_layers=2)
print(model)


LSTMModel(
  (lstm): LSTM(5, 64, num_layers=2, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=64, out_features=1, bias=True)
)


# 5. Train the Model
- Set up the optimizer and loss function.


In [34]:
import torch.optim as optim
import torch.nn as nn

# Loss function: regression → Mean Squared Error
criterion = nn.MSELoss()

# Optimizer: Adam is good default
optimizer = optim.Adam(model.parameters(), lr=0.001)


- Implement training and validation loops.


In [35]:
import torch

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


In [36]:
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=20):
    for epoch in range(epochs):
        model.train()
        train_loss = 0

        for X_batch, y_batch in train_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            train_loss += loss.item() * X_batch.size(0)

        train_loss /= len(train_loader.dataset)

        # Validation phase
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch = X_batch.to(device)
                y_batch = y_batch.to(device)

                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                val_loss += loss.item() * X_batch.size(0)

        val_loss /= len(val_loader.dataset)

        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")


- Train the model for a specified number of epochs.

In [37]:
train_model(model, train_loader, val_loader, criterion, optimizer, epochs=20)


Epoch 1/20 | Train Loss: 21.9369 | Val Loss: 2875.3448
Epoch 2/20 | Train Loss: 19.8791 | Val Loss: 2691.6588
Epoch 3/20 | Train Loss: 8.1401 | Val Loss: 2009.0984
Epoch 4/20 | Train Loss: 2.6983 | Val Loss: 1725.0664
Epoch 5/20 | Train Loss: 1.2422 | Val Loss: 1567.7651
Epoch 6/20 | Train Loss: 0.7682 | Val Loss: 1470.4648
Epoch 7/20 | Train Loss: 0.5811 | Val Loss: 1401.2042
Epoch 8/20 | Train Loss: 0.4781 | Val Loss: 1345.5473
Epoch 9/20 | Train Loss: 0.4774 | Val Loss: 1305.1665
Epoch 10/20 | Train Loss: 0.3712 | Val Loss: 1271.9547
Epoch 11/20 | Train Loss: 0.2934 | Val Loss: 1250.5474
Epoch 12/20 | Train Loss: 0.2817 | Val Loss: 1227.9176
Epoch 13/20 | Train Loss: 0.2803 | Val Loss: 1214.7775
Epoch 14/20 | Train Loss: 0.2962 | Val Loss: 1233.8938
Epoch 15/20 | Train Loss: 0.3183 | Val Loss: 1237.3802
Epoch 16/20 | Train Loss: 0.1944 | Val Loss: 1189.5087
Epoch 17/20 | Train Loss: 0.2122 | Val Loss: 1212.5697
Epoch 18/20 | Train Loss: 0.2243 | Val Loss: 1169.4932
Epoch 19/20 | Tra

# 6. Evaluate the Model
- Calculate the R² score to evaluate the model’s performance on the test set.
- Save the scaler object for future predictions.

In [38]:
from sklearn.metrics import r2_score

def evaluate_model(model, test_loader, scaler_target=None):
    model.eval()
    predictions = []
    true_values = []

    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            outputs = model(X_batch)
            predictions.extend(outputs.cpu().numpy())
            true_values.extend(y_batch.cpu().numpy())

    # Optional: inverse transform if target was scaled
    if scaler_target:
        predictions = scaler_target.inverse_transform([[p] for p in predictions])
        true_values = scaler_target.inverse_transform([[t] for t in true_values])

    # Flatten
    predictions = [p[0] if isinstance(p, list) else p for p in predictions]
    true_values = [t[0] if isinstance(t, list) else t for t in true_values]

    r2 = r2_score(true_values, predictions)
    print(f"Test R² Score: {r2:.4f}")
    return predictions, true_values


In [39]:
predictions, true_values = evaluate_model(model, test_loader, scaler_target=None)


Test R² Score: -5.9656


In [40]:
import joblib

# We normalized the features
joblib.dump(scaler, "feature_scaler.pkl")

['feature_scaler.pkl']

In [41]:
scaler = joblib.load("feature_scaler.pkl")