# Stock Prediction


## Import libraries


In [None]:
import os
import pandas as pd
from pandas import DataFrame
import numpy as np
from tqdm import tqdm

import torch
from torch import Tensor
from torch import nn
from torch import optim
from torch.utils.data import Dataset, DataLoader

from typing import Tuple, List, Optional, Union

## Prepare data


### Load data


In [None]:
dfs = []

# Read every data file in ./data into its own DataFrame.
# os.listdir() returns entries in arbitrary order and also lists hidden or
# non-file entries (e.g. `.ipynb_checkpoints`, `.DS_Store`), so sort the names
# and skip those entries: the order of `dfs` is then reproducible between runs
# and stray entries cannot crash `read_csv`.
for name in sorted(os.listdir("./data/")):
    path = os.path.join("./data", name)
    if name.startswith(".") or not os.path.isfile(path):
        continue

    df = pd.read_csv(path)
    # Parse the timestamp column; the files use month/day/year hour:minute.
    df["Date/Time"] = pd.to_datetime(df["Date/Time"], format="%m/%d/%Y %H:%M")

    dfs.append(df)

    # Quick per-file summary statistics for visual inspection.
    print(name)
    display(df.describe())

Nothing stands out, except that the values in the `Open Interest` column are always 0.


### Preprocess data


**1. Drop `Open Interest` column**

As noted above, the `Open Interest` column carries no useful information for training, so we drop it to save resources.


In [None]:
# Remove the constant `Open Interest` column from every loaded frame.
for i, frame in enumerate(dfs):
    dfs[i] = frame.drop(columns=["Open Interest"])

**2. Missing timestamp**

Some timestamps are missing from the input data. The dataset should contain one record for every minute between 9:15 and 11:29 and between 13:00 and 14:46, from Monday to Friday, but records are often missing, as in the following example:

```csv
Ticker,Date/Time,Open,High,Low,Close,Volume,Open Interest
FPT,12/25/2018 9:15,30.89,30.89,30.89,30.89,35410,0
FPT,12/25/2018 9:16,30.81,30.81,30.81,30.81,190,0
FPT,12/25/2018 9:17,30.74,30.81,30.74,30.74,1120,0
FPT,12/25/2018 9:18,30.74,30.74,30.74,30.74,2120,0
FPT,12/25/2018 9:19,30.74,30.74,30.74,30.74,22500,0
FPT,12/25/2018 9:20,30.74,30.74,30.7,30.74,7140,0
FPT,12/25/2018 9:21,30.66,30.74,30.59,30.66,16480,0
```

Therefore, whatever the reason for the data loss, we assume that the stock market is always open during the time periods mentioned above. We fill the missing timestamps of every existing day (a day that already has some trading data) using forward filling.


In [None]:
def create_minute_range(date):
    """Return every expected trading minute of ``date`` as a DatetimeIndex.

    The index covers the two sessions 09:15-11:29 and 13:00-14:46 (both ends
    inclusive) at one-minute resolution.

    Args:
        date: Anything whose string form is a date that ``pd.date_range`` can
            parse when followed by a time, e.g. ``datetime.date`` or
            ``"2018-12-25"``.
    """
    # (first minute, last minute) of each trading session.
    sessions = (("09:15", "11:29"), ("13:00", "14:46"))
    morning_range, afternoon_range = (
        pd.date_range(start=f"{date} {opens}", end=f"{date} {closes}", freq="1min")
        for opens, closes in sessions
    )
    return morning_range.union(afternoon_range)

In [None]:
def filling_missing_timestamp(df: DataFrame) -> DataFrame:
    """Insert a row for every expected trading minute and fill the gaps.

    For each calendar date present in ``df["Date/Time"]`` the frame is
    re-indexed onto the minutes returned by ``create_minute_range``. Rows that
    did not exist are forward-filled from the previous row (this runs across
    day boundaries); anything still missing at the very start is back-filled.
    Rows whose timestamp is not one of the expected minutes are dropped by the
    re-indexing.

    Unlike the previous implementation, the caller's DataFrame is left
    untouched (no ``inplace`` operations on the argument).

    Args:
        df: Frame with a datetime ``Date/Time`` column.

    Returns:
        A new DataFrame with ``Date/Time`` as its first column and one row per
        expected minute of every date found in ``df``.

    Raises:
        ValueError: Propagated from ``DataFrame.reindex`` when ``Date/Time``
            contains duplicate timestamps.
    """
    # Index by timestamp on a new object instead of mutating the argument.
    indexed = df.set_index("Date/Time").sort_index()

    # Calendar dates that have at least one record.
    unique_dates = pd.Series(indexed.index.date).unique()

    all_minutes = []
    for date in unique_dates:
        all_minutes.extend(create_minute_range(date))

    # Align the data to the complete minute grid; missing minutes become NaN.
    df_filled = indexed.reindex(pd.DatetimeIndex(all_minutes))

    # Forward fill first; bfill only affects leading rows that have no
    # predecessor and is a no-op when nothing is left to fill.
    df_filled = df_filled.ffill().bfill()

    # Name the index explicitly so `reset_index` yields a `Date/Time` column
    # regardless of how the index was labelled after re-indexing.
    return df_filled.rename_axis("Date/Time").reset_index()

In [None]:
# Replace each frame with its gap-filled counterpart.
for i, frame in enumerate(dfs):
    dfs[i] = filling_missing_timestamp(frame)

**3. One-hot encode the `Ticker` column**

Because our goal is to develop a general model that predicts stock price movement for all stocks, we have to convert the `Ticker` column to numerical values before training the model.


In [None]:
def onehot_encode_dataframe(
    df: DataFrame, column_name: Optional[str] = None, categories: Optional[List] = None
) -> DataFrame:
    """One-hot encode ``df`` with ``pd.get_dummies``.

    When ``column_name`` and ``categories`` are given, one temporary row per
    category is appended before encoding so that a dummy column exists for
    every listed category even if it never occurs in ``df``; the temporary
    rows are removed again before returning.

    Args:
        df: Frame to encode. It is not modified.
        column_name: Column whose full category set is known up front.
        categories: All values ``column_name`` may take.

    Returns:
        The encoded frame with the same number of rows as ``df``. When
        temporary rows were appended the result carries a fresh
        ``RangeIndex`` and numeric columns may have been upcast to float.

    Raises:
        ValueError: If only one of ``column_name`` / ``categories`` is given.
    """
    if (column_name and categories is None) or (column_name is None and categories):
        raise ValueError(
            "Either both 'column_name' and 'categories' must be provided, or neither should be provided."
        )

    # Nothing to pad with: plain encoding of a copy of the input.
    if not (column_name and categories):
        return pd.get_dummies(df.copy())

    n_rows = len(df)
    padding = pd.DataFrame({column_name: categories})
    padded = pd.concat([df, padding], axis=0, ignore_index=True)

    encoded = pd.get_dummies(padded)

    # Slice by the original row count rather than `[:-len(categories)]`,
    # which degenerates to `[:0]` (an empty frame) for an empty category list.
    return encoded.iloc[:n_rows]

In [None]:
ticker_list = ["FPT", "MSN", "PNJ", "VIC"]
# Encode against the full ticker list so every frame ends up with the same
# set of `Ticker_*` columns.
for i, frame in enumerate(dfs):
    dfs[i] = onehot_encode_dataframe(
        frame, column_name="Ticker", categories=ticker_list
    )

**4. Other steps**

Finally, we apply a few small preprocessing steps needed later on: dropping the `Date/Time` column and casting all values to `float`.

In [None]:
# Drop the `Date/Time` column, then cast what remains to float32.
for i, frame in enumerate(dfs):
    dfs[i] = frame.drop(columns=["Date/Time"]).astype(np.float32)

### Prepare features, label


In [None]:
def slicing_window(
    df: DataFrame,
    label_name: Union[str, List[str]],
    start_idx: int = 0,
    input_size: int = 30,
    offset: int = 1,
    end_idx: Optional[int] = None,
    label_size: Optional[int] = None,
) -> Tuple[List, List]:
    """Cut ``df`` into (feature window, label window) pairs.

    Window ``k`` starts at row ``start_idx + k`` and uses the same layout as
    ``StockDataset.__getitem__``:

    * features: ``input_size`` consecutive rows, all columns;
    * then ``offset`` rows are skipped;
    * labels: the next ``label_size`` rows, restricted to ``label_name``.

    Only windows that lie completely inside ``[start_idx, end_idx)`` are
    produced, so neighbouring splits never share rows. Rows are selected by
    position, so the index of ``df`` does not have to be a ``RangeIndex``.

    Fixes over the previous implementation: a supplied ``end_idx`` is no
    longer overwritten (and ``None`` no longer crashes), labels are taken from
    the rows after the features instead of the feature rows themselves, and
    feature windows contain exactly ``input_size`` rows.

    Args:
        df: Source frame, one row per time step.
        label_name: Column name (labels are Series) or list of column names
            (labels are DataFrames).
        start_idx: Position of the first row that may be used.
        input_size: Number of rows per feature window.
        offset: Number of rows skipped between features and labels.
        end_idx: Exclusive upper row bound; ``None`` means ``len(df)``.
        label_size: Number of rows per label window; ``None`` means 1.

    Returns:
        ``(features, labels)``: two lists of equal length; empty when the
        range is too short for a single window.
    """
    if end_idx is None:
        end_idx = len(df)
    if label_size is None:
        label_size = 1

    # Select the label columns once instead of once per window.
    label_source = df[label_name]
    window_span = input_size + offset + label_size

    features = []
    labels = []
    for feature_start in range(start_idx, end_idx - window_span + 1):
        feature_end = feature_start + input_size
        label_start = feature_end + offset

        features.append(df.iloc[feature_start:feature_end])
        labels.append(label_source.iloc[label_start : label_start + label_size])

    return features, labels

In [None]:
# X_trains = []
# y_trains = []
# X_vals = []
# y_vals = []
# X_tests = []
# y_tests = []

# for i in range(len(dfs)):
#     train_end_idx = int(TRAIN_SIZE * len(dfs[i]))
#     val_end_idx = int(VAL_SIZE * len(dfs[i])) + train_end_idx

#     X_train, y_train = slicing_window(
#         dfs[i],
#         label_name=LABEL_NAMES,
#         start_idx=0,
#         input_size=INPUT_SIZE,
#         end_idx=train_end_idx,
#         label_size=TIMESTAMP_PER_DAY, # predict to next day
#     )

#     # X_val, y_val = slicing_window(
#     #     dfs[i],
#     #     label_name=LABEL_NAMES,
#     #     start_idx=train_end_idx,
#     #     input_size=INPUT_SIZE,
#     #     end_idx=val_end_idx,
#     #     label_size=TIMESTAMP_PER_DAY
#     # )

#     # X_test, y_test = slicing_window(
#     #     dfs[i],
#     #     label_name=LABEL_NAMES,
#     #     start_idx=val_end_idx,
#     #     input_size=INPUT_SIZE,
#     #     end_idx=len(dfs[i]),
#     #     label_size=TIMESTAMP_PER_DAY
#     # )

#     X_trains.append(X_train)
#     # X_vals.append(X_val)
#     # X_test.append(X_test)
#     y_trains.append(y_train)
#     # y_vals.append(y_val)
#     # y_tests.append(y_test)

## Dataset


In [None]:
class StockDataset(Dataset):
    """Map-style dataset of sliding (features, labels) windows over one frame.

    Sample ``idx`` starts at row ``start_idx + idx`` and consists of

    * features: ``input_size`` consecutive rows, all columns;
    * ``offset`` skipped rows;
    * labels: the following ``label_size`` rows of the ``label_name`` columns.

    Only windows lying completely inside ``[start_idx, end_idx)`` are exposed.
    The previous ``__len__`` ignored ``input_size``, so the last windows read
    past ``end_idx`` (into the next split, or past the end of the frame, which
    yields tensors of unequal length that cannot be batched).

    Args:
        df: Source frame, one row per time step; all values must be
            convertible to float32.
        label_name: Column name or list of column names used as labels.
        start_idx: Position of the first usable row.
        input_size: Rows per feature window.
        offset: Rows skipped between the feature and the label window.
        end_idx: Exclusive upper row bound; ``None`` (or a value beyond the
            frame) means ``len(df)``.
        label_size: Rows per label window; ``None`` means 1.
    """

    def __init__(
        self,
        df: DataFrame,
        label_name: Union[str, List[str]],
        start_idx: int = 0,
        input_size: int = 30,
        offset: int = 1,
        end_idx: Optional[int] = None,
        label_size: Optional[int] = None,
    ) -> None:
        super().__init__()

        n_rows = len(df)

        self.df = df
        self.start_idx = start_idx
        self.input_size = input_size
        self.offset = offset
        # Resolve the optional bounds once so __len__/__getitem__ never see None.
        self.end_idx = n_rows if end_idx is None else min(end_idx, n_rows)
        self.label_size = 1 if label_size is None else label_size
        self.label_name = label_name
        if isinstance(self.label_name, str):
            # Always select with a list so labels stay 2-D (steps, columns).
            self.label_name = [self.label_name]

    def __len__(self):
        """Number of windows that fit entirely inside [start_idx, end_idx)."""
        window_span = self.input_size + self.offset + self.label_size
        return max(0, self.end_idx - self.start_idx - window_span + 1)

    def __getitem__(self, idx):
        """Return ``(features, labels)`` float32 tensors for window ``idx``.

        ``features`` has shape ``(input_size, n_columns)`` and ``labels`` has
        shape ``(label_size, n_label_columns)``.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        n_windows = len(self)
        if idx < 0:
            idx += n_windows
        if not 0 <= idx < n_windows:
            raise IndexError(f"window index out of range for {n_windows} windows")

        start_x = self.start_idx + idx
        end_x = start_x + self.input_size
        start_y = end_x + self.offset
        end_y = start_y + self.label_size

        # Positional slicing for both windows: independent of the frame's
        # index labels. Rows are sliced before columns so only the window is
        # copied.
        features = torch.tensor(
            self.df.iloc[start_x:end_x].to_numpy(), dtype=torch.float32
        )
        labels = torch.tensor(
            self.df.iloc[start_y:end_y][self.label_name].to_numpy(),
            dtype=torch.float32,
        )

        return features, labels

In [None]:
# Rows per trading day as produced by `create_minute_range`:
# 135 morning minutes (09:15-11:29) + 107 afternoon minutes (13:00-14:46).
TIMESTAMP_PER_DAY = 242
INPUT_SIZE = TIMESTAMP_PER_DAY * 3  # input window of 3 trading days
OFFSET = 1
# Every column of the preprocessed frames is used as a prediction target.
LABEL_NAMES = list(dfs[0].columns)

# Fractions of each frame used for the train / validation / test splits.
TRAIN_SIZE, VAL_SIZE, TEST_SIZE = 0.7, 0.2, 0.1

BATCH_SIZE = 64

**Load dataset**

In [None]:
train_dataset_list, val_dataset_list, test_dataset_list = [], [], []

for i, frame in enumerate(dfs):
    n_rows = len(frame)

    # Consecutive row ranges: [0, train_end) -> train,
    # [train_end, val_end) -> validation, [val_end, n_rows) -> test.
    train_end_idx = int(TRAIN_SIZE * n_rows)
    val_end_idx = train_end_idx + int(VAL_SIZE * n_rows)

    # Arguments shared by all three splits; labels span one full day.
    window_kwargs = dict(
        label_name=LABEL_NAMES,
        input_size=INPUT_SIZE,
        label_size=TIMESTAMP_PER_DAY,  # predict to next day
    )

    train_dataset = StockDataset(
        frame, start_idx=0, end_idx=train_end_idx, **window_kwargs
    )
    val_dataset = StockDataset(
        frame, start_idx=train_end_idx, end_idx=val_end_idx, **window_kwargs
    )
    test_dataset = StockDataset(
        frame, start_idx=val_end_idx, end_idx=n_rows, **window_kwargs
    )

    train_dataset_list.append(train_dataset)
    val_dataset_list.append(val_dataset)
    test_dataset_list.append(test_dataset)

## Model

### Seq2Seq

#### Define model

In [None]:
class Encoder(nn.Module):
    """LSTM encoder that exposes only its final recurrent state.

    Args:
        input_dim: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_dim: int, hidden_size: int, num_layers: int) -> None:
        super().__init__()

        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )

    def forward(self, x):
        """Run the LSTM over ``x`` and return ``(hidden_state, cell_state)``.

        The per-step outputs of the LSTM are discarded.
        """
        _, final_state = self.lstm(x)
        hidden_state, cell_state = final_state

        return hidden_state, cell_state

In [None]:
class Decoder(nn.Module):
    """LSTM decoder followed by a linear projection back to ``output_dim``.

    Args:
        output_dim: Width of both the decoder input and its predictions.
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, output_dim: int, hidden_size: int, num_layers: int):
        super().__init__()

        self.lstm = nn.LSTM(
            input_size=output_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_dim)

    def forward(self, x, hidden, cell):
        """Advance the LSTM from ``(hidden, cell)`` and project its outputs.

        Returns:
            ``(preds, hidden, cell)``: the projected LSTM outputs and the
            updated recurrent state.
        """
        lstm_out, next_state = self.lstm(x, (hidden, cell))
        hidden, cell = next_state

        return self.fc(lstm_out), hidden, cell

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that predicts ``target`` one step at a time.

    The encoder turns ``source`` into an initial ``(hidden, cell)`` state; the
    decoder is then unrolled for as many steps as ``target`` has.

    Fixes over the previous implementation:

    * the teacher-forcing test was inverted (``rand() > ratio``), so a ratio
      of 0 always fed the ground truth and a ratio of 1 never did;
    * step 0 of ``outputs`` was never predicted (left at zero) although the
      loss is computed over every step;
    * the decoder was seeded with the oldest source step instead of the most
      recent one.

    Args:
        encoder: Module mapping ``source`` to ``(hidden, cell)``.
        decoder: Module mapping ``(input, hidden, cell)`` to
            ``(prediction, hidden, cell)``.
    """

    def __init__(self,
                 encoder: nn.Module,
                 decoder: nn.Module):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder

    def forward(self,
                source: Tensor,
                target: Tensor,
                teaching_force_ratio: float = 0.5):
        """Predict every step of ``target``.

        Args:
            source: Input windows, batch-first and 3-D. Its last time step
                seeds the decoder, so its feature width must be what the
                decoder accepts as input.
            target: Ground-truth windows of shape ``(batch, steps, dim)``;
                used for the output shape and for teacher forcing.
            teaching_force_ratio: Probability, per step, of feeding the ground
                truth instead of the model's own prediction as the next
                decoder input. Pass 0 for purely autoregressive decoding.

        Returns:
            Tensor with the same shape as ``target`` holding the predictions.
        """
        batch_size, tg_len, tg_dim = target.shape

        # `outputs` is not a parameter or buffer, so place it on the input's
        # device explicitly.
        outputs = torch.zeros(batch_size, tg_len, tg_dim, device=source.device)

        hidden, cell = self.encoder(source)

        # Seed the decoder with the most recent observed step (kept 3-D).
        decoder_input = source[:, -1:, :]

        for t in range(tg_len):
            output, hidden, cell = self.decoder(decoder_input, hidden, cell)
            outputs[:, t, :] = output.squeeze(1)

            # Teacher forcing with probability `teaching_force_ratio`.
            use_teacher = np.random.rand() < teaching_force_ratio
            decoder_input = target[:, t, :] if use_teacher else output
            if decoder_input.dim() < 3:
                decoder_input = decoder_input.unsqueeze(1)

        return outputs

In [None]:
# Every column is both an input feature and a prediction target, so the
# input and output widths coincide.
INPUT_DIM = OUTPUT_DIM = len(LABEL_NAMES)
HIDDEN_SIZE = 50
NUM_LAYERS = 2

encoder = Encoder(input_dim=INPUT_DIM, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS)
decoder = Decoder(output_dim=OUTPUT_DIM, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS)
model = Seq2Seq(encoder=encoder, decoder=decoder)

#### Training

In [None]:
# Prefer CUDA, then the MPS backend, and fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

In [None]:
# Mean-squared-error objective, optimised with Adam over all model parameters.
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.0001)

In [None]:
# Train the single shared `model` on each stock's dataset in turn, running
# `epochs` passes per stock. The model and the optimizer are created once
# outside this loop, so their state carries over from one stock to the next.
epochs = 10
model.to(device)
for i, (train_set, val_set) in enumerate(zip(train_dataset_list, val_dataset_list)):
    print(f'Stock {i + 1}')
    # shuffle=False: batches are drawn in dataset (window) order.
    train_dataloader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=False)
    # NOTE(review): the validation loader is built here, but the evaluation
    # code that would consume it is commented out in the visible part of this cell.
    val_dataloader = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=False)

    # train_running_loss = {'epoch': [],
    #                       'loss': []}
    # train_rmse = {'epoch': [],
    #               'rmse': []}

    for e in range(epochs):
        print(f'Epoch {e + 1}')
        # Sum of the batch losses seen so far in this epoch.
        running_loss = 0.0

        model.train()
        train_loop = tqdm(train_dataloader, desc=f'{"Train":^11}', leave=True)        
        for b, data in enumerate(train_loop):
            # Move both the feature and the label batch to the training device.
            X, y = (_.to(device) for _ in data)

            # The labels are passed to the model as well: its forward pass
            # uses them for teacher forcing (default teaching_force_ratio=0.5).
            y_pred = model(X, y)

            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()
            # Clear the gradients so they do not accumulate into the next batch.
            optimizer.zero_grad()

            running_loss += loss.item()
            # Show the mean batch loss of the current epoch in the progress bar.
            logging_dict = {
                'Loss': running_loss / (b + 1)
            }
            train_loop.set_postfix(logging_dict)
        # model.eval()
        # eval_loop = tqdm(val_dataloader, desc=f'{"Eval":^7}', leave=True)
        