# Stock Prediction

## Import libraries

In [1]:
import os
import pandas as pd
from pandas import DataFrame
import numpy as np

from typing import Tuple, List, Optional

## Prepare data

### Load data

In [4]:
# Per-file frames and their row counts, kept in the order the files are listed.
dfs = []
df_len = []

# Load every file found in ./data and parse its timestamp column.
for name in os.listdir('./data/'):
    csv_path = os.path.join('./data', name)
    df = pd.read_csv(csv_path)
    df['Date/Time'] = pd.to_datetime(df['Date/Time'], format='%m/%d/%Y %H:%M')

    dfs.append(df)
    df_len.append(len(df))

Unnamed: 0,Date/Time,Open,High,Low,Close,Volume,Open Interest,Ticker_FPT,Ticker_MSN,Ticker_PNJ,Ticker_VIC
0,2018-10-17 09:39:00,98.5,98.5,98.5,98.5,390,0,False,False,False,True
1,2018-10-17 09:40:00,98.5,98.5,98.5,98.5,760,0,False,False,False,True
2,2018-10-17 09:41:00,98.6,98.6,98.6,98.6,2060,0,False,False,False,True
3,2018-10-17 09:42:00,98.6,98.6,98.6,98.6,5540,0,False,False,False,True
4,2018-10-17 09:43:00,98.6,98.7,98.6,98.7,760,0,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...
97401,2020-12-22 14:24:00,58.2,58.2,58.2,58.2,27470,0,True,False,False,False
97402,2020-12-22 14:25:00,58.2,58.2,58.2,58.2,14820,0,True,False,False,False
97403,2020-12-22 14:26:00,58.2,58.2,58.2,58.2,500,0,True,False,False,False
97404,2020-12-22 14:29:00,58.1,58.1,58.1,58.1,2500,0,True,False,False,False


## Preprocess data

**1. Missing timestamp**

Some timestamps are missing from the input data. The dataset should contain one row of stock data for every minute from 9:15 to 11:29 and from 13:00 to 14:46, Monday to Friday, but minutes are often missing, such as in the following example:
```csv
Ticker,Date/Time,Open,High,Low,Close,Volume,Open Interest
FPT,12/25/2018 9:15,30.89,30.89,30.89,30.89,35410,0
FPT,12/25/2018 9:16,30.81,30.81,30.81,30.81,190,0
FPT,12/25/2018 9:17,30.74,30.81,30.74,30.74,1120,0
FPT,12/25/2018 9:18,30.74,30.74,30.74,30.74,2120,0
FPT,12/25/2018 9:19,30.74,30.74,30.74,30.74,22500,0
FPT,12/25/2018 9:20,30.74,30.74,30.7,30.74,7140,0
FPT,12/25/2018 9:21,30.66,30.74,30.59,30.66,16480,0
```
Therefore, whatever the reason for the data loss, we assume that the stock market is always open during the time periods mentioned above. We fill the missing timestamps of each existing day (a day that already has some trading data) using the forward-fill method.

In [None]:
def create_minute_range(date):
    """Build the one-minute timestamp grid for a single trading day.

    The grid covers two sessions, each inclusive of its last minute:
    09:15-11:29 and 13:00-14:46.

    Args:
        date: The calendar day. It is interpolated into a timestamp string,
            so any value whose string form is a date pandas can parse
            (e.g. ``datetime.date`` or ``"2018-12-25"``) is accepted.

    Returns:
        A ``DatetimeIndex`` containing every minute of both sessions.
    """
    sessions = [
        pd.date_range(start=f"{date} 09:15", end=f"{date} 11:29", freq='1min'),
        pd.date_range(start=f"{date} 13:00", end=f"{date} 14:46", freq='1min'),
    ]

    # Fold the sessions together with a set union, morning first.
    grid, *later_sessions = sessions
    for session in later_sessions:
        grid = grid.union(session)
    return grid

In [None]:
def filling_missing_timestamp(df: DataFrame) -> DataFrame:
    """Reindex a frame onto a complete per-minute trading grid and forward-fill gaps.

    For every calendar day that already has at least one row, the full minute
    grid from ``create_minute_range`` is generated. The frame is reindexed
    onto the concatenation of those grids and missing rows are forward-filled.

    Consequences of that approach:
      * rows whose timestamp is not on the grid are dropped by the reindex;
      * the forward fill runs over the whole frame, so a gap at the start of
        a day is filled from the last row of the previous day;
      * grid rows before the very first observation have nothing to copy
        from and stay NaN.

    The input frame is not modified.

    Args:
        df: Frame with a datetime-typed ``'Date/Time'`` column.

    Returns:
        A new frame with one row per grid minute and ``'Date/Time'`` restored
        as the first column.
    """
    # Build an indexed, sorted copy rather than mutating the caller's frame
    # (in-place set_index would make a second call on the same frame fail).
    indexed = df.set_index('Date/Time').sort_index()

    # Only days that already have data get a grid.
    unique_dates = pd.Series(indexed.index.date).unique()

    all_minutes = []
    for date in unique_dates:
        all_minutes.extend(create_minute_range(date))

    # Naming the target index lets reset_index() recreate the column directly.
    new_index = pd.DatetimeIndex(all_minutes, name='Date/Time')
    df_filled = indexed.reindex(new_index).ffill()

    return df_filled.reset_index()

In [None]:
# Fill the missing timestamps of every loaded frame.
filled_dfs = [filling_missing_timestamp(frame) for frame in dfs]

# The raw frames are no longer needed; drop the reference.
del dfs

**2. One-hot encoding**

Because our goal is to develop a general model that predicts stock price movement for all stocks, we have to transform the `Ticker` column into numerical form so that it can be used to train the model later.

In [5]:
# Closed set of tickers; it fixes the one-hot column layout for every frame.
ticker_list = ['FPT', 'MSN', 'PNJ', 'VIC']
# Not used by onehot_ticker; kept so that other cells referencing it still run.
ticker_df = pd.DataFrame(ticker_list)


def onehot_ticker(df: DataFrame) -> DataFrame:
    """One-hot encode the ``Ticker`` column with a fixed set of output columns.

    The column is converted to a categorical whose categories are
    ``ticker_list``, so the result always has one ``Ticker_<name>`` column per
    listed ticker, even when ``df`` contains only one of them. This replaces
    the earlier approach of appending padding rows and slicing them off with
    ``.loc[:-4]``: that slice is label-based and returned an empty frame, and
    the padding rows turned integer columns into floats.

    Only ``Ticker`` is encoded; all other columns are passed through
    unchanged. Values not present in ``ticker_list`` (including NaN) produce a
    row in which every ``Ticker_*`` column is false. The input frame is not
    modified.

    Args:
        df: Frame containing a ``Ticker`` column.

    Returns:
        A new frame with ``Ticker`` replaced by the ``Ticker_*`` indicator
        columns, appended after the remaining columns.

    Raises:
        KeyError: If ``df`` has no ``Ticker`` column.
    """
    encoded = df.assign(
        Ticker=pd.Categorical(df['Ticker'], categories=ticker_list)
    )
    return pd.get_dummies(encoded, columns=['Ticker'])

## Prepare data

In [None]:
def slicing_window(df: DataFrame,
                   label_name: str,
                   start_idx: int = 0,
                   input_size: int = 30,
                   offset: int = 1,
                   end_idx: Optional[int] = None,
                   label_size: Optional[int] = None) -> Tuple[List, List]:
    """Cut a frame into (feature window, label window) pairs.

    All indices are row positions, not index labels, so the function works
    regardless of the frame's index. For each anchor position ``idx`` in
    ``range(start_idx + input_size + offset, end_idx)``:

      * the feature window is rows ``[idx - input_size - offset, idx - offset)``
        with all columns (exactly ``input_size`` rows);
      * the label window is ``label_size`` values of ``label_name`` starting
        at row ``idx - 1``, i.e. ``offset`` rows after the last feature row
        (``offset=1`` means the row right after the feature window).

    Args:
        df: Source frame, ordered in time.
        label_name: Column to take the labels from.
        start_idx: Row position of the first feature window's first row.
        input_size: Number of rows in each feature window.
        offset: Distance in rows from the last feature row to the first
            label row.
        end_idx: Exclusive upper bound for the anchor position. Defaults to
            ``len(df) - label_size - offset``. A caller-supplied value is
            used as given; positions past the end of the frame yield
            shortened windows.
        label_size: Number of rows in each label window. ``None`` means 1.

    Returns:
        ``(features, labels)``: a list of DataFrames and a list of Series of
        equal length. Both are empty when the frame is too short for a
        single window.
    """
    if label_size is None:
        label_size = 1
    # Only derive the bound when the caller did not supply one.
    if end_idx is None:
        end_idx = len(df) - label_size - offset

    features = []
    labels = []

    for idx in range(start_idx + input_size + offset, end_idx):
        feature_start_idx = idx - input_size - offset
        feature_end_idx = idx - offset
        label_start_idx = idx - 1
        label_end_idx = label_start_idx + label_size

        # iloc: positional and end-exclusive, so windows have exact sizes.
        features.append(df.iloc[feature_start_idx:feature_end_idx])
        labels.append(df[label_name].iloc[label_start_idx:label_end_idx])

    return features, labels