Data preprocessing

## Stock data Preparation
1. Manually select the stock price data whose price movement is more meaningful.
2. The testing data is from 2024-01-01 to 2024-12-31.

In [1]:
# Import necessary libraries
import pandas as pd
import os 
import numpy as np
import yfinance as yf
import plotly.express as px
import plotly.graph_objects as go
import json
from collections import defaultdict

# Define constants
DATA_DIR = os.path.join(os.getcwd(), 'data')

# Load the notebook configuration from config.json in the current working directory.
config_file_path = os.path.join(os.getcwd(), 'config.json')
# Pass the encoding explicitly so decoding the JSON file does not depend on
# the platform's default locale encoding.
with open(config_file_path, encoding='utf-8') as f:
    config = json.load(f)

In [2]:
# Load each stock's training price data and store it in a dictionary keyed by stock name.
# `defaultdict` is already imported in the import cell at the top of the notebook.
# NOTE: with the `list` default factory, looking up an unknown stock yields an
# empty list instead of raising KeyError.
training_stock_df = defaultdict(list)
stock_dict = config['stock_dict']

# Only the keys of stock_dict are needed here; its values are not used by this loop.
for stock in stock_dict:
    stock_price_dir = f'{DATA_DIR}/stock_price_data/training/{stock}_stock_price_data(training).xlsx'
    stock_data = pd.read_excel(stock_price_dir)
    training_stock_df[stock] = stock_data

In [3]:
# Plot all the stocks in one figure
def plot_all_stocks_in_one_figure(stock_price_df, stock_price_dict,
                                  title='Stock Prices Over Time (Training Data)'):
    """Draw one line per stock on a single plotly figure and show it.

    Parameters
    ----------
    stock_price_df :
        Table indexed by column name; must provide a 'date' column (x axis)
        and one column per name in ``stock_price_dict`` (y axis).
    stock_price_dict :
        Iterable of stock names; only the names themselves are used.
    title : str, optional
        Figure title. Defaults to the training-data title so existing callers
        are unaffected; pass a different title when plotting other data sets.

    Returns
    -------
    None
        The figure is displayed via ``fig.show()``.
    """
    fig = go.Figure()
    # Define a list of colors
    colors = ['#FF5733', '#33FF57', '#3357FF', '#F1C40F', '#8E44AD', '#E74C3C', '#1ABC9C', '#2ECC71', '#3498DB']
    for i, stock in enumerate(stock_price_dict):
        # Cycle through the palette when there are more stocks than colors.
        line_color = colors[i % len(colors)]
        fig.add_trace(go.Scatter(x=stock_price_df['date'], y=stock_price_df[stock], mode='lines', name=stock, line=dict(color=line_color)))

    fig.update_layout(
        title=title,
        xaxis_title='Date',
        yaxis_title='Closing Price'
    )
    fig.show()

def stock_data_append(stock_price_df, stock_data):
    """Combine ``stock_data`` into the accumulated ``stock_price_df``.

    If the accumulated frame is empty, ``stock_data`` itself is returned.
    Otherwise the two frames are outer-merged on their 'date' column and the
    merged frame is returned.
    """
    # Nothing accumulated yet: the incoming frame becomes the starting point.
    if stock_price_df.empty:
        return stock_data
    # Outer merge keeps dates that appear in either frame.
    return stock_price_df.merge(stock_data, on='date', how='outer')

# Fold every per-stock frame into one combined frame; it is only used for visualization.
temp_training_stock_df = pd.DataFrame()
for stock_data in training_stock_df.values():
    temp_training_stock_df = stock_data_append(temp_training_stock_df, stock_data)

# One line per stock, all drawn on the same figure.
plot_all_stocks_in_one_figure(temp_training_stock_df, training_stock_df)

In [4]:
# Make sure the data in the dataframe does not contain any weird values, such as NaN, inf, etc.
def check_weird_data(data):
    """Return True if ``data`` holds any null or +/-infinite value, else False."""
    # Missing values are checked first; the infinity scan only runs if none are found.
    if data.isnull().values.any():
        return True
    return bool(data.isin([np.inf, -np.inf]).values.any())

# Report the validation result for every training frame.
for stock, stock_data in training_stock_df.items():
    if check_weird_data(stock_data):
        # Surface problem data explicitly instead of skipping it silently.
        print(f"WARNING: The {stock} stock price data contains NaN or inf values")
    else:
        print(f"The {stock} stock price data is ready for the next step")

The AMZN stock price data is ready for the next step
The GOOGL stock price data is ready for the next step
The MSFT stock price data is ready for the next step
The NVDA stock price data is ready for the next step
The AAPL stock price data is ready for the next step
The NFLX stock price data is ready for the next step
The AVGO stock price data is ready for the next step
The TSLA stock price data is ready for the next step
The META stock price data is ready for the next step


In [5]:
# Read each stock's testing price file into a mapping keyed by stock name.
testing_stock_df = defaultdict(list)

for stock in stock_dict:
    testing_stock_df[stock] = pd.read_excel(
        f'{DATA_DIR}/stock_price_data/testing/{stock}_stock_price_data(testing).xlsx'
    )


# Fold every per-stock testing frame into one combined frame; it is only used for visualization.
temp_testing_stock_df = pd.DataFrame()
for stock_data in testing_stock_df.values():
    temp_testing_stock_df = stock_data_append(temp_testing_stock_df, stock_data)

# NOTE(review): this call does not set a title, so the chart is shown under the
# plot helper's 'Stock Prices Over Time (Training Data)' title even though it
# displays testing data.
plot_all_stocks_in_one_figure(temp_testing_stock_df, testing_stock_df)



In [6]:
# Check if the testing data is ready for the next step
for stock, stock_data in testing_stock_df.items():
    if check_weird_data(stock_data):
        # Surface problem data explicitly instead of skipping it silently.
        print(f"WARNING: The {stock} stock price testing data contains NaN or inf values")
    else:
        print(f"The {stock} stock price testing data is ready for the next step")

The AMZN stock price testing data is ready for the next step
The GOOGL stock price testing data is ready for the next step
The MSFT stock price testing data is ready for the next step
The NVDA stock price testing data is ready for the next step
The AAPL stock price testing data is ready for the next step
The NFLX stock price testing data is ready for the next step
The AVGO stock price testing data is ready for the next step
The TSLA stock price testing data is ready for the next step
The META stock price testing data is ready for the next step
