# **Exploratory Data Analysis**

Import Packages

In [1]:
import pandas as pd
pd.set_option('display.width', 1000)

import numpy as np
import yfinance as yf
from datetime import datetime, timedelta
import sklearn
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

# Visualization
import matplotlib.pyplot as plt

from torchinfo import summary

import xgboost as xgb
from xgboost import XGBRegressor

import random

import torch
import torch.nn as nn
import torch.optim as optim
import torch.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Run on the GPU when PyTorch reports CUDA as available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


#--- Starting Here ---#

In [2]:
def calculate_rsi(data, window=14):
    """Compute a Relative Strength Index from a price series.

    The average gain and average loss are plain rolling means (not Wilder's
    exponential smoothing) over ``window`` consecutive one-step changes.

    Args:
        data: pandas object of prices supporting ``diff``/``where``/``rolling``.
        window: number of observations in each rolling average.

    Returns:
        A pandas object aligned with ``data`` holding ``100 - 100 / (1 + RS)``,
        where ``RS = average gain / average loss``. Entries without a full
        window are NaN.
    """
    change = data.diff()

    # Split the changes into their upward and downward parts. Anything that
    # fails the comparison (including the leading NaN produced by diff) is
    # replaced by 0 before averaging.
    upward = change.where(change > 0, 0)
    downward = -change.where(change < 0, 0)

    avg_gain = upward.rolling(window=window).mean()
    avg_loss = downward.rolling(window=window).mean()

    relative_strength = avg_gain / avg_loss
    return 100 - (100 / (1 + relative_strength))

In [7]:
NVDA = 'NVDA'
SEMICONDUCTORS = ["TSM", "AVGO", "AMD", "ASML", "MRVL", "ON"]

# One date range for every ticker, instead of repeating the literals per download.
START_DATE = '2010-01-01'
END_DATE = '2024-01-01'
# Columns kept for every ticker; pinning them keeps all per-ticker frames aligned on concat.
PRICE_COLUMNS = ['Date', 'Close', 'High', 'Low', 'Open', 'Volume', 'Ticker', 'RSI']

#--- Preface: Min-max scaling is not appropriate here, because nothing justifies assuming any feature has already reached its maximum ---#
#--- Preface continued: RSI values range from 0 to 100, so the regression target is positive and lies in a fixed range ---#


def _download_with_rsi(ticker, start=START_DATE, end=END_DATE, window=14):
    """Download one ticker's price history and attach its RSI and ticker label.

    Every ticker (NVDA included) goes through this single code path so the RSI
    is always computed from a flat ``Close`` Series and every frame has exactly
    the same columns.

    Args:
        ticker: symbol passed to ``yf.download``.
        start: first date requested.
        end: end date passed to ``yf.download``.
        window: rolling window forwarded to ``calculate_rsi``.

    Returns:
        DataFrame with the columns listed in ``PRICE_COLUMNS``.
    """
    prices = yf.download(ticker, start=start, end=end)

    # yfinance may return (field, ticker) MultiIndex columns; keep only the field name.
    if isinstance(prices.columns, pd.MultiIndex):
        prices.columns = prices.columns.get_level_values(0)

    prices = prices.assign(
        RSI=calculate_rsi(prices['Close'], window=window),
        Ticker=ticker,
    )
    # Move the date index into a regular 'Date' column before selecting columns.
    return prices.reset_index()[PRICE_COLUMNS]


# NVDA goes first so it receives code 0 when the tickers are factorized below.
ticker_frames = [_download_with_rsi(ticker) for ticker in [NVDA] + SEMICONDUCTORS]

# Concatenate once (not inside the loop), drop the date, then remove the rows
# at the start of each ticker where the RSI window is incomplete.
semiConductor_data = (
    pd.concat(ticker_frames, ignore_index=True)
    .drop(columns=['Date'])
    .dropna()
)
semiConductor_data.to_csv("semiconductorData.csv", index=False)

print([ i for i in semiConductor_data['Ticker'].unique()])

stock_symbols = SEMICONDUCTORS + [NVDA]
for stock in stock_symbols:
    print(f"{stock} rows: {len(semiConductor_data.loc[semiConductor_data['Ticker'] == stock])}")

# Replace ticker strings with integer codes; `conversion` maps code -> symbol.
semiConductor_data['Ticker'], conversion = pd.factorize(semiConductor_data['Ticker'])

print(semiConductor_data.columns)
print(semiConductor_data.dtypes)
print(f"Conversion: {conversion}")
semiConductor_data.head(5)


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


['NVDA', 'TSM', 'AVGO', 'AMD', 'ASML', 'MRVL', 'ON']
TSM rows: 3509
AVGO rows: 3509
AMD rows: 3509
ASML rows: 3509
MRVL rows: 3509
ON rows: 3509
NVDA rows: 3509
Index(['Close', 'High', 'Low', 'Open', 'Volume', 'Ticker', 'RSI'], dtype='object')
Close     float64
High      float64
Low       float64
Open      float64
Volume      int64
Ticker      int64
RSI       float64
dtype: object
Conversion: Index(['NVDA', 'TSM', 'AVGO', 'AMD', 'ASML', 'MRVL', 'ON'], dtype='object')


Unnamed: 0,Close,High,Low,Open,Volume,Ticker,RSI
13,0.377381,0.39343,0.375088,0.386322,1067916000,0,24.688339
14,0.3838,0.391137,0.38082,0.383571,643676000,0,29.603827
15,0.371649,0.384488,0.37142,0.381966,714636000,0,21.978114
16,0.381737,0.383112,0.367293,0.37142,812504000,0,27.104812
17,0.368898,0.386093,0.363624,0.384717,687764000,0,26.087047


# **Processing**

- Create a separate dataset for each company
- History (input) sequence length: the previous 60 trading days
- Output (forecast) sequence length: the next 5 trading days

Creating LSTM Dataset

In [25]:
class LSTMdataset(Dataset):
    """Sliding-window dataset that pairs a feature history with future RSI values.

    Item ``i`` is ``(history, forecast)`` where ``history`` holds rows
    ``i .. i + sequence_length - 1`` (all columns) and ``forecast`` holds the
    ``'RSI'`` column of the following ``output_length`` rows.

    Args:
        data: pandas DataFrame of numeric features containing an ``'RSI'`` column.
        sequence_length: number of consecutive rows in each history window.
        output_length: number of consecutive rows to forecast after the history.

    Raises:
        ValueError: if ``data`` has no ``'RSI'`` column (raised by ``list.index``).
    """

    def __init__(self, data, sequence_length, output_length):
        self.data = data

        self.targetIDX = data.columns.tolist().index('RSI')
        self.sequence_length = sequence_length
        self.output_length = output_length
        self.sample_length = sequence_length + output_length

        # Convert once to a float32 tensor: positional 2-D slicing works on it
        # (it does not on a DataFrame) and the default DataLoader collate
        # function can stack the resulting items into batches.
        self.values = torch.tensor(data.to_numpy(dtype='float32'))

    def __len__(self):
        """Number of complete (history + forecast) windows that fit in the data."""
        return max(0, len(self.values) - self.sample_length + 1)

    def __getitem__(self, dayIDX):
        """Return ``(history, forecast)`` tensors for the window starting at ``dayIDX``.

        ``history`` has shape ``(sequence_length, n_columns)`` and ``forecast``
        has shape ``(output_length,)``.

        Raises:
            IndexError: if ``dayIDX`` does not address a complete window.
        """
        window_count = len(self)
        if dayIDX < 0:
            dayIDX += window_count
        if not 0 <= dayIDX < window_count:
            raise IndexError(f"window index {dayIDX} out of range for {window_count} windows")

        history_end = dayIDX + self.sequence_length
        history = self.values[dayIDX:history_end]
        forecast = self.values[history_end:dayIDX + self.sample_length, self.targetIDX]

        return history, forecast

In [26]:
#-- INFO --#
shuffle = True
batch_size = 32

stocks = semiConductor_data['Ticker'].unique()
stock_loaders = {}

for stock in stocks:
    # Keep only this ticker's rows; the Ticker column itself is not a model feature.
    belongs_to_stock = semiConductor_data['Ticker'] == stock
    stock_data = semiConductor_data.loc[belongs_to_stock].drop(columns='Ticker')

    lstmdataset = LSTMdataset(data=stock_data, sequence_length=60, output_length=5)
    stock_loader = DataLoader(dataset=lstmdataset, batch_size=batch_size, shuffle=shuffle)
    stock_loaders[stock] = stock_loader

print(f"Dataloaders: {len(stock_loaders)}")

Dataloaders: 7


Create Model

In [29]:
class LSTMRegression(nn.Module):
    """Two-layer LSTM followed by a dense head for multi-step regression.

    Args:
        input_size: number of features per time step.
        output_size: number of values predicted per sequence.

    Input to ``forward`` is a float tensor shaped ``(batch, seq_len, input_size)``;
    the output is shaped ``(batch, output_size)``.
    """

    def __init__(self, input_size, output_size):
        super(LSTMRegression, self).__init__()

        # ----- LAYERS ----- #
        # ModuleList (not Sequential): the layers are called one at a time in
        # forward() because nn.LSTM returns an (output, state) tuple.
        # batch_first=True makes the LSTMs read (batch, seq, feature) input.
        self.lstm = nn.ModuleList([
            nn.LSTM(input_size, 256, batch_first=True),
            nn.LSTM(256, 128, batch_first=True)
        ])

        self.dense = nn.Sequential(
            nn.Linear(128, 64),
            nn.Linear(64, output_size)  # honour output_size instead of a hard-coded 5
        )

        self.batch_norm = nn.ModuleList([
            # -- LSTM LAYER -- #
            nn.BatchNorm1d(input_size),

            # -- DENSE LAYERS -- # (registered but not applied in forward)
            nn.BatchNorm1d(128),
            nn.BatchNorm1d(64)
        ])

        # ----- ACTIVATION FUNCTIONS ----- #
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.tanh = nn.Tanh()

        # ----- FUNCTIONS ----- #
        self.dropout2 = nn.Dropout(0.2)
        self.dropout1 = nn.Dropout(0.1)

    def forward(self, x):
        """Map ``(batch, seq_len, input_size)`` to ``(batch, output_size)``."""
        # BatchNorm1d normalises dim 1 as the channel axis, so move the feature
        # axis there for the normalisation and back again for the LSTMs.
        out = self.batch_norm[0](x.transpose(1, 2)).transpose(1, 2)

        out, _ = self.lstm[0](out)
        out, _ = self.lstm[1](out)

        # Summarise each sequence by the hidden output of its final time step,
        # so the head produces one forecast vector per sequence.
        out = out[:, -1, :]

        out = self.dense[0](out)
        out = self.relu(out)

        out = self.dense[1](out)
        # Final ReLU keeps predictions non-negative.
        out = self.relu(out)

        return out

In [31]:
# Network sized for six input features per time step and a five-value forecast,
# placed on the device selected earlier in the notebook.
model = LSTMRegression(input_size=6, output_size=5)
model = model.to(device)

# Mean-squared-error objective, optimised with Adam at a 1e-3 learning rate.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [None]:
epochs = 30
test_loaders = []

for sym, loader in stock_loaders.items():
    test_loader = []
    train, test = torch.utils.data.random_split(loader,lengths=[0.8, 0.2])
    test_loader.append(test)
    test_loaders.append(test_loader)

    for 
