# **Exploratory Data Analysis**

Import Packages

In [1]:
import pandas as pd
import numpy as np
import yfinance as yf
from datetime import datetime, timedelta
import sklearn
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, f1_score

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
import nasdaqdatalink

# Machine Learning Models
from sklearn.ensemble import (
    RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor,
    VotingRegressor, StackingRegressor, BaggingRegressor
)
import xgboost as xgb
from xgboost import XGBRegressor

import torch
import torch.nn as nn
import torch.optim as optim
import torch.functional as F
from torch.utils.data import Dataset, DataLoader

#--- Starting Here ---#

In [2]:
def calculate_rsi(data, window=14):
    delta = data.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

In [7]:
NVDA = 'NVDA'
SEMICONDUCTORS = ["TSM", "AVGO", "AMD", "ASML", "MRVL", "ON"]
scaler = MinMaxScaler(feature_range=(0, 1)) # Scaling RSI values for more relatable trends

semiConductor_data = yf.download(NVDA, start='2010-01-01', end='2024-01-01')
semiConductor_data['Ticker'] = NVDA
semiConductor_data['RSI'] = calculate_rsi(semiConductor_data['Close'], window=14)
semiConductor_data['RSI_Scaled'] = scaler.fit_transform(semiConductor_data['RSI'].values.reshape(-1, 1))
semiConductor_data.reset_index(drop=False, inplace=True)
semiConductor_data = semiConductor_data.dropna()

for stock in SEMICONDUCTORS:
    stock_data = yf.download(stock, start="2020-01-01", end="2024-01-01")
    stock_data['Ticker'] = stock
    stock_data['RSI'] = calculate_rsi(stock_data['Close'], window=14)
    stock_data['RSI_Scaled'] = scaler.fit_transform(stock_data['RSI'].values.reshape(-1, 1))
    stock_data.reset_index(drop=False, inplace=True)
    stock_data = stock_data.dropna()
    semiConductor_data = pd.concat([semiConductor_data, stock_data], ignore_index=True)

semiConductor_data.to_csv("semiconductorData.csv", index=False)
print(semiConductor_data.head(5))

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Price        Date     Close      High       Low      Open        Volume  \
Ticker                 NVDA      NVDA      NVDA      NVDA          NVDA   
0      2010-01-22  0.377381  0.393429  0.375088  0.386322  1.067916e+09   
1      2010-01-25  0.383800  0.391137  0.380820  0.383571  6.436760e+08   
2      2010-01-26  0.371649  0.384488  0.371419  0.381966  7.146360e+08   
3      2010-01-27  0.381737  0.383112  0.367293  0.371420  8.125040e+08   
4      2010-01-28  0.368897  0.386093  0.363624  0.384717  6.877640e+08   

Price  Ticker        RSI RSI_Scaled Close  ...      High  Low Open Volume  \
Ticker                                TSM  ... MRVL MRVL MRVL MRVL   MRVL   
0        NVDA  24.688356   0.225087   NaN  ...  NaN  NaN  NaN  NaN    NaN   
1        NVDA  29.603792   0.276050   NaN  ...  NaN  NaN  NaN  NaN    NaN   
2        NVDA  21.978053   0.196987   NaN  ...  NaN  NaN  NaN  NaN    NaN   
3        NVDA  27.104761   0.250140   NaN  ...  NaN  NaN  NaN  NaN    NaN   
4        NVD

In [None]:
def calculate_rsi(data, window=14):
    delta = data.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

nvidia['RSI'] = calculate_rsi(nvidia['Close'], window=14)
nvidia = nvidia.dropna()

nvidia = nvidia.reset_index()
nvidia['Date'] = pd.to_datetime(nvidia['Date'])

print(nvidia.dtypes)
nvidia.head(10)

In [None]:
scaler = MinMaxScaler(feature_range=(0, 1))
nvidia['RSIScaled'] = scaler.fit_transform(nvidia['RSI'].values.reshape(-1, 1))

features = [col[0] for col in nvidia.columns.values if col[0] != "RSI" and col[0] != "RSIScaled"]
target = "RSIScaled"
leftover_feature = "RSI"

input_size = 30
output_size = 10

print(nvidia.head(5))
print(f"Target Features: {features}")
print(f"Target: {target}")
print(f"LeftOver Feature: {leftover_feature}")

In [None]:
class lstmDataset(Dataset):
    def __init__(self, data, input_window=30, output_window=7, batch_size = 32, shuffle=True):
        self.data = torch.FloatTensor(data)
        self.input_window = input_window
        self.output_window = output_window
    
    def __len__(self):
        