<a href="https://www.kaggle.com/code/simronw/nvidia-stock-market-predictions?scriptVersionId=211872119" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import xgboost as xgb
import statsmodels.api as sm
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nvidia-stock-data-2024/NVDA_1999-01-01_2024-12-04.csv


# **Data Preprocessing**

In [2]:
# Importing dataset into pandas dataframe
stock_df = pd.read_csv("/kaggle/input/nvidia-stock-data-2024/NVDA_1999-01-01_2024-12-04.csv", parse_dates=['Date'])

In [3]:
# Displaying first 5 rows from the dataset
stock_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1999-01-22,0.04375,0.048828,0.038802,0.041016,0.017325,2714688000
1,1999-01-25,0.044271,0.045833,0.041016,0.045313,0.01914,510480000
2,1999-01-26,0.045833,0.046745,0.041146,0.041797,0.017655,343200000
3,1999-01-27,0.041927,0.042969,0.039583,0.041667,0.0176,244368000
4,1999-01-28,0.041667,0.041927,0.041276,0.041536,0.017544,227520000


In [4]:
# Displaying the datatype of columns
stock_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6509 entries, 0 to 6508
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       6509 non-null   datetime64[ns]
 1   Open       6509 non-null   float64       
 2   High       6509 non-null   float64       
 3   Low        6509 non-null   float64       
 4   Close      6509 non-null   float64       
 5   Adj Close  6509 non-null   float64       
 6   Volume     6509 non-null   int64         
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 356.1 KB


In [5]:
# Displaying summary statistics of the attributes
stock_df.describe()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
count,6509,6509.0,6509.0,6509.0,6509.0,6509.0,6509.0
mean,2011-12-28 18:02:55.879551488,7.846563,7.989922,7.689917,7.846531,7.679767,601773900.0
min,1999-01-22 00:00:00,0.034896,0.035547,0.033333,0.034115,0.01441,19680000.0
25%,2005-07-13 00:00:00,0.279427,0.28725,0.27225,0.27925,0.117953,341316000.0
50%,2011-12-27 00:00:00,0.46275,0.4695,0.45625,0.463,0.212784,503340000.0
75%,2018-06-15 00:00:00,4.45075,4.5145,4.37825,4.4665,4.21925,732676000.0
max,2024-12-03 00:00:00,149.350006,152.889999,146.259995,148.880005,148.880005,9230856000.0
std,,21.342285,21.722479,20.897965,21.325616,21.373121,431118500.0


In [6]:
# Dispaying number of rows and columns
stock_df.shape

(6509, 7)

In [7]:
# Checking null values in the dataset
stock_df.isnull().sum()

Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

## ***Feature Engineering***

In [8]:
# Extract date based features
stock_df['Day'] = stock_df['Date'].dt.day
stock_df['Month'] = stock_df['Date'].dt.month
stock_df['Year'] = stock_df['Date'].dt.year
stock_df['Day_of_week'] = stock_df['Date'].dt.weekday
stock_df['Quarter'] = stock_df['Date'].dt.quarter

In [9]:
# Calculate moving average for 50 days and 200 days
stock_df["50_Day_MA"] = stock_df["Close"].rolling(window=50).mean()
stock_df["200_Day_MA"] = stock_df["Close"].rolling(window=200).mean()

In [10]:
# Calculating lag features for previous day data
stock_df["Prev_close"] = stock_df["Close"].shift(1)
stock_df["Prev_volume"] = stock_df["Volume"].shift(1)


In [11]:
# Calculating daily price change
stock_df["Price_change"] = stock_df["Close"] - stock_df["Prev_close"]

In [12]:
# Calculate percentage change
stock_df["Price_change"] = (stock_df["Close"] - stock_df["Prev_close"])/(stock_df["Prev_close"])*100

In [13]:
# Normalise numerical features
scaler = MinMaxScaler()
stock_df[["Open","High","Low","Close","Volume","50_Day_MA","200_Day_MA"]] = scaler.fit_transform(stock_df[["Open","High","Low","Close","Volume","50_Day_MA","200_Day_MA"]])

# ***Buidling Machine learning Model*** 

***Linear regression model***

In [14]:
# Prepare the data
features = ['Open', 'High', 'Low', 'Volume']
X = stock_df[features]
y = stock_df['Close']

In [15]:
# Split data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)


In [16]:
# Initialize and train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)


In [17]:
# Make predictions
y_pred = model.predict(X_test)

In [18]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Linear Regression - MSE: {mse}, R²: {r2}")

Linear Regression - MSE: 1.2948613761351601e-05, R²: 0.9997815479980221


***XGBoost Model***

In [19]:
# Prepare the data
features = ['Open', 'High', 'Low', 'Volume']
X = stock_df[features]
y = stock_df['Close']

In [20]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)


In [21]:
# Initialize and train the XGBoost model
model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, max_depth=5)
model.fit(X_train, y_train)

In [22]:
# Make predictions
y_pred = model.predict(X_test)

In [23]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"XGBoost - MSE: {mse}, R²: {r2}")

XGBoost - MSE: 0.09596707944008136, R²: -0.6190305011822597


***LSTM Model***

In [24]:
# Prepare the data
features = ['Open', 'High', 'Low', 'Volume']
X = stock_df[features]
y = stock_df['Close']

In [25]:
# Reshaping data into 3D array for LSTM
X = X.to_numpy()
X = X.reshape((X.shape[0], 1, X.shape[1]))

In [26]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)


In [27]:
# Build the LSTM model
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(units=50, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(units=1))  # Output layer


  super().__init__(**kwargs)


In [28]:
# Compile and fit the model
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(X_train, y_train, epochs=10, batch_size=32)

Epoch 1/10
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 9.1274e-05
Epoch 2/10
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 2.5473e-06
Epoch 3/10
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 2.4471e-06
Epoch 4/10
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 2.0549e-06
Epoch 5/10
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 2.2037e-06
Epoch 6/10
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 2.0258e-06
Epoch 7/10
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.6989e-06
Epoch 8/10
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 2.2891e-06
Epoch 9/10
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 1.5212e-06
Epoch 10/10
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[

<keras.src.callbacks.history.History at 0x7b331b77cc10>

In [29]:
# Make predictions
y_pred = model.predict(X_test)


[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step


In [30]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"LSTM - MSE: {mse}")

LSTM - MSE: 0.001931753508023707


***From the above analysis we can conclude that LSTM model perform best as it has the lowest MSE value as compared to other models***