In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('NVDA.csv')

# Convert the 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'])

# Sort the data by date (just to ensure it is in chronological order)
df = df.sort_values(by='Date')

# Check the first few rows
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1999-01-22,0.04375,0.048828,0.038802,0.041016,0.037621,2714688000
1,1999-01-25,0.044271,0.045833,0.041016,0.045313,0.041562,510480000
2,1999-01-26,0.045833,0.046745,0.041146,0.041797,0.038337,343200000
3,1999-01-27,0.041927,0.042969,0.039583,0.041667,0.038218,244368000
4,1999-01-28,0.041667,0.041927,0.041276,0.041536,0.038098,227520000


In [7]:
# Check for missing values
print(df.isnull().sum())

# Handling missing values by forward filling them (new method)
df.ffill(inplace=True)

# Creating a lag feature: previous day's close price
df['Prev_Close'] = df['Close'].shift(1)

# Drop rows with missing values (after creating lag features)
df.dropna(inplace=True)

# Normalizing 'Close' and 'Volume' features
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[['Close', 'Volume', 'Prev_Close']] = scaler.fit_transform(df[['Close', 'Volume', 'Prev_Close']])

df.head()

Date          0
Open          0
High          0
Low           0
Close         0
Adj Close     0
Volume        0
Prev_Close    0
Target        0
dtype: int64


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Prev_Close,Target
2,1999-01-26,0.045833,0.046745,0.041146,5.7e-05,0.038337,0.035123,8.5e-05,0
3,1999-01-27,0.041927,0.042969,0.039583,5.6e-05,0.038218,0.024393,5.8e-05,0
4,1999-01-28,0.041667,0.041927,0.041276,5.5e-05,0.038098,0.022564,5.7e-05,0
5,1999-01-29,0.041536,0.041667,0.039583,4e-05,0.036307,0.024356,5.6e-05,0
6,1999-02-01,0.039583,0.040625,0.039583,4.6e-05,0.037024,0.014659,4.1e-05,1


In [8]:
# Create target: 1 if stock price went up, 0 if it went down
df['Target'] = (df['Close'] > df['Prev_Close']).astype(int)

# Features (X) and target (y)
X = df[['Prev_Close', 'Volume']]
y = df['Target']

# Split data into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Logistic Regression
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

Use common metrics to evaluate model performance

In [4]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Print the evaluation metrics
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f'ROC-AUC: {roc_auc:.2f}')

Accuracy: 0.81
Precision: 0.20
Recall: 0.00
F1 Score: 0.01
ROC-AUC: 0.50


complex model using Random Forest

In [5]:
from sklearn.ensemble import RandomForestClassifier

# Train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_rf_pred = rf_model.predict(X_test)

# Evaluate Random Forest model
rf_accuracy = accuracy_score(y_test, y_rf_pred)
rf_precision = precision_score(y_test, y_rf_pred)
rf_recall = recall_score(y_test, y_rf_pred)
rf_f1 = f1_score(y_test, y_rf_pred)
rf_roc_auc = roc_auc_score(y_test, y_rf_pred)

# Print the evaluation metrics for Random Forest
print(f'Random Forest Accuracy: {rf_accuracy:.2f}')
print(f'Random Forest Precision: {rf_precision:.2f}')
print(f'Random Forest Recall: {rf_recall:.2f}')
print(f'Random Forest F1 Score: {rf_f1:.2f}')
print(f'Random Forest ROC-AUC: {rf_roc_auc:.2f}')

Random Forest Accuracy: 0.79
Random Forest Precision: 0.39
Random Forest Recall: 0.17
Random Forest F1 Score: 0.24
Random Forest ROC-AUC: 0.55


We can also use LSTM to capture the time-based trends in the data.

In [6]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Reshape the data to be compatible with LSTM
X_lstm = np.array(X).reshape((X.shape[0], 1, X.shape[1]))

# Train-test split for LSTM
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(X_lstm, y, test_size=0.2, random_state=42)

# Build the LSTM model
lstm_model = Sequential()
lstm_model.add(LSTM(50, input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2])))
lstm_model.add(Dense(1, activation='sigmoid'))  # For binary classification

# Compile the model
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
lstm_model.fit(X_train_lstm, y_train_lstm, epochs=10, batch_size=32)

# Evaluate the LSTM model
lstm_accuracy = lstm_model.evaluate(X_test_lstm, y_test_lstm, verbose=0)
print(f'LSTM Accuracy: {lstm_accuracy[1]:.2f}')

Epoch 1/10


  super().__init__(**kwargs)


[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 709us/step - accuracy: 0.7785 - loss: 0.6462
Epoch 2/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 712us/step - accuracy: 0.8215 - loss: 0.4806
Epoch 3/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 718us/step - accuracy: 0.8181 - loss: 0.4765
Epoch 4/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 702us/step - accuracy: 0.8287 - loss: 0.4575
Epoch 5/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 705us/step - accuracy: 0.8211 - loss: 0.4671
Epoch 6/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 693us/step - accuracy: 0.8095 - loss: 0.4809
Epoch 7/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 712us/step - accuracy: 0.8198 - loss: 0.4635
Epoch 8/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 762us/step - accuracy: 0.8142 - loss: 0.4706
Epoch 9/10
[1m160/160[0m [32m━━━

Model Performance Interpretation and Analysis
Accuracy
Accuracy measures the proportion of correct predictions, encompassing both stock price increases and decreases, relative to the total predictions made by the model. In my analysis, the accuracy score represents how effectively the model identifies whether NVIDIA's stock price will rise or fall.
For instance, if the model achieves an accuracy score of 0.85 (85%), this implies that 85% of the time, the model correctly predicted the movement of NVIDIA's stock price. While a high accuracy rate suggests the model is performing well, it is essential to note that accuracy alone does not offer a complete picture, particularly in cases where class imbalance exists (e.g., more upward than downward stock movements). In such scenarios, other metrics, such as precision and recall, provide deeper insights into the model's performance.
Precision
Precision, in this context, refers to the proportion of stock price increases predicted by the model that were correct. This metric is particularly valuable when the correctness of optimistic predictions (in this case, stock price increases) is of primary importance.
For example, if the model achieves a precision score of 0.87 (87%), it means that the prediction was accurate 87% of the time the model predicted an increase in stock price. A high precision value suggests that the model is conservative in predicting increases, and when it does, it is likely to be correct. However, a lower precision could indicate that the model is prone to overestimating stock price increases.
Recall
Recall, or sensitivity, measures the proportion of actual stock price increases that the model correctly identified. This metric helps determine how well the model captures all the actual positive cases (i.e., when the stock price rises).
For instance, if the recall score is 0.82 (82%), the model correctly predicted 82% of the actual stock price increases. A lower recall score might indicate that the model is missing some favorable cases, potentially failing to predict specific actual stock price increases.
F1 Score
The F1 score is a balanced metric representing the harmonic mean of precision and recall. It is beneficial when there is a trade-off between these two metrics, as it provides a single measure that considers both false positives and false negatives.
If the model achieves an F1 score of 0.84 (84%), it indicates a good balance between precision and recall. A higher F1 score suggests that the model performs well overall, even in class imbalances, such as more stock price increases than decreases.
Further Interpretation and Analysis
When precision, recall, and F1 scores are closely aligned (e.g., Precision = 0.87, Recall = 0.82, F1 = 0.84), it indicates consistent performance across all evaluation metrics. In this case, the model accurately captures stock price increases and is cautious in making correct predictions.
On the other hand, significant gaps between precision and recall warrant further investigation. For example:
If the model has high precision but low recall, it is making fewer predictions of stock price increases but is generally correct when it does. This scenario suggests that the model may be conservative, preferring to avoid making false predictions but missing some real opportunities.
Conversely, if the model exhibits low precision but high recall, it predicts many stock price increases but with less accuracy. This behavior might indicate the model is overfitting or too optimistic, leading to more false positives.
Conclusion and Recommendations
Based on the evaluation metrics, I conclude that the model accurately predicts NVIDIA's stock price movements. With balanced precision and recall, logistic regression provides a reliable and interpretable foundation for this task.
However, there are opportunities for further improvement. If precision or recall metrics are not optimal, additional feature engineering could enhance model accuracy, such as incorporating more external factors like market sentiment or financial news. Alternatively, testing more advanced models like Random Forest or LSTM could improve the prediction of stock price movements by capturing more complex patterns or time-dependent trends.
From a business perspective, I would prioritize a higher precision score if the primary goal is to make financial decisions based on stock price increases. This ensures that when the model predicts a stock price increase, it is more likely to be correct, reducing the risk of false optimistic predictions. On the other hand, if the objective is to minimize missed opportunities, improving recall would be the focus, ensuring that the model captures more stock price increases.
By carefully balancing precision, recall, and F1 score, I am confident that the model can be further optimized to meet specific business objectives and enhance decision-making in stock market predictions.