In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
#loading dataset
df = pd.read_csv('nvidia_stock_2015_to_2024.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,date,open,high,low,close,adjclose,volume
0,0,2015-01-02,0.50325,0.507,0.49525,0.50325,0.483218,113680000
1,1,2015-01-05,0.50325,0.50475,0.4925,0.49475,0.475056,197952000
2,2,2015-01-06,0.4955,0.496,0.47925,0.47975,0.460654,197764000
3,3,2015-01-07,0.48325,0.4875,0.477,0.4785,0.459453,321808000
4,4,2015-01-08,0.484,0.4995,0.48375,0.4965,0.476737,283780000


In [3]:
df.describe()

Unnamed: 0.1,Unnamed: 0,open,high,low,close,adjclose,volume
count,2369.0,2369.0,2369.0,2369.0,2369.0,2369.0,2369.0
mean,1184.0,14.188477,14.445906,13.919531,14.197878,14.169242,478752600.0
std,684.015716,18.683473,19.020099,18.31567,18.691694,18.697431,255987400.0
min,0.0,0.48125,0.4875,0.4735,0.4785,0.459453,52448000.0
25%,592.0,2.99825,3.0455,2.9475,3.03225,2.992067,318800000.0
50%,1184.0,6.1915,6.26275,6.0915,6.17825,6.1299,427796000.0
75%,1776.0,17.915001,18.243999,17.634001,17.983999,17.963766,574588000.0
max,2368.0,114.650002,115.819,110.901001,114.824997,114.815567,3692928000.0


In [4]:
df.columns

Index(['Unnamed: 0', 'date', 'open', 'high', 'low', 'close', 'adjclose',
       'volume'],
      dtype='object')

In [5]:
df.isna().sum()

Unnamed: 0    0
date          0
open          0
high          0
low           0
close         0
adjclose      0
volume        0
dtype: int64

In [6]:
df.drop(columns=['Unnamed: 0','adjclose'], inplace = True)

In [7]:
df

Unnamed: 0,date,open,high,low,close,volume
0,2015-01-02,0.503250,0.507000,0.495250,0.503250,113680000
1,2015-01-05,0.503250,0.504750,0.492500,0.494750,197952000
2,2015-01-06,0.495500,0.496000,0.479250,0.479750,197764000
3,2015-01-07,0.483250,0.487500,0.477000,0.478500,321808000
4,2015-01-08,0.484000,0.499500,0.483750,0.496500,283780000
...,...,...,...,...,...,...
2364,2024-05-24,104.448997,106.474998,103.000000,106.469002,429494000
2365,2024-05-28,110.244003,114.939003,109.883003,113.901001,652728000
2366,2024-05-29,113.050003,115.491997,110.901001,114.824997,557442000
2367,2024-05-30,114.650002,115.819000,109.663002,110.500000,487350000


In [8]:
# Feature Extraction
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day_of_the_week'] = df['date'].dt.dayofweek

df['Average_price'] = (df['high']/df['low'])/2
df['Price_Range'] = df['high'] - df['low']

In [9]:
df.drop(columns=['date'],inplace= True)

In [10]:
# Defing traing data and test data
X = df[['open', 'close', 'volume', 'year', 'month', 'day_of_the_week', 'Average_price', 'Price_Range']]
y = df['close']

In [11]:
# Splitting the dataset 
(X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)

In [13]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
print(y_pred)

Mean Squared Error: 0.034856195740833755
[  4.25449991   0.59299999  24.06299973   4.92000008   4.26149988
  13.13000011   0.71749997  22.20299911   0.92825001  24.22800064
   2.93300009   5.35349989   1.74899995   4.92000008   2.5462501
   0.8075      10.05224991   0.5535       2.64575005  14.92650032
  19.03199959   0.52525002   2.4607501    4.52675009   0.79325002
   5.93849993   5.01774979   0.52525002  16.50099945  16.29500008
   6.22700024   0.63550001   8.75024986   4.09224987  19.90500069
  14.24825001   5.24025011  15.8380003    3.72799993   4.76674986
  13.66524982  17.42499924   0.79325002  17.94199944   3.34450006
  31.67499924   1.69324994  13.52474976   6.18200016   6.56449986
  46.83499908   6.19224977  16.61000061   3.74675012   6.37099981
   1.31949997   2.34124994   3.78625011   1.57274997   1.11000001
   1.50999999   6.05375004   4.19675016   4.89225006  45.46900177
   0.58125001  48.7840004   17.42499924   1.62549996  15.8579998
   1.65324998  13.73649979   2.665999

In [14]:
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

r2 = r2_score(y_test, y_pred)
print("R-squared (R²):", r2)


Mean Absolute Error (MAE): 0.055430929872054044
R-squared (R²): 0.9998866788708963


In [15]:
import pickle

In [21]:
# Save the model to a file
with open(r'/Users\Siddhi Sankhe\Desktop\Compozent Task\ML_Project\ML_model.pkl', 'wb') as file:
    pickle.dump(model, file)
