In [164]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [165]:
# Load the price history from NAS.csv (path is relative to the working directory).
data = pd.read_csv("NAS.csv")

In [166]:
# Preview the first 10 rows of the raw data.
data.head(10)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2003-12-18,19.482599,19.596901,19.025499,19.139799,19.139799,4978496.0
1,2003-12-19,19.368299,19.425501,18.282801,18.454201,18.454201,1410901.0
2,2003-12-22,18.739901,18.739901,17.997101,18.0543,18.0543,137047.0
3,2003-12-23,17.997101,17.997101,17.3687,17.4258,17.4258,229418.0
4,2003-12-24,,,,,,
5,2003-12-25,,,,,,
6,2003-12-26,,,,,,
7,2003-12-29,17.4258,17.4258,16.8545,17.2544,17.2544,196206.0
8,2003-12-30,17.4258,17.8829,17.4258,17.7115,17.7115,67298.0
9,2003-12-31,,,,,,


In [167]:
# Inspect column dtypes and non-null counts to see how many values are missing.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4253 entries, 0 to 4252
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       4253 non-null   object 
 1   Open       4218 non-null   float64
 2   High       4218 non-null   float64
 3   Low        4218 non-null   float64
 4   Close      4218 non-null   float64
 5   Adj Close  4218 non-null   float64
 6   Volume     4218 non-null   float64
dtypes: float64(6), object(1)
memory usage: 232.7+ KB


In [168]:
# Percentage of missing values in each column.
data.isna().mean() * 100

Date         0.000000
Open         0.822949
High         0.822949
Low          0.822949
Close        0.822949
Adj Close    0.822949
Volume       0.822949
dtype: float64

In [169]:
# Roughly 0.8% of the values in each price column are missing, so drop every
# row that contains at least one NaN. The original row index is kept as-is.
data.dropna(axis=0, how="any", inplace=True)
data.head(10)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2003-12-18,19.482599,19.596901,19.025499,19.139799,19.139799,4978496.0
1,2003-12-19,19.368299,19.425501,18.282801,18.454201,18.454201,1410901.0
2,2003-12-22,18.739901,18.739901,17.997101,18.0543,18.0543,137047.0
3,2003-12-23,17.997101,17.997101,17.3687,17.4258,17.4258,229418.0
7,2003-12-29,17.4258,17.4258,16.8545,17.2544,17.2544,196206.0
8,2003-12-30,17.4258,17.8829,17.4258,17.7115,17.7115,67298.0
11,2004-01-02,17.940001,18.0543,17.7686,17.7686,17.7686,322923.0
12,2004-01-05,18.0543,18.0543,17.7115,17.7115,17.7115,182782.0
13,2004-01-06,17.940001,17.997101,17.5972,17.5972,17.5972,300054.0
14,2004-01-07,17.4258,17.7115,17.1401,17.1401,17.1401,189937.0


In [170]:
# Sanity check: every column should now report 0% missing values.
data.isna().mean() * 100

Date         0.0
Open         0.0
High         0.0
Low          0.0
Close        0.0
Adj Close    0.0
Volume       0.0
dtype: float64

In [171]:
# A full date string is awkward to feed to the model, so replace the Date
# column with separate numeric year / month / day columns.
dates = pd.to_datetime(data["Date"])
data = data.assign(
    year=dates.dt.year,
    month=dates.dt.month,
    day=dates.dt.day,
).drop(columns="Date")
data.head()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,year,month,day
0,19.482599,19.596901,19.025499,19.139799,19.139799,4978496.0,2003,12,18
1,19.368299,19.425501,18.282801,18.454201,18.454201,1410901.0,2003,12,19
2,18.739901,18.739901,17.997101,18.0543,18.0543,137047.0,2003,12,22
3,17.997101,17.997101,17.3687,17.4258,17.4258,229418.0,2003,12,23
7,17.4258,17.4258,16.8545,17.2544,17.2544,196206.0,2003,12,29


In [172]:
# Close is the value we are trying to predict, so move it into its own array.
# The remaining price/volume columns are not supposed to be used as inputs,
# which leaves only year, month and day as features.
labels = data["Close"].to_numpy(copy=True)
unused_columns = ["Open", "High", "Low", "Close", "Adj Close", "Volume"]
data = data.drop(columns=unused_columns).to_numpy(copy=True)

In [173]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# NOTE: train_test_split shuffles rows by default, so the held-out days are
# scattered between the training days rather than coming after them.
data_train, data_test, labels_train, labels_test = train_test_split(
    data, labels, test_size=0.3, random_state=0
)

# Features and labels must have the same number of rows within each split.
split_shapes = {
    "Training Data Shape: ": data_train.shape,
    "Training Labels Shape: ": labels_train.shape,
    "Testing Data Shape: ": data_test.shape,
    "Testing Labels Shape: ": labels_test.shape,
}
for caption, shape in split_shapes.items():
    print(caption, shape)

Training Data Shape:  (2952, 3)
Training Labels Shape:  (2952,)
Testing Data Shape:  (1266, 3)
Testing Labels Shape:  (1266,)


In [174]:
# Looks good: data and labels have the same number of rows in the training split, as well as in the testing split.

In [175]:
# Train the model: a random forest of 500 trees fitted on the training split.
# random_state is fixed so repeated runs build the same forest.
# (RandomForestRegressor is imported in the import cell at the top of the notebook.)
rf = RandomForestRegressor(n_estimators=500, random_state=0)
rf.fit(data_train, labels_train)

RandomForestRegressor(n_estimators=500, random_state=0)

In [176]:
# Predict on the held-out rows and report the mean absolute error,
# which is expressed in the same units as Close.
prediction = rf.predict(data_test)
errors = np.abs(prediction - labels_test)
print("Mean Absolute Error: ", round(errors.mean(), 4))

Mean Absolute Error:  1.8985


In [177]:
# For a regressor, score() returns R² (coefficient of determination) on the
# given data; it is not an accuracy percentage.
r2 = rf.score(data_test, labels_test)
print("Score: ", r2)

Score:  0.9963658664955594


In [178]:
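# Eh... this looks wrong at first: the mean error seems fairly high, so how can the score be 0.996?
# The two numbers are not contradictory. rf.score is R² (the share of the variance in Close that
# the model explains), not an accuracy percentage, so an MAE of ~1.9 price units can still give
# R² ≈ 0.996 if Close varies by far more than that over the full date range.
# Caveat: train_test_split shuffles by default, so test days sit between training days and the
# forest is mostly interpolating; this score does not show it can forecast future prices.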