# Stock price prediction using Machine Learning

### Comparing several Machine Learning regression models to find out which one is the best fit for predicting stock price development, using historical data of Microsoft Corporation Common Stock (MSFT) as an example

Source: https://finance.yahoo.com/quote/MSFT/history/

### Data preparation

### Importing libraries

In [445]:
import numpy as np
import plotly.express as px
import pandas as pd

In [446]:
# Read the MSFT price history CSV into a DataFrame and display it.
csv_path = 'MSFT_5Y.csv'
dataset = pd.read_csv(csv_path)
dataset

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2018-12-26,95.139999,100.690002,93.959999,100.559998,95.491508,51634800
1,2018-12-27,99.300003,101.190002,96.400002,101.180000,96.080238,49498500
2,2018-12-28,102.089996,102.410004,99.519997,100.389999,95.330048,38196300
3,2018-12-31,101.290001,102.400002,100.440002,101.570000,96.450569,33173800
4,2019-01-02,99.550003,101.750000,98.940002,101.120003,96.023247,35329300
...,...,...,...,...,...,...,...
1253,2023-12-18,369.450012,373.000000,368.679993,372.649994,372.649994,21802900
1254,2023-12-19,371.489990,373.260010,369.839996,373.260010,373.260010,20603700
1255,2023-12-20,375.000000,376.029999,370.529999,370.619995,370.619995,26316700
1256,2023-12-21,372.559998,374.410004,370.040009,373.540009,373.540009,17708000


### Checking data types and nan values

In [447]:
# Work on a copy of the loaded frame, then count the missing values per column.
df = dataset.copy()
df.isnull().sum()

Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

In [448]:
# Show the dtype of every column (Date is still an object column at this point).
df.dtypes

Date          object
Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume         int64
dtype: object

### Dropping unnecessary columns and setting date as an index

In [449]:
# Volume and Adj Close are not used in the rest of the analysis; drop them.
df = df.drop(columns=['Volume', 'Adj Close'])

In [450]:
# Parse the Date strings into timestamps and use them as the row index.
df = df.assign(Date=pd.to_datetime(df['Date'])).set_index('Date')
df

Unnamed: 0_level_0,Open,High,Low,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-12-26,95.139999,100.690002,93.959999,100.559998
2018-12-27,99.300003,101.190002,96.400002,101.180000
2018-12-28,102.089996,102.410004,99.519997,100.389999
2018-12-31,101.290001,102.400002,100.440002,101.570000
2019-01-02,99.550003,101.750000,98.940002,101.120003
...,...,...,...,...
2023-12-18,369.450012,373.000000,368.679993,372.649994
2023-12-19,371.489990,373.260010,369.839996,373.260010
2023-12-20,375.000000,376.029999,370.529999,370.619995
2023-12-21,372.559998,374.410004,370.040009,373.540009


### Plot a line chart of open and close stock prices for each day

In [451]:
# Draw the daily Open and Close series against the date index.
price_columns = ['Open', 'Close']
fig = px.line(
    df[price_columns],
    x=df.index,
    y=price_columns,
    labels={'value': 'MSFT'},
    title='Line Chart with Open and Close Values of MSFT',
    line_shape='linear',
    render_mode='svg',
)
fig.show()

### Dividing the data frame into dependent and independent values

In [452]:
# Features are Open, High and Low; the target is Close (the last column of df).
X = df.drop(columns='Close').to_numpy()
y = df['Close'].to_numpy()

### Now let's try different types of regression and find out which one is the best fit for our dataset

### 1. Support vector regression
First, reshape `y` from a 1-D array into a column vector (one row per sample, one column), because the `StandardScaler` used below expects 2-D input

In [453]:
# Column-vector view of the target: one row per sample, a single column.
y_svr = y.reshape(-1, 1)

#### Splitting dataset into training set and test set

In [454]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train_svr, X_test_svr, y_train_svr, y_test_svr = train_test_split(
    X,
    y_svr,
    test_size=0.2,
    random_state=0,
)

#### Feature scaling

In [455]:
from sklearn.preprocessing import StandardScaler

# Fit one scaler per array on the training data only, then replace the
# training arrays with their standardized versions.
sc_X = StandardScaler().fit(X_train_svr)
X_train_svr = sc_X.transform(X_train_svr)
sc_y = StandardScaler().fit(y_train_svr)
y_train_svr = sc_y.transform(y_train_svr)

#### Training SVR model on training set

In [456]:
from sklearn.svm import SVR

sv_regressor = SVR(kernel='rbf')
# SVR.fit expects a 1-D target of shape (n_samples,). y_train_svr is a column
# vector (it was kept 2-D for StandardScaler), so flatten it here to avoid
# scikit-learn's "column-vector y was passed" warning.
sv_regressor.fit(X_train_svr, y_train_svr.ravel())


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



#### Predicting test set results

In [457]:
# Scale the test features with the scaler fitted on the training split,
# predict in scaled space, then map the predictions back to price units.
scaled_predictions = sv_regressor.predict(sc_X.transform(X_test_svr))
y_pred_svr = sc_y.inverse_transform(scaled_predictions.reshape(-1, 1))
np.set_printoptions(precision=2)
# Side-by-side view: predicted close (left) vs. actual close (right).
# Compare against y_test_svr, the target from this section's own split;
# `y_test` is only created later, in the linear-regression section.
np.column_stack((y_pred_svr, y_test_svr))

array([[108.3 ,  97.4 ],
       [212.19, 210.52],
       [113.58, 114.5 ],
       [243.87, 245.03],
       [137.1 , 139.14],
       [109.59, 106.71],
       [161.92, 157.71],
       [214.23, 212.25],
       [188.51, 188.36],
       [246.71, 249.07],
       [251.39, 249.31],
       [329.07, 333.2 ],
       [277.67, 278.85],
       [215.13, 216.23],
       [326.05, 323.01],
       [318.  , 315.26],
       [300.51, 304.06],
       [331.27, 327.78],
       [127.07, 130.6 ],
       [336.4 , 338.37],
       [265.71, 262.97],
       [133.43, 133.43],
       [303.85, 307.29],
       [215.1 , 216.01],
       [326.85, 326.66],
       [143.44, 144.19],
       [255.89, 253.14],
       [251.92, 254.08],
       [223.05, 222.86],
       [279.96, 281.4 ],
       [132.92, 136.13],
       [304.46, 307.26],
       [339.68, 337.99],
       [112.38, 112.26],
       [282.11, 277.35],
       [298.65, 301.37],
       [299.9 , 304.1 ],
       [326.1 , 328.65],
       [212.24, 210.39],
       [138.34, 140.4 ],


#### Evaluating the model performance

In [458]:
from sklearn.metrics import r2_score

# R^2 of the SVR predictions against the held-out targets.
r2_score(y_true=y_test_svr, y_pred=y_pred_svr)

0.9985514319922093

### 2. Multiple linear regression

#### Splitting dataset into training set and test set

In [459]:
# Same 80/20 split and seed as before, this time with the 1-D target `y`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

#### Training the Multiple Linear Regression model on the Training set

In [460]:
from sklearn.linear_model import LinearRegression

# Fit a linear model on the unscaled training split.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

#### Predicting the Test set results

In [461]:
y_pred_mlr = regressor.predict(X_test)
# Side-by-side view: this model's predicted close (left) vs. actual close (right).
# Uses y_pred_mlr, the predictions computed on the line above.
np.column_stack((y_pred_mlr, y_test))

array([[ 97.77,  97.4 ],
       [211.86, 210.52],
       [114.67, 114.5 ],
       [242.77, 245.03],
       [139.03, 139.14],
       [106.32, 106.71],
       [161.57, 157.71],
       [215.78, 212.25],
       [187.  , 188.36],
       [248.94, 249.07],
       [250.03, 249.31],
       [330.83, 333.2 ],
       [276.58, 278.85],
       [214.67, 216.23],
       [320.8 , 323.01],
       [318.45, 315.26],
       [303.53, 304.06],
       [330.9 , 327.78],
       [130.33, 130.6 ],
       [338.84, 338.37],
       [262.05, 262.97],
       [133.76, 133.43],
       [306.97, 307.29],
       [215.73, 216.01],
       [329.52, 326.66],
       [144.84, 144.19],
       [255.54, 253.14],
       [250.6 , 254.08],
       [223.03, 222.86],
       [279.97, 281.4 ],
       [136.2 , 136.13],
       [307.8 , 307.26],
       [338.69, 337.99],
       [111.41, 112.26],
       [276.79, 277.35],
       [302.5 , 301.37],
       [302.94, 304.1 ],
       [327.74, 328.65],
       [211.42, 210.39],
       [142.2 , 140.4 ],


#### Evaluating the Model Performance

In [462]:
# R^2 of the multiple linear regression predictions on the test split.
r2_score(y_true=y_test, y_pred=y_pred_mlr)

0.9995115477497789

### 3. Polynomial regression

#### Training the Polynomial Regression model on the Training set

In [463]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the training features into polynomial terms up to degree 4, then fit
# a linear model on the expanded matrix.
plf = PolynomialFeatures(degree=4)
X_poly = plf.fit_transform(X_train)
pol_lin_regressor = LinearRegression()
pol_lin_regressor.fit(X=X_poly, y=y_train)

#### Predicting the Test set results

In [464]:
# Apply the already-fitted polynomial expansion to the test features, predict,
# and show predicted close (left) next to actual close (right).
X_test_poly = plf.transform(X_test)
y_pred_plr = pol_lin_regressor.predict(X_test_poly)
np.column_stack((y_pred_plr, y_test))

array([[ 96.24,  97.4 ],
       [212.38, 210.52],
       [115.12, 114.5 ],
       [243.27, 245.03],
       [139.57, 139.14],
       [106.43, 106.71],
       [162.54, 157.71],
       [214.98, 212.25],
       [187.39, 188.36],
       [246.76, 249.07],
       [250.27, 249.31],
       [330.45, 333.2 ],
       [277.16, 278.85],
       [215.14, 216.23],
       [318.27, 323.01],
       [318.19, 315.26],
       [303.26, 304.06],
       [329.95, 327.78],
       [130.97, 130.6 ],
       [339.23, 338.37],
       [262.45, 262.97],
       [131.29, 133.43],
       [306.63, 307.29],
       [215.97, 216.01],
       [328.4 , 326.66],
       [145.34, 144.19],
       [255.96, 253.14],
       [251.56, 254.08],
       [223.5 , 222.86],
       [280.08, 281.4 ],
       [136.87, 136.13],
       [307.35, 307.26],
       [338.81, 337.99],
       [110.39, 112.26],
       [276.45, 277.35],
       [301.5 , 301.37],
       [302.53, 304.1 ],
       [327.71, 328.65],
       [211.8 , 210.39],
       [144.23, 140.4 ],


#### Evaluating the Model Performance

In [465]:
# R^2 of the polynomial regression predictions on the test split.
r2_score(y_true=y_test, y_pred=y_pred_plr)

0.9993399501982351

### 4. Decision tree regression

#### Training the Decision Tree Regression model on the Training set

In [466]:
from sklearn.tree import DecisionTreeRegressor

# Fit a single regression tree on the training split; the seed is fixed for
# repeatable results.
dt_regressor = DecisionTreeRegressor(random_state=0)
dt_regressor.fit(X=X_train, y=y_train)

#### Predicting the Test set results

In [467]:
# Predict the test split and show predicted close (left) next to actual close (right).
y_pred_dtr = dt_regressor.predict(X_test)
np.column_stack((y_pred_dtr, y_test))

array([[101.12,  97.4 ],
       [212.42, 210.52],
       [114.59, 114.5 ],
       [241.73, 245.03],
       [139.54, 139.14],
       [106.2 , 106.71],
       [162.28, 157.71],
       [213.02, 212.25],
       [188.94, 188.36],
       [242.26, 249.07],
       [249.68, 249.31],
       [331.85, 333.2 ],
       [277.42, 278.85],
       [215.37, 216.23],
       [319.97, 323.01],
       [321.01, 315.26],
       [304.21, 304.06],
       [329.01, 327.78],
       [129.77, 130.6 ],
       [339.71, 338.37],
       [262.15, 262.97],
       [133.98, 133.43],
       [302.38, 307.29],
       [214.8 , 216.01],
       [326.79, 326.66],
       [145.96, 144.19],
       [253.25, 253.14],
       [255.29, 254.08],
       [223.94, 222.86],
       [279.93, 281.4 ],
       [135.69, 136.13],
       [310.2 , 307.26],
       [338.7 , 337.99],
       [112.36, 112.26],
       [277.66, 277.35],
       [296.03, 301.37],
       [303.59, 304.1 ],
       [326.79, 328.65],
       [212.42, 210.39],
       [146.57, 140.4 ],


#### Evaluating the Model Performance

In [468]:
# R^2 of the decision tree predictions on the test split.
r2_score(y_true=y_test, y_pred=y_pred_dtr)

0.9988821632499932

### 5. Random forest regression

#### Training the Random Forest Regression model on the Training set

In [469]:
from sklearn.ensemble import RandomForestRegressor

# Fit an ensemble of 10 trees on the training split; the seed is fixed for
# repeatable results.
rf_regressor = RandomForestRegressor(n_estimators=10, random_state=0)
rf_regressor.fit(X=X_train, y=y_train)

#### Predicting the Test set results

In [470]:
# Predict the test split and show predicted close (left) next to actual close (right).
y_pred_rfr = rf_regressor.predict(X_test)
np.column_stack((y_pred_rfr, y_test))

array([[100.94,  97.4 ],
       [212.83, 210.52],
       [115.22, 114.5 ],
       [243.33, 245.03],
       [139.2 , 139.14],
       [106.11, 106.71],
       [161.41, 157.71],
       [214.41, 212.25],
       [188.2 , 188.36],
       [247.25, 249.07],
       [250.43, 249.31],
       [331.64, 333.2 ],
       [277.59, 278.85],
       [214.31, 216.23],
       [321.2 , 323.01],
       [319.12, 315.26],
       [304.84, 304.06],
       [329.04, 327.78],
       [129.81, 130.6 ],
       [339.  , 338.37],
       [264.08, 262.97],
       [133.85, 133.43],
       [304.01, 307.29],
       [214.88, 216.01],
       [329.77, 326.66],
       [145.67, 144.19],
       [253.48, 253.14],
       [253.33, 254.08],
       [222.88, 222.86],
       [279.72, 281.4 ],
       [135.45, 136.13],
       [306.47, 307.26],
       [338.65, 337.99],
       [112.55, 112.26],
       [279.18, 277.35],
       [299.02, 301.37],
       [301.18, 304.1 ],
       [330.7 , 328.65],
       [212.79, 210.39],
       [140.77, 140.4 ],


#### Evaluating the Model Performance

In [471]:
# R^2 of the random forest predictions on the test split.
r2_score(y_true=y_test, y_pred=y_pred_rfr)

0.999216130134255

### Conclusion

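Multiple Linear Regression, with `r2 score = 0.9995115477497789`, appears to be the best fit for predicting stock prices in our particular case, although Random Forest and Polynomial Regression also performed very well. Note that the features are the same day's Open, High and Low prices and the train/test split is random, so these scores measure how well the Close can be reconstructed from same-day prices rather than how well future prices can be forecast.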