In [17]:
import warnings
import itertools
import numpy as np
import plotly.graph_objects as go
plt.style.use('fivethirtyeight')
import pandas as pd
from time import time
from scipy import stats

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

Read in and view a summary of the data.

In [18]:
df = pd.read_csv('TSLA.CSV')
df.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0
mean,264.730016,269.043386,260.100135,264.714937,264.714937,6430629.0
std,53.630784,54.446458,52.691178,53.629164,53.629164,3993741.0
min,142.320007,154.970001,141.050003,143.669998,143.669998,710300.0
25%,220.509998,223.712498,217.042503,219.952503,219.952503,3887650.0
50%,252.765,256.72499,249.394996,253.519996,253.519996,5397450.0
75%,311.192497,315.449997,305.377495,310.402511,310.402511,7608600.0
max,386.690002,389.609985,379.350006,385.0,385.0,33649700.0


### EDA

Set the date columns as the dataframe's index and inspect the data for any missing values (although, I know there isn't any).

In [19]:
df = df.set_index('Date')
df.isnull().sum() 

Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

Visualize prices using plotly. Select the 'price' of interest to display in the chart below from the legend. By default, all prices are displayed.

In [20]:
# Create traces
fig = go.Figure()
fig.add_trace(go.Scatter(x=df.index, y=df['Open'],
                    mode='lines',
                    name='Opening Price'))
fig.add_trace(go.Scatter(x=df.index, y=df['High'],
                    mode='lines',
                    name='Highest Price'))
fig.add_trace(go.Scatter(x=df.index, y=df['Low'],
                    mode='lines',
                    name='Lowest Price'))
fig.add_trace(go.Scatter(x=df.index, y=df['Close'],
                    mode='lines', name='Closing Price'))
fig.add_trace(go.Scatter(x=df.index, y=df['Adj Close'],
                    mode='lines', name='C_P Adjusted'))

fig.update_layout(
    autosize=False,
    width=1000,
    height=720,

    )

fig.show()

Let's have a separate chart for the volume of Tesla stock traded in order not to suppress the prices.

In [21]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=df.index, y=df['Volume'],
                    mode='lines', name='Volume Traded'))
fig.update_layout(
    autosize=False,
    width=1000,
    height=720,

    )

fig.show()

Taking a closer look at our target variable (Tesla's Adjusted Closing Price). We'll compute and plot the 91 day moving average on top of the prices to show us the trend of the series.

In [22]:
m_avg = df['Adj Close'].rolling(window = 91).mean()

fig = go.Figure()
fig.add_trace(go.Scatter(x = df.index, y=df['Adj Close'],
                    mode='lines', name='TSLA'))
fig.add_trace(go.Scatter(x = df.index,y=m_avg,
                    mode='lines', name='MA'))
fig.update_layout(
    autosize=False,
    width=1000,
    height=720,

    )

fig.show()

To get an idea of how well Tesla has been performing, we have to compute how much is earned (returned) by holding a stock of Tesla. This is computed here as the ratio of the closing prices a day apart minus 1. With positive indicating the stock did well and negetive implying that it didn't. 

In [23]:
# compute stock return

T_return = df['Close']/df['Close'].shift(1) - 1

fig = go.Figure()
fig.add_trace(go.Scatter(x =df.index, y=T_return,
                    mode='lines', name='TSLA Return'))
fig.update_layout(
    autosize=False,
    width=1000,
    height=720,

    )

fig.show()

Ultimately, we want to have positive retuns when we purchase a stock so the rule of thumb is to **buy low and sell high**. If we're able to following that rule, we'll have the closing price for the current day higher than the closing price for the previous day. 

In [24]:
Avg_return = np.round(T_return.mean()*100, 3)
Risk = np.round(T_return.std()*100,3)
print("The average percentage return and risk on TSLA is %f,%s" %(Avg_return,Risk))

The average percentage return and risk on TSLA is 0.022000,2.804


### Feature Engineering



In [25]:
# Create a new feature, High/Low ratio.
df['H_L'] = df['High']/df['Low']
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,H_L
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014-09-03,287.670013,288.0,280.100006,281.190002,281.190002,6772300,1.028204
2014-09-04,284.01001,291.420013,280.399994,286.040009,286.040009,8341700,1.039301
2014-09-05,282.549988,282.899994,272.51001,277.390015,277.390015,11169900,1.038127
2014-09-08,277.619995,284.880005,277.519989,282.109985,282.109985,5501600,1.026521
2014-09-09,282.98999,285.48999,277.0,278.480011,278.480011,4558800,1.03065


In [26]:
# Compute and show relationships between varaibles

corrmat = df.corr()

fig = go.Figure(data = go.Heatmap( 
                    x = corrmat.columns,
                    y = corrmat.columns,
                    z = corrmat.values, 
                    colorscale = 'rdbu'))

# set figure dimensions
fig.update_layout(
autosize = False,
width = 950,
height = 900)

fig.update_yaxes(automargin = True)
fig.update_xaxes(automargin = True)
fig.show();


Nothing surprising in the plot above, price is inversely related to volume.

The variables selected for stock prediction are 'H_L' and 'Volume'. With that in mind, we'll carry out some model preprocessing tasks. 

1. Drop variables we won't be using. 
2. Separate the target variable (Adj Close)
3. Scale our features prior to model training 
4. Split data into training (Sep 2014 - Dec 2018) and validation (Jan-Aug 2019) sets.

The selected models used for training and prediction are:
1. Simple linear regression
2. Support Vector Regression
3. Randon Forests

In [27]:
# droping columns and separating target 
New_df = df.drop(columns = ['Open','High','Low','Close',],axis = 1)
Adj_close = df['Adj Close']

# Train - test split
x_train = New_df[:-168]
x_test = New_df[-168:]

y_train = Adj_close[:-168]
y_test = Adj_close[-168:]

In [28]:
# scale features
scale = MinMaxScaler()
x_train  = scale.fit_transform(x_train.values)
x_test = scale.transform(x_test.values)


In [29]:
# Fit models
# Simple Linear Regression

l_reg = LinearRegression()
l_reg.fit(x_train,y_train)

# Support Vector Regression
svr = SVR(C = 100, gamma = 0.1, epsilon =0.1)
svr.fit(x_train,y_train)

# random Forest
rf = GradientBoostingRegressor(random_state = 123)
rf.fit(x_train,y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='auto',
                          random_state=123, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [30]:
# observe the mse of the predictions by each model

# linear regression
l_reg_mse = mean_squared_error(y_test,l_reg.predict(x_test))
print("MSE for simple linear regression: %.4f" %l_reg_mse)

# Support Vector Regression
svr_mse = mean_squared_error(y_test,svr.predict(x_test))
print("MSE for Support Vector Regression: %.4f" %svr_mse)

# Random Forest
rf_mse = mean_squared_error(y_test,rf.predict(x_test))
print("MSE for Random Forest: %.4f" %rf_mse)


    

MSE for simple linear regression: 0.0000
MSE for Support Vector Regression: 0.1116
MSE for Random Forest: 0.2120


Our best model is the Simple Linear Regression, its predictions are provided in the table below.

In [31]:
Pred_Table = pd.DataFrame({"Date":New_df.index[-168:],
                           "Linear Regression Predictions": np.round(l_reg.predict(x_test),4)})
Pred_Table

Unnamed: 0,Date,Linear Regression Predictions
0,2019-01-02,310.12
1,2019-01-03,300.36
2,2019-01-04,317.69
3,2019-01-07,334.96
4,2019-01-08,335.35
...,...,...
163,2019-08-26,215.00
164,2019-08-27,214.08
165,2019-08-28,215.59
166,2019-08-29,221.71


In [32]:
# PLot predictions
fig = go.Figure()
fig.add_trace(go.Scatter(x=df.index[:-168], y=df['Adj Close'][:-168],
                    mode='lines',
                    name='Adj Close'))
fig.add_trace(go.Scatter(x=df.index[-168:], y=Pred_Table['Linear Regression Predictions'],
                    mode='lines',
                    name='Prediction'))
fig.update_layout(
    autosize=False,
    width=1000,
    height=720,
    title="Tesla - Adjusted Closing Price"
    )

fig.show()