# Predicting the stock market

Historical data on the price of the S&P500 Index will be used to make predictions about future prices. Predicting whether an index will go up or down will help to forecast how the stock market as a whole will perform. Since stocks tend to correlate with how well the economy as a whole is performing, it can also help to make economic forecasts.

In [170]:
import pandas as pd
import numpy as np
from datetime import datetime

In [171]:
df = pd.read_csv('sphist.csv')
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values('Date', ascending = True)

In [172]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
16589,1950-01-03,16.66,16.66,16.66,16.66,1260000.0,16.66
16588,1950-01-04,16.85,16.85,16.85,16.85,1890000.0,16.85
16587,1950-01-05,16.93,16.93,16.93,16.93,2550000.0,16.93
16586,1950-01-06,16.98,16.98,16.98,16.98,2010000.0,16.98
16585,1950-01-09,17.08,17.08,17.08,17.08,2520000.0,17.08


In [173]:
df.reset_index(inplace = True, drop = True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16590 entries, 0 to 16589
Data columns (total 7 columns):
Date         16590 non-null datetime64[ns]
Open         16590 non-null float64
High         16590 non-null float64
Low          16590 non-null float64
Close        16590 non-null float64
Volume       16590 non-null float64
Adj Close    16590 non-null float64
dtypes: datetime64[ns](1), float64(6)
memory usage: 907.4 KB


### Generating indicators

* The average price from the past 5 days.
* The average price for the past 30 days.
* The average price for the past 365 days.

In [174]:
# create columns
df['avg_5'] = np.nan
df['avg_30'] = np.nan
df['avg_365'] = np.nan

In [175]:
# calculate new indicators
for i in range(0, len(df)):
    if i>=365:
        df['avg_5'][i] = np.mean(df['Close'][i-5:i])
        df['avg_30'][i] = np.mean(df['Close'][i-30:i])
        df['avg_365'][i] = np.mean(df['Close'][i-365:i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [176]:
# left only rows with estimated indicators
df_n = df.iloc[365:,:].copy()
df_n.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close,avg_5,avg_30,avg_365
365,1951-06-19,22.02,22.02,22.02,22.02,1100000.0,22.02,21.8,21.703333,19.447726
366,1951-06-20,21.91,21.91,21.91,21.91,1120000.0,21.91,21.9,21.683,19.462411
367,1951-06-21,21.780001,21.780001,21.780001,21.780001,1100000.0,21.780001,21.972,21.659667,19.476274
368,1951-06-22,21.549999,21.549999,21.549999,21.549999,1340000.0,21.549999,21.96,21.631,19.489562
369,1951-06-25,21.290001,21.290001,21.290001,21.290001,2440000.0,21.290001,21.862,21.599,19.502082


### Making predictions

In [177]:
# generating train and test
train = df_n[df_n["Date"] < datetime(year=2013, month=1, day=1)].copy()
test = df_n[df_n["Date"] >= datetime(year=2013, month=1, day=1)].copy()

In [178]:
# libraries
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse

In [179]:
# train model with selected indicators
lr = LinearRegression()
features = ['avg_5','avg_30', 'avg_365']
lr.fit(train[features], train['Close'])

# make predictions
prediction = lr.predict(test[features])
mse_1 = mse(prediction,test['Close'])
rmse_1 = mse_1**0.5

#print rmse
rmse_1

22.22006532421984

In [180]:
mae = (prediction - test['Close']).abs().mean()
# print mae
mae

16.142439643554862

### Improving error

In [181]:
# adding average volume for past 5 days
df['avg_5_vol'] = np.nan
df['Year'] = np.nan
for i in range(0, len(df)):
    if i>=365:
        df['avg_5_vol'][i] = np.mean(df['Volume'][i-5:i])
        df['Year'][i] = df.iloc[i,0].year

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [182]:
# left only rows with estimated indicators
df_n = df.iloc[365:,:].copy()
df_n.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close,avg_5,avg_30,avg_365,avg_5_vol,Year
365,1951-06-19,22.02,22.02,22.02,22.02,1100000.0,22.02,21.8,21.703333,19.447726,1196000.0,1951.0
366,1951-06-20,21.91,21.91,21.91,21.91,1120000.0,21.91,21.9,21.683,19.462411,1176000.0,1951.0
367,1951-06-21,21.780001,21.780001,21.780001,21.780001,1100000.0,21.780001,21.972,21.659667,19.476274,1188000.0,1951.0
368,1951-06-22,21.549999,21.549999,21.549999,21.549999,1340000.0,21.549999,21.96,21.631,19.489562,1148000.0,1951.0
369,1951-06-25,21.290001,21.290001,21.290001,21.290001,2440000.0,21.290001,21.862,21.599,19.502082,1142000.0,1951.0


In [183]:
# generating train and test
train = df_n[df_n["Date"] < datetime(year=2013, month=1, day=1)].copy()
test = df_n[df_n["Date"] >= datetime(year=2013, month=1, day=1)].copy()

In [186]:
## train with average volume and Year component in Data
# train model with selected indicators
lr2 = LinearRegression()
features = ['avg_5','avg_30', 'avg_365', 'avg_5_vol','Year']
lr2.fit(train[features], train['Close'])

# make predictions
prediction = lr2.predict(test[features])
mse_2 = mse(prediction,test['Close'])
rmse_2 = mse_2**0.5

#print rmse
print( rmse_1, rmse_2)

22.22006532421984 22.241452434792386


In [185]:
mae_2 = (prediction - test['Close']).abs().mean()
# print mae
print(mae, mae_2)

16.142439643554862 16.194872512262982


In [187]:
## train with average volume
# train model with selected indicators
lr2 = LinearRegression()
features = ['avg_5','avg_30', 'avg_365', 'avg_5_vol']
lr2.fit(train[features], train['Close'])

# make predictions
prediction = lr2.predict(test[features])
mse_3 = mse(prediction,test['Close'])
rmse_3 = mse_3**0.5

#print rmse
print( rmse_1, rmse_2, rmse_3)

mae_3 = (prediction - test['Close']).abs().mean()
# print mae
print(mae, mae_2, mae_3)

22.22006532421984 22.241452434792386 22.22399778506925
16.142439643554862 16.194872512262982 16.147417791850625


In [188]:
## train with Year component in Data
# train model with selected indicators
lr2 = LinearRegression()
features = ['avg_5','avg_30', 'avg_365', 'Year']
lr2.fit(train[features], train['Close'])

# make predictions
prediction = lr2.predict(test[features])
mse_4 = mse(prediction,test['Close'])
rmse_4 = mse_4**0.5

#print rmse
print( rmse_1, rmse_2, rmse_3, rmse_4)

mae_4 = (prediction - test['Close']).abs().mean()
# print mae
print(mae, mae_2, mae_3, mae_4)

22.22006532421984 22.241452434792386 22.22399778506925 22.235479853742866
16.142439643554862 16.194872512262982 16.147417791850625 16.18653012933532


Both the Year component in Data and average volume for last 5 days made selected error estimators worse.

### prediction for one day ahead

In [196]:
#reset index
df_n.reset_index(inplace = True, drop = True)
df_n.tail()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close,avg_5,avg_30,avg_365,avg_5_vol,Year
16220,2015-12-01,2082.929932,2103.370117,2082.929932,2102.629883,3712120000.0,2102.629883,2087.024023,2073.984998,2035.531178,3207544000.0,2015.0
16221,2015-12-02,2101.709961,2104.27002,2077.110107,2079.51001,3950640000.0,2079.51001,2090.231982,2076.283993,2035.914082,3232372000.0,2015.0
16222,2015-12-03,2080.709961,2085.0,2042.349976,2049.620117,4306490000.0,2049.620117,2088.306006,2077.908659,2036.234356,3245514000.0,2015.0
16223,2015-12-04,2051.23999,2093.840088,2051.23999,2091.689941,4214910000.0,2091.689941,2080.456006,2078.931331,2036.507343,3536224000.0,2015.0
16224,2015-12-07,2090.419922,2090.419922,2066.780029,2077.070068,4043820000.0,2077.070068,2080.771973,2080.237329,2036.869425,4085838000.0,2015.0


In [202]:
# find first index 03-01-2013
first_index = df_n[df_n["Date"] == datetime(year=2013, month=1, day=3)].index[0]

In [209]:
predictions = []
observations = []
features = ['avg_5','avg_30', 'avg_365']

for i in range(first_index, len(df_n)):
    # generating train and test
    train = df_n.iloc[:i,:].copy()
    test = df_n.iloc[i,:].copy()
    
    ## train model
    # train model with selected indicators
    lr2 = LinearRegression()
    lr2.fit(train[features], train['Close'])

    # make predictions
    prediction = lr2.predict(np.array(test[features]).reshape(1, -1))
    predictions.append(prediction)
    observations.append(test['Close'])  
    
mse_5 = mse(predictions,observations)
rmse_5 = mse_5**0.5

#print rmse
print( rmse_1, rmse_5)

mae_5 = (pd.Series(predictions) - pd.Series(observations)).abs().mean()
# print mae
print(mae, mae_5)

22.22006532421984 22.17030018862854
16.142439643554862 [16.06668509]


Error became smaller with prediction for one day ahead 