In [30]:
import pandas as pd
import numpy as np
from datetime import datetime  
from datetime import timedelta
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

In [47]:
sphist = pd.read_csv("sphist.csv")

sphist['Date'] = pd.to_datetime(arg=sphist['Date'], format='%Y-%m-%d')

sphist.sort_values(by='Date', inplace=True, ascending=False)
sphist.reset_index(inplace=True)

print(sphist.head())
print(sphist.tail())
print(sphist.shape)

   index       Date         Open         High          Low        Close  \
0      0 2015-12-07  2090.419922  2090.419922  2066.780029  2077.070068   
1      1 2015-12-04  2051.239990  2093.840088  2051.239990  2091.689941   
2      2 2015-12-03  2080.709961  2085.000000  2042.349976  2049.620117   
3      3 2015-12-02  2101.709961  2104.270020  2077.110107  2079.510010   
4      4 2015-12-01  2082.929932  2103.370117  2082.929932  2102.629883   

         Volume    Adj Close  
0  4.043820e+09  2077.070068  
1  4.214910e+09  2091.689941  
2  4.306490e+09  2049.620117  
3  3.950640e+09  2079.510010  
4  3.712120e+09  2102.629883  
       index       Date   Open   High    Low  Close     Volume  Adj Close
16585  16585 1950-01-09  17.08  17.08  17.08  17.08  2520000.0      17.08
16586  16586 1950-01-06  16.98  16.98  16.98  16.98  2010000.0      16.98
16587  16587 1950-01-05  16.93  16.93  16.93  16.93  2550000.0      16.93
16588  16588 1950-01-04  16.85  16.85  16.85  16.85  1890000.0     

In [57]:
def get_means(df, start, end, cols):
    means = df[(df['Date']>=start) & (df['Date']<=end)][cols].mean()
    return tuple(means[col] for col in cols)

def get_stds(df, start, end, cols):
    stds = df[(df['Date']>=start) & (df['Date']<=end)][cols].std()
    return tuple(stds[col] for col in cols)

end = sphist.loc[0, 'Date'] - timedelta(days=1)
start = end - timedelta(days=365)

price = get_means(sphist, start, end, ['Close'])
std = get_stds(sphist, start, end, ['Volume'])

print(start, end)
print(price, std)

2014-12-06 00:00:00 2015-12-06 00:00:00
(2061.2550607928288,) (685047739.8036069,)


In [51]:
sphist.loc[0, 'Avg Close 5']

0.0

In [58]:
new_cols = ['Avg Close 5', 'Avg Close 365', 'Ratio Close 5 365', 'Avg Vol 365', 'Std Vol 365']
df_zeros = np.zeros((sphist.shape[0], len(new_cols)))
sphist[new_cols] = pd.DataFrame(data=df_zeros, index=sphist.index)

for i, row in sphist.iterrows():
    end = sphist.loc[i, 'Date'] - timedelta(days=1)
    start_5 = end - timedelta(days=5)
    start_365 = end - timedelta(days=365)
    
    sphist.loc[i, 'Avg Close 5'] = get_means(sphist, start_5, end, ['Close'])[0]
    sphist.loc[i, 'Avg Close 365'], sphist.loc[i, 'Avg Vol 365'] = get_means(sphist, start_365, end, ['Close', 'Volume'])
    #sphist.loc[i, 'Std Vol 365'] = get_stds(sphist, start_365, end, ['Avg Vol 365'])[0]
    sphist.loc[i, 'Ratio Close 5 365'] = sphist.loc[i, 'Avg Close 5']/sphist.loc[i, 'Avg Close 365']        

for i, row in sphist.iterrows():
    end = sphist.loc[i, 'Date'] - timedelta(days=1)
    start_365 = end - timedelta(days=365)
    
    sphist.loc[i, 'Std Vol 365'] = get_stds(sphist, start_365, end, ['Avg Vol 365'])[0]    
    
        
print(sphist.head(6)) 
print(sphist.tail(6))

   index       Date         Open         High          Low        Close  \
0      0 2015-12-07  2090.419922  2090.419922  2066.780029  2077.070068   
1      1 2015-12-04  2051.239990  2093.840088  2051.239990  2091.689941   
2      2 2015-12-03  2080.709961  2085.000000  2042.349976  2049.620117   
3      3 2015-12-02  2101.709961  2104.270020  2077.110107  2079.510010   
4      4 2015-12-01  2082.929932  2103.370117  2082.929932  2102.629883   
5      5 2015-11-30  2090.949951  2093.810059  2080.409912  2080.409912   

         Volume    Adj Close  Avg Close 5  Avg Close 365  Ratio Close 5 365  \
0  4.043820e+09  2077.070068  2080.862488    2061.255061           1.009512   
1  4.214910e+09  2091.689941  2078.042481    2061.284389           1.008130   
2  4.306490e+09  2049.620117  2088.164978    2061.351306           1.013008   
3  3.950640e+09  2079.510010  2091.049967    2061.248262           1.014458   
4  3.712120e+09  2102.629883  2086.463379    2061.084049           1.012314   


In [66]:
sphist = sphist[sphist['Date']>=datetime(year=1951, month=1, day=3)]
sphist.dropna(axis=0, inplace=True)

In [67]:
train = sphist[sphist['Date']<datetime(year=2013, month=1, day=1)]
test = sphist[sphist['Date']>=datetime(year=2013, month=1, day=1)]

In [68]:
lr = LinearRegression()
lr.fit(train[new_cols], train['Close'])
y_hat = lr.predict(test[new_cols])
mae = mean_absolute_error(test['Close'], y_hat)
print('mae:', mae)

mae: 14.780195575936458


In [60]:
# new_cols = ['Avg Vol 5', 'Avg Vol 365', 'Std Vol 5 365']
# df = np.zeros((sphist.shape[0], len(new_cols)))
# sphist[new_cols] = pd.DataFrame(data=df, index=sphist.index)

for i, row in sphist.iterrows():
    end = sphist.loc[i, 'Date'] - timedelta(days=1)
    start_365 = end - timedelta(days=365)
    
    sphist.loc[i, 'Std Vol 365'] = get_stds(sphist, start_365, end, ['Avg Vol 365'])[0]

sphist.loc[360:370]

Unnamed: 0,index,Date,Open,High,Low,Close,Volume,Adj Close,Avg Close 5,Avg Close 365,Ratio Close 5 365,Avg Vol 365,Std Vol 365
360,360,2014-07-03,1975.880005,1985.589966,1975.880005,1985.439941,1998090000.0,1985.439941,1967.282471,1795.947431,1.095401,3253908000.0,45713940.0
361,361,2014-07-02,1973.060059,1976.670044,1972.579956,1974.619995,2851480000.0,1974.619995,1962.932465,1794.525849,1.093845,3254909000.0,45462920.0
362,362,2014-07-01,1962.290039,1978.579956,1962.290039,1973.319946,3188240000.0,1973.319946,1959.484985,1793.816349,1.092355,3255174000.0,44887250.0
363,363,2014-06-30,1960.790039,1964.23999,1958.219971,1960.22998,3037350000.0,1960.22998,1956.922485,1793.153347,1.09133,3256042000.0,44303610.0
364,364,2014-06-27,1956.560059,1961.469971,1952.180054,1960.959961,4290590000.0,1960.959961,1957.334991,1790.289605,1.093306,3260379000.0,44669480.0
365,365,2014-06-26,1959.890015,1959.890015,1944.689941,1957.219971,2778840000.0,1957.219971,1958.747497,1788.830356,1.094988,3264262000.0,44435040.0
366,366,2014-06-25,1949.27002,1960.829956,1947.48999,1959.530029,3106710000.0,1959.530029,1958.734985,1787.302925,1.095917,3270693000.0,44210370.0
367,367,2014-06-24,1961.969971,1968.170044,1948.339966,1949.97998,3089700000.0,1949.97998,1960.484985,1786.657381,1.097292,3271411000.0,43742070.0
368,368,2014-06-23,1962.920044,1963.73999,1958.890015,1962.609985,2717630000.0,1962.609985,1955.329986,1785.956375,1.094836,3273617000.0,43282890.0
369,369,2014-06-20,1960.449951,1963.910034,1959.170044,1962.869995,4336240000.0,1962.869995,1949.057495,1783.089842,1.093079,3286731000.0,43597320.0


In [62]:
new_cols + ['Date']

['Avg Close 5',
 'Avg Close 365',
 'Ratio Close 5 365',
 'Avg Vol 365',
 'Std Vol 365',
 'Date']