In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import math
import datetime
from sklearn import preprocessing
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [47]:
df = pd.read_csv('GOOGL.csv')
df.set_index('Date',inplace=True)
df.index = pd.to_datetime(df.index)
df['HL_Change'] = (df['High'] - df['Low'])/ df['High'] * 100.0
df['PCT_Change'] = (df['Close'] - df['Open'])/ df['Open'] * 100.0
print(df.head(10))
df = df[['Adj Close','HL_Change','PCT_Change','Volume']]
df.tail()

                   Open         High          Low        Close    Adj Close  \
Date                                                                          
2020-05-26  1441.959961  1445.109985  1419.400024  1421.369995  1421.369995   
2020-05-27  1420.000000  1425.349976  1394.599976  1420.280029  1420.280029   
2020-05-28  1400.000000  1444.459961  1399.079956  1418.239990  1418.239990   
2020-05-29  1420.430054  1436.729980  1415.979980  1433.520020  1433.520020   
2020-06-01  1425.699951  1441.579956  1422.280029  1434.869995  1434.869995   
2020-06-02  1435.000000  1443.000000  1421.609985  1442.310059  1442.310059   
2020-06-03  1442.699951  1449.010010  1431.619995  1439.250000  1439.250000   
2020-06-04  1436.780029  1441.319946  1406.010010  1414.300049  1414.300049   
2020-06-05  1415.640015  1446.300049  1407.619995  1440.020020  1440.020020   
2020-06-08  1426.280029  1449.000000  1424.479980  1448.040039  1448.040039   

             Volume  HL_Change  PCT_Change  
Date  

Unnamed: 0_level_0,Adj Close,HL_Change,PCT_Change,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-05-17,2288.919922,1.496651,1.119906,1079900
2021-05-18,2262.469971,1.971244,-1.732149,1110200
2021-05-19,2271.5,2.178034,1.932754,1184200
2021-05-20,2306.949951,1.231833,0.6962,1617600
2021-05-21,2294.129883,1.276035,-1.004571,2030700


In [48]:
def prepare_data(df,forecast_col,forecast_out):
    df['label'] = df[forecast_col].shift(-forecast_out)
    X = df.drop(['label'], axis=1)
    X = preprocessing.scale(X)
    X_lately = X[-forecast_out:]
    X = X[:-forecast_out,:]
    df.dropna(inplace=True)
    print("Head\n",df.head())
    print("Tail\n",df.tail())
    y = df['label']
    print(y)
    print("X Lately\n",X_lately)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
    response = X_train, X_test, y_train, y_test,X_lately
    return response

In [49]:
forecast_out = 7
forecast_col = 'Adj Close'
X_train, X_test, y_train, y_test,X_lately = prepare_data(df,forecast_col,forecast_out)

Head
               Adj Close  HL_Change  PCT_Change   Volume        label
Date                                                                
2020-05-26  1421.369995   1.779101   -1.427915  2229500  1414.300049
2020-05-27  1420.280029   2.157365    0.019720  1584200  1440.020020
2020-05-28  1418.239990   3.141659    1.302856  1758500  1448.040039
2020-05-29  1433.520020   1.444252    0.921549  1852200  1452.079956
2020-06-01  1434.869995   1.338804    0.643196  1258100  1464.699951
Tail
               Adj Close  HL_Change  PCT_Change   Volume        label
Date                                                                
2021-05-06  2337.350098   1.895739    1.344995  1259500  2288.919922
2021-05-07  2351.929932   1.058095   -0.505944  1444500  2262.469971
2021-05-10  2291.750000   1.993134   -1.563046  1545700  2271.500000
2021-05-11  2270.060059   2.033088    1.203261  1709300  2306.949951
2021-05-12  2200.250000   2.418165   -1.165217  2318800  2294.129883
Date
2020-05-26    141

In [50]:
print(X_train.shape,y_train.shape, X_test.shape, y_test.shape)
print(df.tail(10))

(183, 4) (183,) (61, 4) (61,)
              Adj Close  HL_Change  PCT_Change   Volume        label
Date                                                                
2021-04-29  2392.760010   1.260724    0.147748  2061700  2291.750000
2021-04-30  2353.500000   1.481623   -0.613171  2242000  2270.060059
2021-05-03  2343.080078   1.933617   -0.923073  1412100  2200.250000
2021-05-04  2306.830078   2.938080   -0.290461  2240900  2229.040039
2021-05-05  2314.770020   1.144324   -0.595194  1331800  2278.379883
2021-05-06  2337.350098   1.895739    1.344995  1259500  2288.919922
2021-05-07  2351.929932   1.058095   -0.505944  1444500  2262.469971
2021-05-10  2291.750000   1.993134   -1.563046  1545700  2271.500000
2021-05-11  2270.060059   2.033088    1.203261  1709300  2306.949951
2021-05-12  2200.250000   2.418165   -1.165217  2318800  2294.129883


In [51]:
lm_model = LinearRegression()
lm_model.fit(X_train,y_train)

LinearRegression()

In [52]:
r2_score = lm_model.score(X_test,y_test)

In [53]:
print(r2_score)

0.9358226682051908


In [54]:
preds = lm_model.predict(X_lately)

In [55]:
print(preds)

[2247.32474146 2293.70432936 2295.21123662 2272.70939728 2276.60280319
 2323.28767001 2319.97485613]


In [56]:
df['forecast'] = np.nan
last_date = df.iloc[-1].name
print(last_date)
last_unix = last_date.timestamp()
print("Lastdate and last unix us ",last_date,last_unix)
one_day = 86400
next_unix = last_unix + one_day
print("Before\n",df.tail(5))
for i in preds:
    next_date = datetime.datetime.fromtimestamp(next_unix)
    print(next_date)
    next_unix += one_day
    df.loc[next_date] = [np.nan for _ in range(len(df.columns)-1)] + [i]
print("After\n",df.tail(5))
print(df.head(5))



2021-05-12 00:00:00
Lastdate and last unix us  2021-05-12 00:00:00 1620777600.0
Before
               Adj Close  HL_Change  PCT_Change   Volume        label  forecast
Date                                                                          
2021-05-06  2337.350098   1.895739    1.344995  1259500  2288.919922       NaN
2021-05-07  2351.929932   1.058095   -0.505944  1444500  2262.469971       NaN
2021-05-10  2291.750000   1.993134   -1.563046  1545700  2271.500000       NaN
2021-05-11  2270.060059   2.033088    1.203261  1709300  2306.949951       NaN
2021-05-12  2200.250000   2.418165   -1.165217  2318800  2294.129883       NaN
2021-05-13 05:30:00
2021-05-14 05:30:00
2021-05-15 05:30:00
2021-05-16 05:30:00
2021-05-17 05:30:00
2021-05-18 05:30:00
2021-05-19 05:30:00
After
                      Adj Close  HL_Change  PCT_Change  Volume  label  \
Date                                                                   
2021-05-15 05:30:00        NaN        NaN         NaN     NaN    NaN