In [3]:
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path

In [108]:
# Load the closing prices 
file_path = Path('Data/df.csv')
df = pd.read_csv(file_path)
df = df.drop([1])
df.head()

Unnamed: 0,Attributes,Volume,Volume.1,Volume.2,Adj Close,Adj Close.1,Adj Close.2
0,Symbols,GSK,PFE,AZN,GSK,PFE,AZN
2,1/2/2018,9465500,16185800,6107400,32.10053253,32.92734146,31.99385643
3,1/3/2018,6600800,13456500,4195400,31.97884941,33.1713028,32.05715179
4,1/4/2018,5206400,12378100,3870900,32.03968811,33.24359894,32.1023674
5,1/5/2018,7250700,12492900,3336000,32.60468292,33.30685043,32.43695831


In [109]:
def construct_df(df, volume, adj_close):
    r_df = pd.DataFrame({"Volume": df[volume], "Adj Close": df[adj_close], 
                      "Date": df["Attributes"]})
    r_df.drop([0], inplace=True)
    r_df["Date"] = pd.to_datetime(r_df["Date"])
    r_df.drop(r_df.loc[r_df["Date"] <'2020-01-01'].index, inplace=True)
    r_df = r_df.set_index("Date")
    return r_df
    

In [110]:
# extract data for GSk and create a dataframe for GSk
gsk_df = construct_df(df, "Volume","Adj Close")
gsk_df.head()

Unnamed: 0_level_0,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-02,2462400,45.22977448
2020-01-03,2149100,44.80562592
2020-01-06,2034500,44.82490158
2020-01-07,1718900,44.54534912
2020-01-08,1766700,44.73814774


In [111]:
# extract data for PFE and create a dataframe for PFE
pfe_df = construct_df(df, "Volume.1","Adj Close.1")
pfe_df.head()

Unnamed: 0_level_0,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-02,15668000,37.99060822
2020-01-03,14158300,37.78677368
2020-01-06,14963900,37.73823929
2020-01-07,19077900,37.61206436
2020-01-08,15563100,37.91296005


In [112]:
# extract data for AZN and create a dataframe for AZN
azn_df = construct_df(df, "Volume.2","Adj Close.2")
azn_df.head()

Unnamed: 0_level_0,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-02,3587300,48.99202347
2020-01-03,1208700,48.7003479
2020-01-06,1992300,48.49617386
2020-01-07,1871900,48.68090057
2020-01-08,1869000,48.56423187


In [113]:
# azn_df.iloc[1:10]["Volume", "Adj Close"]
a = azn_df.iloc[0:10, 0:2].values.tolist()
a

[['3587300', '48.99202347'],
 ['1208700', '48.7003479'],
 ['1992300', '48.49617386'],
 ['1871900', '48.68090057'],
 ['1869000', '48.56423187'],
 ['1959000', '48.69062424'],
 ['1872900', '48.46700287'],
 ['2395900', '48.291996'],
 ['2230800', '48.78784943'],
 ['2489400', '49.39064407']]

In [114]:
# This function accepts the column number for the features (X) and the target (y)
# It chunks the data up with a rolling window of Xt-n to predict Xt
# It returns a numpy array of X any y
def window_data(df, window, start_feature_col_number, end_feature_col_number, target_col_number):
    X = []
    y = []
    for i in range(len(df) - window - 1):
        features = df.iloc[i:(i + window), start_feature_col_number:end_feature_col_number].values.tolist()
        target = df.iloc[(i + window), target_col_number]
        X.append(features)
        y.append(target)

    return np.array(X), np.array(y).reshape(-1, 1)

In [116]:
def generate_data_set(df):
    # Predict Closing Prices using a 10 day window of previous closing prices of GSK
    window_size = 10
    # Column index 0 is the 'Volume' column
    # Column index 1 is the `Adj Close` column
    start_feature_column = 0
    end_feature_column = 2
    target_column = 1
    X, y = window_data(gsk_df, window_size, start_feature_column, end_feature_column, target_column)
    return X,y

In [117]:
# Generate data for training and testing out of gsk_df
gsk_X, gsk_y = generate_data_set(gsk_df)
print(gsk_y[:3])

[['45.46113205']
 ['46.16482925']
 ['45.78888321']]


In [118]:
# Generate data for training and testing out of pfe_df
pfe_X, pfe_y = generate_data_set(pfe_df)
print(pfe_y[:3])

[['45.46113205']
 ['46.16482925']
 ['45.78888321']]


In [119]:
# Generate data for training and testing out of azn_df
azn_X, azn_y = generate_data_set(azn_df)
print(azn_y[:3])

[['45.46113205']
 ['46.16482925']
 ['45.78888321']]


In [115]:
# Predict Closing Prices using a 10 day window of previous closing prices of GSK
window_size = 10
# Column index 0 is the 'Volume' column
# Column index 1 is the `Adj Close` column
start_feature_column = 0
end_feature_column = 2
target_column = 1
gsk_X, gsk_y = window_data(gsk_df, window_size, start_feature_column, end_feature_column, target_column)

In [None]:
# Use 70% of the data for training and the remainder for testing
split = int(0.7 * len(X))
X_train = X[: split]
X_test = X[split:]
y_train = y[: split]
y_test = y[split:]