In [2]:
import math
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
import time

from datetime import date, datetime, time, timedelta
from matplotlib import pyplot as plt
from pylab import rcParams
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from tqdm import tqdm_notebook

%matplotlib inline

#### Input params ##################
# Location of the price data (CSV) to load
stk_path = "./coding_work_competition/amzn_data.csv"

# Dataset split, expressed as proportions of the full dataset
test_size = 0.2     # share reserved for the test set
cv_size = 0.2       # share reserved for the cross-validation set

# Lag window: the feature at timestep t is built from the lags t-1, t-2, ..., t-N.
# Nmax is the largest N that will be tried.
Nmax = 30

# Plot styling
fontsize = 14
ticklabelsize = 14

In [3]:
def get_preds_lin_reg(df, target_col, N, pred_min, offset):
    """
    Given a dataframe, get prediction at timestep t by fitting a straight line
    to the values at t-N, ..., t-2, t-1 and extrapolating it one step ahead.
    Inputs
        df         : dataframe with the values you want to predict. Can be of any length.
        target_col : name of the column you want to predict e.g. 'adj_close'
        N          : get prediction at timestep t using values from t-1, t-2, ..., t-N
        pred_min   : all predictions should be >= pred_min
        offset     : for df we only do predictions for df[offset:]. e.g. offset can be size of training set.
                     Must be >= N so that every prediction has a full window of N values.
    Outputs
        pred_list  : the predictions for target_col. np.array of length len(df)-offset.
    Raises
        ValueError : if at least one prediction is requested (offset < len(df))
                     but N < 1 or offset < N.
    """
    # Read the requested column once, as a positional array
    # (the column name comes from target_col rather than being hard-coded).
    series = np.asarray(df[target_col])

    # With offset < N the window [i-N:i] would start at a negative position and
    # silently wrap around or come back empty, so reject it explicitly.
    if offset < len(series) and (N < 1 or offset < N):
        raise ValueError(
            "need N >= 1 and offset >= N to build a full window of N values "
            "(got N={}, offset={})".format(N, offset)
        )

    # Create linear regression object
    regr = LinearRegression(fit_intercept=True)

    # The regressor is just the position inside the window, so it is the same
    # for every window: [[0], [1], ..., [N-1]]. The point to predict is [[N]].
    X_train = np.arange(N).reshape(-1, 1)
    X_next = np.array([[N]])

    pred_list = []
    for i in range(offset, len(series)):
        y_train = series[i-N:i].reshape(-1, 1)   # e.g. [[2944] [3088] [3226] [3335] [3436]]
        regr.fit(X_train, y_train)               # Train the model on this window
        pred = regr.predict(X_next)              # shape (1, 1): one sample, one target
        pred_list.append(pred[0][0])

    # If the values are < pred_min, set it to be pred_min
    pred_list = np.array(pred_list)
    pred_list[pred_list < pred_min] = pred_min

    return pred_list

def get_mape(y_true, y_pred):
    """
    Compute mean absolute percentage error (MAPE)
    Inputs
        y_true : actual values (array-like)
        y_pred : predicted values (array-like), matched element-wise with y_true
    Outputs
        mean of |y_true - y_pred| / |y_true|, expressed in percent
    """
    actual = np.asarray(y_true)
    forecast = np.asarray(y_pred)

    # Error of each prediction relative to the actual value, sign removed
    relative_error = np.abs((actual - forecast) / actual)

    return np.mean(relative_error) * 100

In [13]:
df = pd.read_csv(stk_path, sep = ",")

# Add a 'Date' column holding the 'stamp' timestamps parsed as real datetimes.
# Plain column assignment is used instead of df.loc[:, 'Date'] = ... so that the
# column takes the datetime dtype even if a 'Date' column already exists.
df['Date'] = pd.to_datetime(df['stamp'])

# Change all column headings to be lower case, and remove spacing
# NOTE(review): this renames 'Date' to 'date'; if the CSV already has a 'date'
# column the frame ends up with duplicate labels - confirm against the file.
df.columns = [str(x).lower().replace(' ', '_') for x in df.columns]

# Sort chronologically by the 'stamp' column
# (reassigned rather than sorted in place, so re-running the cell is safe)
df = df.sort_values(by='stamp', ascending=True)

df.head(10)

Unnamed: 0,unnamed:_0,unnamed:_0.1,date,minute,label,high,low,open,close,average,...,numberoftrades,symbol,year,month,day,hour,min,min1,stamp,date.1
12090,0,0,2020-05-26,09:30,09:30 AM,2461.34,2455.12,2456.08,2455.12,2459.481,...,24,AMZN,2020,5,26,9,30,30,2020-05-26 09:30:00,2020-05-26 09:30:00
12091,1,1,2020-05-26,09:31,09:31 AM,2456.23,2449.98,2456.07,2450.3,2451.517,...,9,AMZN,2020,5,26,9,31,31,2020-05-26 09:31:00,2020-05-26 09:31:00
12092,2,2,2020-05-26,09:32,09:32 AM,2448.18,2445.585,2448.18,2447.01,2446.583,...,6,AMZN,2020,5,26,9,32,32,2020-05-26 09:32:00,2020-05-26 09:32:00
12093,3,3,2020-05-26,09:33,09:33 AM,2449.76,2446.58,2446.885,2449.76,2448.111,...,6,AMZN,2020,5,26,9,33,33,2020-05-26 09:33:00,2020-05-26 09:33:00
12094,4,4,2020-05-26,09:34,09:34 AM,2450.405,2445.3,2449.82,2445.3,2449.583,...,8,AMZN,2020,5,26,9,34,34,2020-05-26 09:34:00,2020-05-26 09:34:00
12095,5,5,2020-05-26,09:35,09:35 AM,2448.31,2445.33,2448.31,2445.33,2445.417,...,3,AMZN,2020,5,26,9,35,35,2020-05-26 09:35:00,2020-05-26 09:35:00
12096,6,6,2020-05-26,09:36,09:36 AM,2441.74,2441.645,2441.645,2441.74,2441.698,...,3,AMZN,2020,5,26,9,36,36,2020-05-26 09:36:00,2020-05-26 09:36:00
12097,7,7,2020-05-26,09:37,09:37 AM,2446.73,2444.3,2444.3,2446.73,2446.664,...,5,AMZN,2020,5,26,9,37,37,2020-05-26 09:37:00,2020-05-26 09:37:00
12098,8,8,2020-05-26,09:38,09:38 AM,2450.965,2448.56,2448.56,2450.965,2450.793,...,3,AMZN,2020,5,26,9,38,38,2020-05-26 09:38:00,2020-05-26 09:38:00
12099,9,9,2020-05-26,09:39,09:39 AM,2453.56,2451.47,2451.47,2453.205,2452.506,...,9,AMZN,2020,5,26,9,39,39,2020-05-26 09:39:00,2020-05-26 09:39:00
