# Importing Modules

In [30]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import random

# Defining Functions for Multiple Linear Regression

## Functions to Prepare DataFrame

In [4]:
def get_features_targets(df, feature_names, target_names):
    """Split a DataFrame into its feature columns and its target columns.

    ``feature_names`` and ``target_names`` are used directly as column
    selectors on ``df``; pass lists so that each result is a DataFrame.

    Returns a ``(features, targets)`` tuple.
    """
    return df[feature_names], df[target_names]

def normalize_z(df):
    """Standardise ``df`` by subtracting its mean and dividing by its std.

    Both statistics are taken from ``df`` itself (``df.mean()`` and
    ``df.std()``). Scaling the features this way keeps large raw values
    manageable while maintaining their relationship with the target variable.
    """
    centre = df.mean()
    spread = df.std()
    return (df - centre) / spread

def prepare_feature(df_feature):
    """Return the features as a NumPy array with a leading column of ones.

    The column of ones is placed first so that matrix operations against a
    coefficient vector include an intercept term.
    """
    row_count = df_feature.shape[0]
    intercept_column = np.ones((row_count, 1))
    return np.concatenate((intercept_column, df_feature), axis=1)

def prepare_target(df_target):
    """Return the array held in ``df_target.values``."""
    target_array = df_target.values
    return target_array

def split_data(df_feature, df_target, random_state=None, test_size=0.5):
    """Randomly split features and targets into training and test sets.

    Args:
        df_feature: DataFrame of features; its index labels identify the rows.
        df_target: DataFrame of targets, selected by the same index labels.
        random_state: Optional seed handed to ``np.random.seed`` so the split
            is reproducible. When ``None`` the global NumPy random state is
            used as-is.
        test_size: Fraction of rows assigned to the test set; the row count
            is ``int(test_size * number_of_rows)``.

    Returns:
        ``(df_feature_train, df_feature_test, df_target_train,
        df_target_test)``. Each subset keeps the row order of ``df_feature``.
    """
    indexes = df_feature.index
    if random_state is not None:
        np.random.seed(random_state)
    k = int(test_size * len(indexes))
    chosen = np.random.choice(indexes, k, replace=False)

    # Select with Index objects rather than sets: pandas rejects a set as a
    # .loc indexer, and a set carries no guaranteed row order. The boolean
    # mask keeps both subsets in the original order of df_feature.
    in_test = indexes.isin(chosen)
    test_index = indexes[in_test]
    train_index = indexes[~in_test]

    df_feature_train = df_feature.loc[train_index, :]
    df_feature_test = df_feature.loc[test_index, :]
    df_target_train = df_target.loc[train_index, :]
    df_target_test = df_target.loc[test_index, :]
    return df_feature_train, df_feature_test, df_target_train, df_target_test

## Functions to apply Regression

In [5]:
def compute_cost(X, y, beta):
    """Return the squared-error cost of ``beta`` on ``(X, y)``.

    The cost is the sum of squared residuals ``X @ beta - y`` scaled by
    ``1 / (2 * len(X))``.
    """
    sample_count = len(X)
    residuals = (X @ beta) - y
    squared_total = np.sum(np.power(residuals, 2))
    return 1 / (2 * sample_count) * squared_total

def gradient_descent(X, y, beta, alpha, num_iters):
    """Run ``num_iters`` gradient-descent updates on ``beta``.

    Each iteration moves ``beta`` against the gradient ``X.T @ (X @ beta - y)``
    scaled by ``alpha / len(X)`` and records the cost after the update.

    Returns the final ``beta`` and an array holding the cost of every
    iteration.
    """
    cost_history = np.zeros(num_iters)
    for iteration in range(num_iters):
        residuals = X @ beta - y
        gradient = X.T @ residuals
        beta = beta - (alpha / len(X)) * gradient
        cost_history[iteration] = compute_cost(X, y, beta)
    return beta, cost_history

## Functions to evaluate model

In [6]:
def predict(df_feature, beta):
    """Return predicted targets for the raw features in ``df_feature``.

    The features are z-normalized with ``normalize_z`` (which uses the
    statistics of ``df_feature`` itself), extended with the column of ones
    by ``prepare_feature``, and then passed to ``predict_norm``.
    """
    design_matrix = prepare_feature(normalize_z(df_feature))
    return predict_norm(design_matrix, beta)

def predict_norm(X, beta):
    """Return the matrix product ``X @ beta`` for already-prepared features."""
    predictions = X @ beta
    return predictions

def r2_score(y, ypred):
    """Return the coefficient of determination of ``ypred`` against ``y``.

    Computed as one minus the ratio of the residual sum of squares to the
    total sum of squares around ``y.mean()``.
    """
    residual_ss = np.sum(np.power(y - ypred, 2))
    total_ss = np.sum(np.power(y - y.mean(), 2))
    unexplained = residual_ss / total_ss
    return 1 - unexplained

def mean_squared_error(target, pred):
    """Return the sum of squared errors divided by ``len(target)``."""
    squared_errors = np.power(target - pred, 2)
    total = np.sum(squared_errors)
    return total / len(target)

# Validating Models

## Importing and Cleaning Data

In [60]:
# importing csv
# A forward slash is used as the separator: the backslash form contained the
# invalid escape sequence "\c" and only resolved on Windows.
df = pd.read_csv('archive/covid19_sg.csv', parse_dates=['Date'])

# selecting required data: rows from index label 500 onward
df_select = df.loc[500:, :]
df_select2 = df_select.copy()

# 'days' is the number of whole days elapsed since the first selected row's
# date, computed with one vectorised subtraction instead of a per-row lambda.
start_date = df_select['Date'].iloc[0]
df_select2['days'] = (df_select['Date'] - start_date).dt.days

# Getting columns corresponding to features and targets
df_feature, df_target = get_features_targets(df_select2, ['days'], ['Cumulative Confirmed'])

## Preparing Features and targets