In [1]:
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
import numpy as np

from pca import PCAPlotting

1. import merged data
2. remove regions where >= 85% of the data is NaN
3. fill all NaNs with 0s

In [None]:
from data import Data
# Build the project-local Data wrapper (per the notes above, this holds the merged data).
data = Data()
# NOTE(review): the notes above say regions with >= 85% NaN are removed, while the
# call passes threshold=0.15 -- presumably the minimum fraction of non-NaN data a
# region must have to be kept. Confirm against Data.filter_low_data_regions.
data.filter_low_data_regions(threshold=0.15)
# Per the notes above, fills the remaining NaNs with 0 (behaviour lives in Data.fill_na).
data.fill_na(0)
# `dat` is the table used by every later cell; they index it by column label and
# with .iloc, i.e. like a pandas DataFrame.
dat = data.d
# Show which region codes survived the filtering step.
print(dat['open_covid_region_code'].unique())

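`region_validation_splits` generates train/validation splits grouped by region, so that every region falls entirely in either the training set or the validation set of a fold.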

In [None]:
def region_validation_splits(d, n_splits=5):
    """Yield K-fold train/validation splits whose folds are formed over regions.

    The unique values of the ``open_covid_region_code`` column are split with
    ``KFold``; all rows of a region then go to the same side of a fold.

    Args:
        d: Table indexed like a pandas DataFrame. Features are taken from
            column position 6 up to (but excluding) the last column, and the
            last column is the target.
            NOTE(review): this assumes the first six columns are non-feature
            metadata -- confirm against the dataset layout.
        n_splits: Number of folds passed to ``KFold``.

    Yields:
        ``(x_train, y_train, x_val, y_val)`` for each fold.
    """
    region_column = d["open_covid_region_code"]
    unique_regions = region_column.unique()
    splitter = KFold(n_splits=n_splits)
    for train_pos, val_pos in splitter.split(unique_regions):
        # Map fold positions back to region codes, then to the rows of those regions.
        train_rows = d[region_column.isin(unique_regions[train_pos])]
        val_rows = d[region_column.isin(unique_regions[val_pos])]
        yield (
            train_rows.iloc[:, 6:-1],
            train_rows.iloc[:, -1],
            val_rows.iloc[:, 6:-1],
            val_rows.iloc[:, -1],
        )

`time_validation_split` splits the data into two sets: rows dated on or before a given date (training) and rows dated after it (validation).

In [None]:
def time_validation_split(d, split_date):
    """Yield one train/validation split separated by date.

    Rows whose ``date`` is less than or equal to ``split_date`` form the
    training set; rows whose ``date`` is strictly greater form the validation
    set. The result is yielded (a single-item generator) so it can be consumed
    the same way as a multi-fold split.

    Args:
        d: Table indexed like a pandas DataFrame with a ``date`` column.
            Features are taken from column position 6 up to (but excluding)
            the last column, and the last column is the target.
        split_date: Value compared against the ``date`` column.

    Yields:
        A single ``(x_train, y_train, x_val, y_val)`` tuple.
    """
    on_or_before = d["date"] <= split_date
    after = d["date"] > split_date
    train_rows, val_rows = d[on_or_before], d[after]
    yield (
        train_rows.iloc[:, 6:-1],
        train_rows.iloc[:, -1],
        val_rows.iloc[:, 6:-1],
        val_rows.iloc[:, -1],
    )

Mean Squared Error

In [None]:
def calc_MSE(model, validation_sets):
    """Fit ``model`` on every split and return the pooled validation MSE.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. The same
            object is refit on each split's training data.
        validation_sets: Iterable of ``(x_train, y_train, x_val, y_val)``
            tuples.

    Returns:
        The sum of squared validation errors over all splits divided by the
        total number of validation rows (row-weighted, not a mean of per-fold
        MSEs).

    Raises:
        ValueError: If the splits contain no validation rows, in which case
            the mean is undefined.
    """
    squared_error = 0
    n_validations = 0
    for x_train, y_train, x_val, y_val in validation_sets:
        model.fit(x_train, y_train)
        # Work on plain arrays: the subtraction stays positional and the
        # reduction is a vectorised np.sum with no pandas NaN-skipping.
        residual = np.asarray(model.predict(x_val)) - np.asarray(y_val)
        squared_error += np.sum(residual ** 2)
        n_validations += x_val.shape[0]
    if n_validations == 0:
        raise ValueError(
            "validation_sets produced no validation rows; MSE is undefined"
        )
    return squared_error / n_validations

KNN performance on region cross validation. This outputs the best K and its corresponding
mean squared error.

In [None]:
# Sweep K = 1..100 for KNN under region-grouped 5-fold cross validation.
# Despite its name, n_neighbors[i] stores the MSE obtained with K = i + 1.
n_neighbors = np.zeros(100)
for i in range(len(n_neighbors)):
    k = i + 1
    neigh = KNeighborsRegressor(n_neighbors=k)
    # The split function is a generator, so a fresh one is created for every K.
    folds = region_validation_splits(dat, n_splits=5)
    n_neighbors[i] = calc_MSE(neigh, folds)
# argmin is a 0-based position while K starts at 1.
best = 1 + np.argmin(n_neighbors)
print(f"best K = {best}, MSE = {n_neighbors[best - 1]}")

Regression Tree performance on region cross validation. It uses minimum samples per leaf
as a hyperparameter. This outputs the best min_samples_leaf and its corresponding
mean squared error.

In [None]:
# Sweep min_samples_leaf = 1..100 for a regression tree under region-grouped
# 5-fold cross validation. leaf_samples[i] stores the MSE for min_samples_leaf = i + 1.
leaf_samples = np.zeros(100)
for i in range(leaf_samples.shape[0]):
    # random_state is pinned so the sweep is reproducible: scikit-learn permutes
    # features at every split, so ties between equally good splits can otherwise
    # resolve differently from run to run.
    tree = DecisionTreeRegressor(min_samples_leaf=i+1, random_state=0)
    # The split function is a generator, so a fresh one is created per setting.
    leaf_samples[i] = calc_MSE(tree, region_validation_splits(dat, n_splits=5))
# argmin is a 0-based position while min_samples_leaf starts at 1.
best = np.argmin(leaf_samples) + 1
print(f"best min_samples_leaf = {best}, MSE = {leaf_samples[best - 1]}")

KNN performance on time split validation. This outputs the best K and its corresponding
mean squared error.

In [None]:
# Sweep K = 1..200 for KNN, validating on rows dated after 2020-08-10.
# Despite its name, n_neighbors[i] stores the MSE obtained with K = i + 1.
n_neighbors = np.zeros(200)
for i in range(len(n_neighbors)):
    k = i + 1
    neigh = KNeighborsRegressor(n_neighbors=k)
    # The split function is a generator, so a fresh one is created for every K.
    time_split = time_validation_split(dat, "2020-08-10")
    n_neighbors[i] = calc_MSE(neigh, time_split)
# argmin is a 0-based position while K starts at 1.
best = 1 + np.argmin(n_neighbors)
print(f"best K = {best}, MSE = {n_neighbors[best - 1]}")

Regression Tree performance on time split validation. It uses minimum samples per leaf
as a hyperparameter. This outputs the best min_samples_leaf and its corresponding
mean squared error.

In [None]:
# Sweep min_samples_leaf = 1..200 for a regression tree, validating on rows dated
# after 2020-08-10. leaf_samples[i] stores the MSE for min_samples_leaf = i + 1.
leaf_samples = np.zeros(200)
for i in range(leaf_samples.shape[0]):
    # random_state is pinned so the sweep is reproducible: scikit-learn permutes
    # features at every split, so ties between equally good splits can otherwise
    # resolve differently from run to run.
    tree = DecisionTreeRegressor(min_samples_leaf=i+1, random_state=0)
    # The split function is a generator, so a fresh one is created per setting.
    leaf_samples[i] = calc_MSE(tree, time_validation_split(dat, "2020-08-10"))
# argmin is a 0-based position while min_samples_leaf starts at 1.
best = np.argmin(leaf_samples) + 1
print(f"best min_samples_leaf = {best}, MSE = {leaf_samples[best - 1]}")

This is an attempt at fitting a model on the PCA-reduced dataset, evaluated with ordinary
5-fold cross validation.

In [None]:
# Build the PCA-reduced dataset. PCAPlotting is a project-local helper; presumably
# reduce_dimensionality projects onto 2 components and add_hospitalized_new appends
# the target as the last column of reduced_data -- TODO confirm in pca.py.
pca = PCAPlotting(dat)
pca.reduce_dimensionality(new_dimension=2)
pca.add_hospitalized_new()
X = pca.reduced_data[:,:-1]  # every column except the last is used as a feature
y = pca.reduced_data[:,-1]   # the last column is used as the regression target
kf = KFold(n_splits=5)       # ordinary 5-fold split over rows (not grouped by region)

# mse[i] stores the pooled 5-fold MSE for min_samples_leaf = i + 1.
mse = np.zeros(200)
for i in range(mse.shape[0]):
    sqr_err = 0
    n_test = 0
    for train_i, test_i in kf.split(X):
        # Alternative model that was tried here:
        # model = KNeighborsRegressor(n_neighbors=i+1)
        # random_state is pinned so the sweep is reproducible: scikit-learn
        # permutes features at every split, so ties between equally good splits
        # can otherwise resolve differently from run to run.
        model = DecisionTreeRegressor(min_samples_leaf=i+1, random_state=0)
        model.fit(X[train_i], y[train_i])
        pred = model.predict(X[test_i])
        sqr_err += np.sum((pred - y[test_i]) ** 2)
        n_test += test_i.shape[0]
    # Row-weighted MSE across all folds.
    mse[i] = sqr_err / n_test
# argmin is a 0-based position while the hyperparameter starts at 1.
best = np.argmin(mse) + 1
print(f"best = {best}, MSE = {mse[best - 1]}")


# Build the PCA-reduced dataset. PCAPlotting is a project-local helper; presumably
# reduce_dimensionality projects onto 2 components and add_hospitalized_new appends
# the target as the last column of reduced_data -- TODO confirm in pca.py.
pca = PCAPlotting(dat)
pca.reduce_dimensionality(new_dimension=2)
pca.add_hospitalized_new()
X = pca.reduced_data[:,:-1]  # every column except the last is used as a feature
y = pca.reduced_data[:,-1]   # the last column is used as the regression target
kf = KFold(n_splits=5)       # ordinary 5-fold split over rows (not grouped by region)

# mse[i] stores the pooled 5-fold MSE for min_samples_leaf = i + 1.
mse = np.zeros(200)
for i in range(mse.shape[0]):
    sqr_err = 0
    n_test = 0
    for train_i, test_i in kf.split(X):
        # Alternative model that was tried here:
        # model = KNeighborsRegressor(n_neighbors=i+1)
        # random_state is pinned so the sweep is reproducible: scikit-learn
        # permutes features at every split, so ties between equally good splits
        # can otherwise resolve differently from run to run.
        model = DecisionTreeRegressor(min_samples_leaf=i+1, random_state=0)
        model.fit(X[train_i], y[train_i])
        pred = model.predict(X[test_i])
        sqr_err += np.sum((pred - y[test_i]) ** 2)
        n_test += test_i.shape[0]
    # Row-weighted MSE across all folds.
    mse[i] = sqr_err / n_test
# argmin is a 0-based position while the hyperparameter starts at 1.
best = np.argmin(mse) + 1
print(f"best = {best}, MSE = {mse[best - 1]}")


In [6]:
# Sweep K = 1..100 for KNN under region-grouped 5-fold cross validation.
# Despite its name, n_neighbors[i] stores the MSE obtained with K = i + 1.
n_neighbors = np.zeros(100)
for i in range(len(n_neighbors)):
    k = i + 1
    neigh = KNeighborsRegressor(n_neighbors=k)
    # The split function is a generator, so a fresh one is created for every K.
    folds = region_validation_splits(dat, n_splits=5)
    n_neighbors[i] = calc_MSE(neigh, folds)
# argmin is a 0-based position while K starts at 1.
best = 1 + np.argmin(n_neighbors)
print(f"best K = {best}, MSE = {n_neighbors[best - 1]}")

best K = 100, MSE = 2392.578654473684


Regression Tree performance on region cross validation. It uses minimum samples per leaf
as a hyperparameter. This outputs the best min_samples_leaf and its corresponding
mean squared error.

In [7]:
# Sweep min_samples_leaf = 1..100 for a regression tree under region-grouped
# 5-fold cross validation. leaf_samples[i] stores the MSE for min_samples_leaf = i + 1.
leaf_samples = np.zeros(100)
for i in range(leaf_samples.shape[0]):
    # random_state is pinned so the sweep is reproducible: scikit-learn permutes
    # features at every split, so ties between equally good splits can otherwise
    # resolve differently from run to run.
    tree = DecisionTreeRegressor(min_samples_leaf=i+1, random_state=0)
    # The split function is a generator, so a fresh one is created per setting.
    leaf_samples[i] = calc_MSE(tree, region_validation_splits(dat, n_splits=5))
# argmin is a 0-based position while min_samples_leaf starts at 1.
best = np.argmin(leaf_samples) + 1
print(f"best min_samples_leaf = {best}, MSE = {leaf_samples[best - 1]}")

best min_samples_leaf = 28, MSE = 2277.33822027266


KNN performance on time split validation. This outputs the best K and its corresponding
mean squared error.

In [52]:
# Sweep K = 1..200 for KNN, validating on rows dated after 2020-08-10.
# Despite its name, n_neighbors[i] stores the MSE obtained with K = i + 1.
n_neighbors = np.zeros(200)
for i in range(len(n_neighbors)):
    k = i + 1
    neigh = KNeighborsRegressor(n_neighbors=k)
    # The split function is a generator, so a fresh one is created for every K.
    time_split = time_validation_split(dat, "2020-08-10")
    n_neighbors[i] = calc_MSE(neigh, time_split)
# argmin is a 0-based position while K starts at 1.
best = 1 + np.argmin(n_neighbors)
print(f"best K = {best}, MSE = {n_neighbors[best - 1]}")

best K = 1, MSE = 972.1166666666667


Regression Tree performance on time split validation. It uses minimum samples per leaf
as a hyperparameter. This outputs the best min_samples_leaf and its corresponding
mean squared error.

In [50]:
# Sweep min_samples_leaf = 1..200 for a regression tree, validating on rows dated
# after 2020-08-10. leaf_samples[i] stores the MSE for min_samples_leaf = i + 1.
leaf_samples = np.zeros(200)
for i in range(leaf_samples.shape[0]):
    # random_state is pinned so the sweep is reproducible: scikit-learn permutes
    # features at every split, so ties between equally good splits can otherwise
    # resolve differently from run to run.
    tree = DecisionTreeRegressor(min_samples_leaf=i+1, random_state=0)
    # The split function is a generator, so a fresh one is created per setting.
    leaf_samples[i] = calc_MSE(tree, time_validation_split(dat, "2020-08-10"))
# argmin is a 0-based position while min_samples_leaf starts at 1.
best = np.argmin(leaf_samples) + 1
print(f"best min_samples_leaf = {best}, MSE = {leaf_samples[best - 1]}")

best min_samples_leaf = 3, MSE = 1061.9392407407408


This is an attempt at fitting a model on the PCA-reduced dataset, evaluated with ordinary
5-fold cross validation.

In [49]:
# Build the PCA-reduced dataset. PCAPlotting is a project-local helper; presumably
# reduce_dimensionality projects onto 2 components and add_hospitalized_new appends
# the target as the last column of reduced_data -- TODO confirm in pca.py.
pca = PCAPlotting(dat)
pca.reduce_dimensionality(new_dimension=2)
pca.add_hospitalized_new()
X = pca.reduced_data[:,:-1]  # every column except the last is used as a feature
y = pca.reduced_data[:,-1]   # the last column is used as the regression target
kf = KFold(n_splits=5)       # ordinary 5-fold split over rows (not grouped by region)

# mse[i] stores the pooled 5-fold MSE for min_samples_leaf = i + 1.
mse = np.zeros(200)
for i in range(mse.shape[0]):
    sqr_err = 0
    n_test = 0
    for train_i, test_i in kf.split(X):
        # Alternative model that was tried here:
        # model = KNeighborsRegressor(n_neighbors=i+1)
        # random_state is pinned so the sweep is reproducible: scikit-learn
        # permutes features at every split, so ties between equally good splits
        # can otherwise resolve differently from run to run.
        model = DecisionTreeRegressor(min_samples_leaf=i+1, random_state=0)
        model.fit(X[train_i], y[train_i])
        pred = model.predict(X[test_i])
        sqr_err += np.sum((pred - y[test_i]) ** 2)
        n_test += test_i.shape[0]
    # Row-weighted MSE across all folds.
    mse[i] = sqr_err / n_test
# argmin is a 0-based position while the hyperparameter starts at 1.
best = np.argmin(mse) + 1
print(f"best = {best}, MSE = {mse[best - 1]}")


best = 153, MSE = 2177.7346649930746
