# Coding Assignment 3

CS 598 Practical Statistical Learning

2023-10-09

UIUC Fall 2023

**Authors**
* Ryan Fogle
    - rsfogle2@illinois.edu
    - UIN: 652628818
* Sean Enright
    - seanre2@illinois.edu
    - UIN: 661791377

**Contributions**

TODO

## Part I

Here we implement LOO-CV and GCV to select the optimal span for LOESS.

In [None]:
# General imports
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Part 1 imports
from csaps import csaps
from skmisc.loess import loess

In [None]:
# Manual calculation of smoothing matrix
def smooth(x, y, fx, lam, axis=-1):
    """Evaluate a cubic smoothing spline fitted to (x, y) at the sites fx.

    Args:
        x (np.ndarray): Observations vector
        y (np.ndarray): Response vector
        fx (np.ndarray): Data sites at which the smoothed values are returned
        lam (float): Smoothing parameter (lambda); lam = -1 divides by zero
        axis (int): Axis of y along which the data varies

    Returns:
        (np.ndarray): Smoothed y data evaluated at fx
    """
    # CSAPS is parameterized by p rather than R's lambda: p = 1 / (lam + 1)
    return csaps(x, y, fx, smooth=1 / (lam + 1), axis=axis)

def S_lam(x, lam):
    """Build the smoothing spline matrix for a vector of observations and a lambda value.

    Args:
        x (np.ndarray): Vector of observations
        lam (float): Smoothing parameter

    Returns:
        (np.ndarray): n x n smoothing matrix, where n = len(x)
    """
    # Smooth every column of the identity matrix at the observation sites.
    basis = np.eye(len(x))
    fitted = smooth(x, basis, x, lam, axis=0)
    # Average with the transpose so the returned matrix is symmetric.
    return 0.5 * (fitted + fitted.T)

def lo_lev(x, sp):
    """Calculate the diagonal entries of S, the LOESS smoothing matrix.

    For a linear smoother y_hat = S y, smoothing the i-th unit vector yields
    the i-th column of S, whose i-th fitted value is S[i, i].
    NOTE(review): this relies on the LOESS fit being linear in y, which is
    assumed for the default (non-robust) settings -- confirm if the family
    or other fit options are ever changed.

    Args:
        x (np.ndarray): Vector of observations
        sp (float): LOESS span

    Returns:
        (np.ndarray): Length-n vector of diagonal entries S[i, i], n = len(x)
    """
    x = np.asarray(x, dtype=float)
    n = len(x)
    lev = np.zeros(n)
    for i in range(n):
        # Response that is 1 at observation i and 0 elsewhere
        unit = np.zeros(n)
        unit[i] = 1.0
        # The fitted value at x[i] for this response is the leverage S[i, i]
        lev[i] = loess(x, unit, span=sp).predict(x).values[i]
    return lev

In [None]:
def onestep_cv(x, y, sp):
    """Compute the LOO-CV and GCV errors of a LOESS fit of y on x for one span.

    Args:
        x: Observations vector
        y: Response vector
        sp (float): LOESS span

    Returns:
        (tuple): (loocv, gcv) error values for this span
    """
    # 1) Fit a LOESS model of y on x with the given span and form the residuals
    model = loess(x, y, span=sp)
    resid = y - model.predict(x).values
    # 2) Diagonal entries of the smoothing matrix S, read from the fit outputs
    lev = model.outputs.diagonal
    # 3) LOO-CV rescales each residual by its own diagonal entry;
    #    GCV rescales every residual by the average diagonal entry
    loocv = np.mean((resid / (1 - lev)) ** 2)
    gcv = np.mean((resid / (1 - np.mean(lev))) ** 2)
    return loocv, gcv

def find_cv_vals(x, y, span):
    """Evaluate the LOO-CV and GCV errors for every candidate span.

    Args:
        x: Observations vector
        y: Response vector
        span: Sequence of candidate span values

    Returns:
        (tuple): (cv, gcv) arrays holding one error value per entry of span
    """
    n_spans = len(span)
    cv = np.zeros(n_spans)
    gcv = np.zeros(n_spans)

    for idx, sp in enumerate(span):
        cv[idx], gcv[idx] = onestep_cv(x, y, sp)
    return cv, gcv

Next, we determine the span values that produce the lowest LOO-CV and GCV errors.

In [None]:
# Part I data, read from a relative path; source:
# https://liangfgithub.github.io/Data/Coding3_Data.csv
data_part1 = pd.read_csv("Coding3_Data.csv")
# Candidate spans: 15 evenly spaced values from 0.2 to 0.9 (step 0.05)
span_vec = np.linspace(0.2, 0.9, 15)

# Evaluate the LOO-CV and GCV error at every candidate span
loo, gcv = find_cv_vals(data_part1["x"], data_part1["y"], span_vec)

# Display table of CV results, one row per candidate span
print("Span    LOOCV   GCV")
for s, l, g in zip(span_vec, loo, gcv):
    print(f"{s:.2f}\t{l:.3f}\t{g:.3f}")

The span optimization results are presented in the chart below.

In [None]:
# Apply the seaborn theme and raise the figure resolution
sns.set()
mpl.rcParams['figure.dpi'] = 300
# LOO-CV error per span: markers plus a dotted connecting line
plt.scatter(span_vec, loo, color="darkorange", s=5, label="LOO-CV")
plt.plot(span_vec, loo, color="orange", alpha=1, linestyle="dotted")
# GCV error per span: markers plus a dashed connecting line
plt.scatter(span_vec, gcv, color="blue", s=5, label="GCV")
plt.plot(span_vec, gcv, color="lightblue", alpha=0.75, linestyle="--")
plt.xlabel("Span")
plt.ylabel("CV Error")
plt.title("Span vs CV Error")
# Only the scatter series carry labels, so the legend has one entry per criterion
plt.legend()

For this dataset and choice of span values, the best span value selected by LOOCV and GCV is the same.

In [None]:
# Select the span with the lowest error under each criterion
span_loocv = span_vec[np.argmin(loo)]
span_gcv = span_vec[np.argmin(gcv)]

print(f"Span by LOO-CV: {span_loocv}")
print(f"   Span by GCV: {span_gcv}")

The true curve is defined below.

In [None]:
def f(x):
    """Evaluate the true curve sin(12 (x + 0.2)) / (x + 0.2) at x.

    Args:
        x: Scalar or array of input values; x = -0.2 divides by zero

    Returns:
        Value(s) of the true curve at x
    """
    return np.sin(12 * (x + 0.2)) / (x + 0.2)

# Evaluate the true curve on 1001 evenly spaced points spanning the observed x range
fx = np.linspace(min(data_part1["x"]), max(data_part1["x"]), 1001)
fy = f(fx)

Finally, we compare the LOESS curve fitted with the span selected by LOO-CV and GCV to the true curve.

In [None]:
# Fit LOESS with the span chosen by LOO-CV and evaluate it on the fx grid
y_loess = loess(data_part1["x"], data_part1["y"], span=span_loocv).predict(fx).values

sns.set()
mpl.rcParams['figure.dpi'] = 300
# Observed data points
plt.scatter(data_part1["x"], data_part1["y"], color="red", s=6)
# True curve (solid gray) against the LOESS fit (dashed blue)
plt.plot(fx, fy, color="gray", linewidth=1, label="True Function")
plt.plot(fx, y_loess, color="blue", linewidth=1, linestyle="--", label="LOESS Fit")
plt.legend()
plt.xlabel("X")
plt.ylabel("Y")

## Part II



In this part we use the Sales_Transactions_Dataset_Weekly dataset from the UCI Machine Learning Repository.

In [None]:
# https://archive.ics.uci.edu/dataset/396/sales+transactions+dataset+weekly
# Read only the first 53 columns, using the first one as the row index
data_part2 = pd.read_csv("Sales_Transactions_Dataset_Weekly.csv",
                         index_col=0, usecols=range(53))
# Center each time series, i.e., subtract each row's mean from that row
data_part2 = data_part2.sub(data_part2.mean(axis=1), axis=0)
# Show the (rows, columns) shape of the centered data
data_part2.shape