In [4]:
import numpy as np

def add_intercept(X):
    """Prepend a column of ones to a predictor matrix.

    Parameters
    ----------
    X : array_like, shape (n, p) or (n,)
        Predictor values. A 1-D input is treated as a single feature
        observed n times.

    Returns
    -------
    numpy.ndarray, shape (n, p + 1)
        Design matrix whose first column is all ones.
    """
    X = np.asarray(X)
    if X.ndim == 1:
        # A bare vector cannot be hstack-ed with an (n, 1) column; make it (n, 1).
        X = X.reshape(-1, 1)
    n = X.shape[0]
    intercept = np.ones((n, 1))
    return np.hstack([intercept, X])

def ols_fit(X,y):
    """Fit ordinary least squares with an intercept.

    The design matrix is built with ``add_intercept`` and the least-squares
    problem is solved directly with ``np.linalg.lstsq`` instead of forming and
    solving the normal equations. This avoids squaring the condition number of
    the design matrix, and a rank-deficient (collinear) design yields the
    minimum-norm solution rather than a ``LinAlgError``.

    Parameters
    ----------
    X : array_like, shape (n, p)
        Predictor matrix (without an intercept column).
    y : array_like, shape (n,) or (n, 1)
        Response values.

    Returns
    -------
    numpy.ndarray, shape (p + 1,)
        Coefficients with the intercept first, followed by one slope per
        column of X.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).reshape(-1)

    X_design = add_intercept(X)

    # rcond=None selects the machine-precision based cutoff for small
    # singular values.
    beta_hat, _, _, _ = np.linalg.lstsq(X_design, y, rcond=None)
    return beta_hat

In [5]:
import numpy as np

def ridge_regression(X, y, lam, fit_intercept=True):
    """Closed-form ridge regression.

    Solves ``(Xc.T @ Xc + lam * I) beta = Xc.T @ yc``. When ``fit_intercept``
    is true, ``Xc`` and ``yc`` are the column-centered predictors and the
    centered response, so the intercept is not penalized and is recovered
    from the means afterwards. Otherwise the raw ``X`` and ``y`` are used.

    Parameters
    ----------
    X : array_like, shape (n, p)
        Predictor matrix.
    y : array_like, shape (n,) or (n, 1)
        Response values (flattened internally).
    lam : float
        Amount added to the diagonal of ``Xc.T @ Xc``.
    fit_intercept : bool, default True
        Whether to center the data and return a fitted intercept.

    Returns
    -------
    beta0 : float
        Intercept (``0.0`` when ``fit_intercept`` is false).
    beta : numpy.ndarray, shape (p,)
        Slope coefficients.
    """
    features = np.asarray(X, dtype=float)
    target = np.asarray(y, dtype=float).reshape(-1)
    # Unpacking also enforces that the predictors are two-dimensional.
    _, n_features = features.shape

    if fit_intercept:
        feature_means = features.mean(axis=0)
        target_mean = target.mean()
        features_c = features - feature_means
        target_c = target - target_mean
    else:
        features_c, target_c = features, target

    gram = features_c.T @ features_c
    penalized_gram = gram + lam * np.eye(n_features)
    cross_moment = features_c.T @ target_c

    coef = np.linalg.solve(penalized_gram, cross_moment)

    if not fit_intercept:
        return 0.0, coef

    # Undo the centering: beta0 = mean(y) - mean(X) . beta
    return target_mean - np.dot(feature_means, coef), coef


In [19]:
# Fake data: 1000 samples of 5 standard-normal features, seeded for reproducibility.
rng = np.random.default_rng(0)
n, p = 1000, 5
X = rng.normal(size=(n, p))
# Only features 0 and 2 have non-zero true coefficients.
true_beta = np.array([3.0, 0.0, -2.0, 0.0, 0.0])
y = X @ true_beta + 3.0 + rng.normal(scale=1.0, size=n)  # true intercept = 3, noise std = 1

# Ridge penalty: ridge_regression adds lam to the diagonal of Xc.T @ Xc.
lam = 10.0  # try different values: 0 (OLS), 1, 10, 100
beta0_hat, beta_hat = ridge_regression(X, y, lam)

# Compare against the true intercept (3.0) and true_beta above.
print("Estimated intercept:", beta0_hat)
print("Estimated beta:     ", beta_hat)

Estimated intercept: 3.012298611642308
Estimated beta:      [ 2.95140621  0.05151579 -1.96218561 -0.01272196 -0.01844992]


In [20]:
import numpy as np

def soft_threshold(z, lam):
    """Scalar soft-thresholding operator S(z, lam).

    Returns ``z - lam`` when ``z > lam``, ``z + lam`` when ``z < -lam``,
    and ``0.0`` for every other value of ``z``.
    """
    if z > lam:
        return z - lam
    return z + lam if z < -lam else 0.0

def lasso_coordinate_descent(X, y, lam, max_iter=1000, tol=1e-6):
    """
    LASSO regression via cyclic coordinate descent.

    The predictors are standardized (column mean 0, population std 1) and the
    response is centered before fitting; the coefficients are mapped back to
    the original scale of X before they are returned.

    Objective minimized, in the standardized space:
        (1/2) * ||ys - Xs b||^2 + lam * ||b||_1
    There is no 1/n factor on the squared loss, so lam is on the scale of the
    un-normalized inner products Xs[:, j] . r. To use the
    (1/(2n)) * ||.||^2 + alpha * ||b||_1 convention, pass lam = n * alpha.

    Parameters
    ----------
    X : (n, p) array_like
        Predictor matrix.
    y : (n,) or (n, 1) array_like
        Response vector.
    lam : float
        L1 penalty (lambda), applied to the standardized coefficients.
    max_iter : int
        Maximum number of coordinate descent sweeps.
    tol : float
        A sweep whose largest absolute change in any (standardized)
        coefficient is below tol ends the iteration.

    Returns
    -------
    beta0 : float
        Intercept term on the original scale.
    beta : (p,) numpy array
        Coefficient vector on the original scale. Constant columns of X
        always receive a coefficient of 0.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).reshape(-1)

    # Unpacking also enforces that X is two-dimensional.
    _, p = X.shape

    # ----- Standardize X and center y -----
    X_mean = X.mean(axis=0)
    X_std = X.std(axis=0, ddof=0)

    # A constant column has std 0; dividing by 1 instead keeps the
    # standardization finite (the column becomes all zeros).
    X_std[X_std == 0.0] = 1.0

    Xs = (X - X_mean) / X_std  # standardized predictors
    y_mean = y.mean()
    ys = y - y_mean            # centered response

    # We fit ys ~ Xs beta (no intercept needed in the centered space);
    # the intercept is recovered afterwards from the means.

    # Squared norms of the columns of Xs, shape (p,)
    X_col_sqnorm = np.sum(Xs**2, axis=0)

    # Columns with zero norm (constant features) cannot be updated: the
    # coordinate step would be 0 / 0 = NaN and would contaminate the residual
    # and every other coefficient. Leave their coefficients at 0 instead.
    # `!= 0.0` (rather than `> 0.0`) keeps NaN columns in the sweep so that
    # NaNs in the input still surface in the output.
    active = np.flatnonzero(X_col_sqnorm != 0.0)

    beta = np.zeros(p)
    # Residual r = ys - Xs @ beta  (beta = 0 => r = ys)
    r = ys.copy()

    for _ in range(max_iter):
        beta_old = beta.copy()

        for j in active:
            x_j = Xs[:, j]

            # Partial residual excluding feature j:
            # r_j = ys - sum_{k != j} Xs[:, k] beta[k] = r + Xs[:, j] * beta[j]
            r_j = r + x_j * beta[j]

            # rho_j = sum_i Xs[i, j] * r_j[i]
            rho_j = np.dot(x_j, r_j)

            # Minimizer of the one-dimensional problem in beta[j]
            # (no 1/n here; see the objective in the docstring).
            beta_j_new = soft_threshold(rho_j, lam) / X_col_sqnorm[j]

            # Update residual to reflect the new beta_j
            r = r_j - x_j * beta_j_new
            beta[j] = beta_j_new

        # Converged once no coefficient moved by tol or more in a full sweep
        max_change = np.max(np.abs(beta - beta_old))
        if max_change < tol:
            break

    # Un-standardize: ys = y - y_mean, Xs = (X - X_mean) / X_std
    # so y ≈ beta0 + X @ beta_orig with the values below.
    beta_orig = beta / X_std
    beta0 = y_mean - np.dot(beta_orig, X_mean)

    return beta0, beta_orig
# Fake data with some true sparse beta: only features 0 and 2 carry signal,
# and no intercept is added to y. This rebinds rng, n, p, X, true_beta, y, lam,
# beta0_hat and beta_hat from the ridge example.
rng = np.random.default_rng(0)
n, p = 200, 5

X = rng.normal(size=(n, p))
true_beta = np.array([3.0, 0.0, -2.0, 0.0, 0.0])
y = X @ true_beta + rng.normal(scale=1.0, size=n)

# lasso_coordinate_descent thresholds the un-normalized inner product
# sum_i Xs[i, j] * r_j[i] at lam (no 1/n factor), so the strength of a given
# lam depends on n.
lam = 10
beta0_hat, beta_hat = lasso_coordinate_descent(X, y, lam)

print("Intercept:", beta0_hat)
print("Coefficients:", beta_hat)


Intercept: 0.041401729930683806
Coefficients: [ 3.01549672e+00 -2.70205187e-02 -1.98687045e+00  1.45686264e-03
  4.21062650e-02]


In [23]:
def expectedvalue(j):
    """Return E_j, where E_1 = 3.5 and E_j = mean(max(E_{j-1}, f) for f in 1..6).

    NOTE(review): this looks like the optimal-stopping value of a game in
    which a fair six-sided die may be rolled up to j times — confirm.

    The previous level E_{j-1} is evaluated once per call, so the total work
    is linear in j.

    Parameters
    ----------
    j : int
        Index of the recurrence; must be at least 1.

    Raises
    ------
    ValueError
        If j < 1 (the recurrence has no base case below 1).
    """
    if j < 1:
        raise ValueError("j must be at least 1")
    if j == 1:
        return 3.5
    prev = expectedvalue(j - 1)
    return sum(max(prev, face) for face in range(1, 7)) / 6
    

In [33]:
# Evaluate the recurrence at j = 5 (recorded output: 5.12962962962963).
expectedvalue(5)

5.12962962962963

In [28]:
def expectedvalue(j):
    """Iteratively compute E_j with E_1 = 3.5 and
    E_k = (sum over faces 1..6 of max(E_{k-1}, face)) / 6.

    For j <= 1 no update step runs and the base value 3.5 is returned.
    """
    value = 3.5  # E_1
    for _ in range(j - 1):
        total = 0
        for face in range(1, 7):
            total += max(value, face)
        value = total / 6
    return value