Some ideas related to [Y-aware PCA](https://win-vector.com/2022/09/08/y-aware-pca/) and L2-regularized (ridge) regression, centered on y-aware scaling of the explanatory variables.

In [1]:
import os

# Change the working directory before the later imports run.
# NOTE(review): presumably this makes `import vtreat.effect_scaler` resolve against a
# local working copy of pyvtreat -- confirm.
# The directory can be overridden with the PYVTREAT_PKG_DIR environment variable so the
# notebook is not tied to one machine; the original path remains the default.
os.chdir(
    os.environ.get(
        'PYVTREAT_PKG_DIR',
        '/Users/johnmount/Documents/work/pyvtreat/pkg',
    )
)

In [2]:
import sklearn.linear_model
import sklearn.metrics
import sklearn.decomposition
import sklearn.preprocessing
import sklearn.pipeline

In [3]:
import numpy as np
import pandas as pd
import vtreat.effect_scaler

In [4]:
# Seed the generator explicitly so a fresh "Restart & Run All" reproduces the same data
# and therefore the same error figures. The seed value itself is arbitrary.
SEED = 2022
rng = np.random.default_rng(SEED)

In [5]:
def mk_example(
    *,
    rng,
    n_rows: int,
    n_cols: int,
    p: float,
):
    """
    Generate a pure-noise data set (outcome y unrelated to every x) for studying over fit.

    Each explanatory column ``x_00000``, ``x_00001``, ... is a 0/1 draw
    (``rng.binomial`` with ``n=1``); ``y`` is a separate ``rng.normal`` draw that
    is not computed from any of the explanatory columns.

    :param rng: numpy pseudo-random generator (example: np.random.default_rng())
    :param n_rows: number of example rows (must be positive)
    :param n_cols: number of explanatory columns (must be positive)
    :param p: probability that an explanatory entry is 1 (strictly between 0 and 1)
    :return: data frame with the n_cols explanatory columns followed by the column y
    :raises AssertionError: if n_rows or n_cols is not positive, or p is not in (0, 1)
    """
    # Coerce and check one argument at a time, in declaration order.
    n_rows = int(n_rows)
    assert n_rows > 0
    n_cols = int(n_cols)
    assert n_cols > 0
    p = float(p)
    assert 0 < p < 1
    # Draw the explanatory columns in index order; the outcome is drawn last.
    columns = {}
    for col_index in range(n_cols):
        columns[f'x_{col_index:05d}'] = rng.binomial(n=1, size=n_rows, p=p)
    frame = pd.DataFrame(columns)
    frame['y'] = rng.normal(size=n_rows)
    return frame


In [6]:
 # Problem size: ten times as many explanatory columns as training rows.
 n_rows, n_cols = 1_000, 10_000
 # Probability handed to mk_example for each 0/1 entry.
 p = 1.0 / n_cols

In [7]:
# Training frame first, then a test frame with ten times as many rows; both are
# produced by mk_example from the same generator and settings.
d_train, d_test = (
    mk_example(rng=rng, n_rows=row_count, n_cols=n_cols, p=p)
    for row_count in (n_rows, 10 * n_rows)
)

In [8]:
# Explanatory column names: every column of d_train except the outcome 'y'.
# NOTE: this name shadows the builtin `vars`; it is kept because eval_model reads it.
vars = list(filter(lambda name: name != 'y', d_train.columns))


In [9]:
def eval_model(model):
    """
    Fit ``model`` on the training frame, then score it on the train and test frames.

    Reads the notebook globals ``d_train``, ``d_test`` and ``vars`` (the list of
    explanatory column names). ``model`` is fitted in place.

    Scores are ``sklearn.metrics.mean_squared_error`` values, i.e. mean squared
    error; no square root is taken here, even though callers in this notebook
    bind the results to ``rmse_*`` names.

    :param model: estimator exposing ``fit(X, y)`` and ``predict(X)``
    :return: tuple (train mean squared error, test mean squared error)
    """
    model.fit(d_train.loc[:, vars], d_train['y'])
    # Score the training frame first, then the held-out frame.
    mse_train, mse_test = (
        sklearn.metrics.mean_squared_error(
            y_true=frame['y'],
            y_pred=model.predict(frame.loc[:, vars]),
        )
        for frame in (d_train, d_test)
    )
    return mse_train, mse_test

In [10]:
# Baseline: mean squared error of predicting 0 for every training row.
ideal_train_error = sklearn.metrics.mean_squared_error(
    y_true=d_train['y'],
    y_pred=np.zeros(len(d_train)),
)

In [11]:
ideal_train_error


1.0258123968414175

In [12]:
# Baseline: mean squared error of predicting 0 for every test row.
ideal_test_error = sklearn.metrics.mean_squared_error(
    y_true=d_test['y'],
    y_pred=np.zeros(len(d_test)),
)

In [13]:
ideal_test_error

1.0060469050341014

In [14]:
# Ordinary least squares with default settings (no regularization).
rmse_train_lr, rmse_test_lr = eval_model(
    model=sklearn.linear_model.LinearRegression(),
)

In [15]:
rmse_train_lr

44.36225365407323

In [16]:
rmse_test_lr

1.8217317668227592e+28

In [17]:
# Ridge (L2-penalized) regression with default settings on the raw columns.
rmse_train_l2, rmse_test_l2 = eval_model(
    model=sklearn.linear_model.Ridge(),
)

In [18]:
rmse_train_l2

0.5270230243496119

In [19]:
rmse_test_l2

1.0197445132259777

In [20]:
# Ridge regression on inputs first passed through vtreat's EffectScaler.
# NOTE(review): EffectScaler is presumably the y-aware scaling step -- confirm in vtreat.effect_scaler.
rmse_train_l2s, rmse_test_l2s = eval_model(
    model=sklearn.pipeline.Pipeline(
        steps=[
            ('scaler', vtreat.effect_scaler.EffectScaler()),
            ('model', sklearn.linear_model.Ridge()),
        ],
    ),
)

In [21]:
rmse_train_l2s

0.5002190679213362

In [22]:
rmse_test_l2s

1.0283492431140724

In [23]:
# StandardScaler, then PCA keeping 10 components, then Ridge regression.
rmse_train_l2p, rmse_test_l2p = eval_model(
    model=sklearn.pipeline.Pipeline(
        steps=[
            ('scaler', sklearn.preprocessing.StandardScaler()),
            ('pca', sklearn.decomposition.PCA(n_components=10)),
            ('model', sklearn.linear_model.Ridge()),
        ],
    ),
)

In [24]:
rmse_train_l2p

1.011196076320379

In [25]:
rmse_test_l2p

1.0070951730192184

In [26]:
# Same PCA-then-Ridge pipeline as the StandardScaler variant, but with vtreat's
# EffectScaler as the scaling step.
rmse_train_l2e, rmse_test_l2e = eval_model(
    model=sklearn.pipeline.Pipeline(
        steps=[
            ('scaler', vtreat.effect_scaler.EffectScaler()),
            ('pca', sklearn.decomposition.PCA(n_components=10)),
            ('model', sklearn.linear_model.Ridge()),
        ],
    ),
)

In [27]:
rmse_train_l2e

0.9774640909653369

In [28]:
rmse_test_l2e

1.0091428589784588