(example-sparse)=

Example: Using Sparse Covariate Matrices
========================================

Motivation
----------

In many applications, we want to adjust for categorical covariates with many levels. A natural pre-processing step is to one-hot-encode these covariates, which can produce a high-dimensional but typically very sparse covariate matrix. Many scikit-learn-style learners accept SciPy sparse matrices as input, which allows us to use them for treatment effect estimation as well.

Example
-------

In [None]:
import time, psutil, os, gc
import numpy as np
import pandas as pd
import scipy as sp

from sklearn.dummy import DummyRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from lightgbm import LGBMRegressor, LGBMClassifier
from metalearners import DRLearner

# This is required for when nbconvert converts the cell-magic to regular function calls.
from IPython import get_ipython

In [None]:
def get_memory_usage():
    """Return the resident set size (RSS) of the current process, in MB."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / 1024 / 1024  # bytes -> MB


## Causal Inference

### DRLearner



We generate some data where X consists of 100 categorical variables, each with 1000 possible levels. Naively one-hot-encoding this data produces a very large matrix with many zeros, which is an ideal application of `scipy.sparse.csr_matrix`. We then use the `DRLearner` to estimate the treatment effect.


In [None]:
def generate_causal_data(
    n_samples=100_000,
    n_categories=1000,
    n_features=100,
    tau_magnitude=1.0,
):
    """Simulate a binary-treatment dataset with purely categorical covariates.

    Both the untreated outcome and the (logit of the) propensity score are
    built from per-category effects of the same three randomly chosen
    features plus two randomly chosen pairwise interactions; the effect
    values themselves are drawn independently for the outcome and for the
    propensity. The treatment effect is constant and equal to
    ``tau_magnitude``.

    Returns
    -------
    tuple
        ``(X, W, Y, tau, propensity_score)`` where ``X`` has shape
        ``(n_samples, n_features)`` with integer levels in
        ``[0, n_categories)`` and the remaining entries are arrays of
        length ``n_samples``.
    """
    # --- Covariates -------------------------------------------------------
    X = np.random.randint(0, n_categories, size=(n_samples, n_features))

    # --- Features carrying a main effect ----------------------------------
    main_effect_features = np.random.choice(n_features, 3, replace=False)

    def _add_main_effects(target):
        # One freshly drawn, fully dense effect vector per selected feature,
        # accumulated in place.
        for col in main_effect_features:
            level_effects = np.random.normal(0, 4, n_categories)
            target += level_effects[X[:, col]]

    # --- Untreated potential outcome y0 -----------------------------------
    y0 = np.zeros(n_samples)
    _add_main_effects(y0)

    # Pick two feature pairs that interact.
    feature_pairs = [
        (a, b) for a in range(n_features) for b in range(a + 1, n_features)
    ]
    selected_interactions = np.random.choice(len(feature_pairs), 2, replace=False)

    def _add_interaction_effects(target):
        # One freshly drawn {-1, 0, 1} lookup table per selected pair,
        # accumulated in place.
        for pair_idx in selected_interactions:
            a, b = feature_pairs[pair_idx]
            pair_table = np.random.choice(
                [-1, 0, 1], size=(n_categories, n_categories), p=[0.25, 0.5, 0.25]
            )
            target += pair_table[X[:, a], X[:, b]]

    _add_interaction_effects(y0)
    # Standardize, then add a little noise.
    y0 = (y0 - np.mean(y0)) / np.std(y0)
    y0 += np.random.normal(0, 0.1, n_samples)

    # --- Treatment assignment W -------------------------------------------
    # The same features and the same pairs enter the propensity score.
    propensity_score = np.zeros(n_samples)
    _add_main_effects(propensity_score)
    _add_interaction_effects(propensity_score)
    # Map to probabilities with the logistic function and draw the treatment.
    propensity_score = sp.special.expit(propensity_score)
    W = np.random.binomial(1, propensity_score)

    # --- Treatment effect and observed outcome ----------------------------
    tau = tau_magnitude * np.ones(n_samples)
    Y = y0 + W * tau
    return X, W, Y, tau, propensity_score


# Draw 10,000 samples (the function's default is 100,000).
X, W, Y, tau, propensity_score = generate_causal_data(n_samples=10_000, tau_magnitude=1.0)


In [None]:
# Build the same one-hot encoding twice: once sparse, once dense.
Xdf = pd.DataFrame(X)
# sparse: the one-hot encoder emits a sparse matrix
e1 = OneHotEncoder(sparse_output=True)
X_csr = e1.fit_transform(X)
# dense: one-hot encoding with pandas
X_np = pd.get_dummies(Xdf, columns=Xdf.columns).values

In [None]:
# A CSR matrix stores three arrays (values, column indices, row pointers);
# count all of them so the comparison with the dense array is fair.
sparse_nbytes = X_csr.data.nbytes + X_csr.indices.nbytes + X_csr.indptr.nbytes
print(f"\nSparse data memory: {sparse_nbytes / 1024 / 1024:.2f}MB")
print(f"Dense data memory: {X_np.nbytes / 1024 / 1024:.2f}MB")

As expected, the memory footprint of the sparse matrix is considerably smaller than that of the dense matrix.


In [None]:
def fit_drlearner_wrapper(X):
    """Fit a ``DRLearner`` on ``X`` and print runtime, memory delta and the ATE.

    The outcome ``Y`` and the treatment ``W`` are read from the enclosing
    (notebook-global) scope.

    Parameters
    ----------
    X
        Covariate matrix handed to the learner. The printed report is
        labelled "Sparse" when ``X`` is a SciPy sparse matrix and "Dense"
        otherwise.

    Returns
    -------
    None
        The measurements and the estimate are printed, not returned.
    """
    # Label the report by the actual input type; it used to always say "Sparse".
    input_kind = "Sparse" if sp.sparse.issparse(X) else "Dense"

    start_memory = get_memory_usage()
    # perf_counter is monotonic and meant for measuring elapsed time, unlike
    # the wall clock, which can jump if the system time is adjusted.
    start_time = time.perf_counter()
    metalearners_dr = DRLearner(
        nuisance_model_factory=LGBMRegressor,
        treatment_model_factory=DummyRegressor,
        propensity_model_factory=LGBMClassifier,
        is_classification=False,
        n_variants=2,
        nuisance_model_params={"verbose": -1},
        propensity_model_params={"verbose": -1},
    )

    # Only the nuisance models are fitted before the average treatment
    # effect is requested.
    metalearners_dr.fit_all_nuisance(
        X=X,
        y=Y,
        w=W,
    )
    metalearners_est = metalearners_dr.average_treatment_effect(
        X=X,
        y=Y,
        w=W,
        is_oos=False,
    )
    end_time = time.perf_counter()
    end_memory = get_memory_usage()
    runtime = end_time - start_time
    # Difference in process RSS between start and end of the fit.
    memory_used = end_memory - start_memory
    print(
        f"{input_kind} data - Runtime: {runtime:.2f}s, Memory used: {memory_used:.2f}MB"
    )
    print(metalearners_est)

In [None]:
# Run a garbage collection pass before the measurements below.
gc.collect()

`scipy.sparse.csr_matrix` input

In [None]:
# Fit on the sparse one-hot encoding, then collect garbage before the next run.
fit_drlearner_wrapper(X_csr)
gc.collect()

`np.ndarray` input

In [None]:
# Fit on the dense one-hot encoding of the same data, then collect garbage.
fit_drlearner_wrapper(X_np)
gc.collect()

In this (admittedly somewhat contrived) example, we see that solving the DRLearner problem with sparse inputs takes around 1/8 of the time needed with dense inputs, at the cost of somewhat higher memory usage during estimation.

## Prediction 

These benefits aren't limited to causal inference. We can use sparse matrices for prediction tasks as well.

In [None]:
def generate_dummy_data(n_samples=100000, n_categories=1000, n_features=20):
    """Simulate a regression dataset with purely categorical covariates.

    The target is the sum of per-category effects of three randomly chosen
    features, two randomly chosen pairwise interactions and a quadratic
    effect of one feature's level; it is then standardized and perturbed
    with a little Gaussian noise.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` has shape ``(n_samples, n_features)`` with
        integer levels in ``[0, n_categories)`` and ``y`` has length
        ``n_samples``.
    """
    X = np.random.randint(0, n_categories, size=(n_samples, n_features))
    y = np.zeros(n_samples)

    # Main effects: a random effect per category for a few features.
    for col in np.random.choice(n_features, 3, replace=False):
        level_effects = np.random.normal(0, 1, n_categories)
        y += level_effects[X[:, col]]

    # Interaction effects for a couple of feature pairs.
    feature_pairs = [
        (a, b) for a in range(n_features) for b in range(a + 1, n_features)
    ]
    for pair_idx in np.random.choice(len(feature_pairs), 2, replace=False):
        a, b = feature_pairs[pair_idx]
        # Mostly-zero lookup table, i.e. a sparse interaction effect.
        pair_table = np.random.choice(
            [-1, 0, 1], size=(n_categories, n_categories), p=[0.05, 0.9, 0.05]
        )
        y += pair_table[X[:, a], X[:, b]]

    # Non-linear (quadratic) effect of a single feature's level.
    curved_col = np.random.choice(n_features)
    half_range = n_categories / 2
    y += np.square(X[:, curved_col] / half_range - 1)

    # Standardize, then add noise.
    y = (y - y.mean()) / y.std()
    y += np.random.normal(0, 0.1, n_samples)

    return X, y

In [None]:
def prepare_data(X):
    """One-hot encode ``X`` and return ``(sparse_encoding, dense_encoding)``.

    A new encoder is fitted on ``X`` on every call, so the resulting columns
    reflect only the levels present in ``X``.
    """
    sparse_encoded = OneHotEncoder(sparse_output=True).fit_transform(X)
    # dense - use pd.get_dummies to mimic current practice
    frame = pd.DataFrame(X)
    dense_encoded = pd.get_dummies(frame, columns=frame.columns).values
    return sparse_encoded, dense_encoded

def fit_and_measure(X_train, y_train, X_test, y_test):
    """Fit an ``LGBMRegressor`` and report its cost and test-set accuracy.

    Only the call to ``fit`` is timed and memory-profiled; prediction on the
    test set happens after the measurements are taken.

    Returns
    -------
    tuple
        ``(runtime, memory_used, mse, r2)``: fit time in seconds, change in
        process RSS in MB across the fit, and mean squared error and R^2 on
        the test set.
    """
    start_memory = get_memory_usage()
    # perf_counter is monotonic and meant for measuring elapsed time, unlike
    # the wall clock, which can jump if the system time is adjusted.
    start_time = time.perf_counter()
    m = LGBMRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, verbose=-1)
    m.fit(X_train, y_train)
    end_time = time.perf_counter()
    end_memory = get_memory_usage()
    runtime = end_time - start_time
    memory_used = end_memory - start_memory

    # Compute accuracy metrics
    y_pred = m.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    return runtime, memory_used, mse, r2

In [None]:
X, y = generate_dummy_data()
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# prepare_data fits a fresh encoder on every call, so encoding the two splits
# separately only yields matching columns if every level occurs in both.
# Encode the stacked rows in one pass and slice them apart instead; the dense
# slices are views, so no additional dense copy is made.
n_train = X_train.shape[0]
X_all_sparse, X_all_dense = prepare_data(np.vstack([X_train, X_test]))
X_train_sparse, X_test_sparse = X_all_sparse[:n_train], X_all_sparse[n_train:]
X_train_dense, X_test_dense = X_all_dense[:n_train], X_all_dense[n_train:]

In [None]:
%%time
# Fit and score on the sparse encoding, then collect garbage before the next run.
sparse_runtime, sparse_memory, sparse_mse, sparse_r2 = fit_and_measure(X_train_sparse, y_train, X_test_sparse, y_test)
gc.collect()

In [None]:
%%time
# Fit and score on the dense encoding of the same split, then collect garbage.
dense_runtime, dense_memory, dense_mse, dense_r2 = fit_and_measure(X_train_dense, y_train, X_test_dense, y_test)
gc.collect()

In [None]:
# Mypy can't find these names/variables since they are assigned to via cell-magic.
print(
    f"Sparse data - Runtime: {sparse_runtime:.2f}s, "  # type: ignore[name-defined]
    f"Memory used: {sparse_memory:.2f}MB, "  # type: ignore[name-defined]
    f"MSE: {sparse_mse:.4f}, R2: {sparse_r2:.4f}"  # type: ignore[name-defined]
)
print(
    f"Dense data - Runtime: {dense_runtime:.2f}s, "  # type: ignore[name-defined]
    f"Memory used: {dense_memory:.2f}MB, "  # type: ignore[name-defined]
    f"MSE: {dense_mse:.4f}, R2: {dense_r2:.4f}"  # type: ignore[name-defined]
)

# A CSR matrix stores three arrays (values, column indices, row pointers);
# count all of them so the comparison with the dense array is fair.
sparse_train_nbytes = (
    X_train_sparse.data.nbytes
    + X_train_sparse.indices.nbytes
    + X_train_sparse.indptr.nbytes
)
print(f"\nSparse data memory: {sparse_train_nbytes / 1024 / 1024:.2f}MB")
print(f"Dense data memory: {X_train_dense.nbytes / 1024 / 1024:.2f}MB")