In [1]:
"""
RustyStats vs Glum Performance Comparison
==========================================
High-level benchmarks comparing fitting time and memory usage.
"""
import time
import threading
import gc
import psutil
import numpy as np
import polars as pl
import pandas as pd

import rustystats as rs
from glum import GeneralizedLinearRegressor

In [2]:
# Helper functions for timing and memory measurement

def time_fn(func, n_runs=3):
    """Time ``func`` repeatedly and report the median wall-clock duration.

    Args:
        func: Zero-argument callable to benchmark; its return value is ignored.
        n_runs: Number of timed invocations.

    Returns:
        Median elapsed time in seconds, measured with ``time.perf_counter``.
    """
    def timed_once():
        # Collect garbage before starting the clock so the collection itself
        # is not part of the measured interval.
        gc.collect()
        t0 = time.perf_counter()
        func()
        return time.perf_counter() - t0

    return np.median([timed_once() for _ in range(n_runs)])


def measure_memory(func):
    """Estimate the peak resident-memory growth while ``func`` runs, in MB.

    A background thread polls the process RSS about every 10 ms while
    ``func`` executes. The result is the highest sample seen minus the RSS
    taken just before the call. This is a sampling estimate of process-wide
    RSS: spikes shorter than the polling interval can be missed, so small or
    fast workloads may legitimately report 0.0.

    Args:
        func: Zero-argument callable to run; its return value is discarded.

    Returns:
        ``(peak_rss - baseline_rss)`` in MB. Never negative, because the peak
        starts at the baseline and only ever increases.

    Raises:
        Whatever ``func`` raises. The monitor thread is always stopped and
        joined before the exception propagates.
    """
    process = psutil.Process()
    gc.collect()

    baseline = process.memory_info().rss
    peak_mem = baseline
    stop_flag = threading.Event()

    def monitor():
        nonlocal peak_mem
        while True:
            peak_mem = max(peak_mem, process.memory_info().rss)
            # Event.wait serves as the polling delay and returns True as soon
            # as the stop flag is set, so shutdown does not wait a full tick.
            if stop_flag.wait(0.01):
                break

    # daemon=True: a stuck monitor must never keep the interpreter alive.
    monitor_thread = threading.Thread(target=monitor, daemon=True)
    monitor_thread.start()
    try:
        func()
    finally:
        # Stop the sampler even when func() raises; otherwise the thread
        # would poll forever.
        stop_flag.set()
        monitor_thread.join()

    # One last sample, taken after the join so only this thread touches
    # peak_mem; it captures memory still held once func() has returned.
    peak_mem = max(peak_mem, process.memory_info().rss)

    return (peak_mem - baseline) / (1024 * 1024)

In [3]:
# Generate synthetic data for benchmarks

def generate_data(n_rows, n_cont=5, n_cat=5, cat_levels=10, family="gaussian", seed=42):
    """Generate a synthetic GLM benchmark dataset.

    The frame has ``n_cont`` standard-normal columns ``x0..``, ``n_cat``
    string-typed categorical columns ``c0..`` (integer levels in
    ``[0, cat_levels)`` rendered as strings) and a response ``y``. The
    response depends only on the continuous columns through a random linear
    predictor; the categorical columns carry no signal.

    Args:
        n_rows: Number of rows to generate.
        n_cont: Number of continuous feature columns.
        n_cat: Number of categorical feature columns.
        cat_levels: Number of distinct levels per categorical column.
        family: Response distribution: ``"gaussian"``, ``"poisson"``,
            ``"binomial"`` or ``"gamma"``.
        seed: Seed for ``numpy.random.default_rng``; the same seed gives the
            same data.

    Returns:
        Tuple ``(df_polars, df_pandas)`` holding the same data as a polars
        DataFrame and its pandas conversion.

    Raises:
        ValueError: If ``family`` is not one of the supported names.
    """
    supported = ("gaussian", "poisson", "binomial", "gamma")
    # Fail fast with a clear message; otherwise an unknown family falls
    # through the chain below and fails later because `y` is never assigned.
    if family not in supported:
        raise ValueError(f"Unknown family {family!r}; expected one of {supported}")

    rng = np.random.default_rng(seed)

    X_cont = rng.standard_normal((n_rows, n_cont))
    X_cat = rng.integers(0, cat_levels, (n_rows, n_cat))

    # Linear predictor built from the continuous features only.
    beta = rng.standard_normal(n_cont) * 0.3
    lp = X_cont @ beta

    if family == "gaussian":
        y = lp + rng.standard_normal(n_rows) * 0.5
    elif family == "poisson":
        # Damp and clip the predictor so the rate stays within exp(-3)..exp(3).
        y = rng.poisson(np.exp(np.clip(lp * 0.3, -3, 3))).astype(np.float64)
    elif family == "binomial":
        # Bernoulli draws with a logistic link.
        y = rng.binomial(1, 1 / (1 + np.exp(-lp))).astype(np.float64)
    else:  # gamma
        # Shape 2.0 with scale mean/2; floor at 0.001 keeps y strictly positive.
        y = np.maximum(rng.gamma(2.0, np.exp(np.clip(lp * 0.3 + 2, 0.5, 5)) / 2.0), 0.001)

    # Build dataframes
    data = {f"x{i}": X_cont[:, i] for i in range(n_cont)}
    data.update({f"c{i}": X_cat[:, i].astype(str) for i in range(n_cat)})
    data["y"] = y

    df_polars = pl.DataFrame(data)
    df_pandas = df_polars.to_pandas()

    return df_polars, df_pandas


# Row count used for every synthetic dataset. No data is built here: the
# frames are generated per family inside the benchmark loop via
# generate_data(N_ROWS, family=...).
N_ROWS = 100_000
print(f"Generating data with {N_ROWS:,} rows...")

Generating data with 100,000 rows...


In [4]:
# Benchmark configurations

# Model specifications to benchmark, keyed by a short label. Insertion order
# is the order in which the benchmark loop visits them.
FORMULAS = dict(
    simple="y ~ x0 + x1 + x2",
    categorical="y ~ x0 + x1 + C(c0) + C(c1)",
    full="y ~ x0 + x1 + x2 + x3 + x4 + C(c0) + C(c1) + C(c2) + C(c3) + C(c4)",
)

# Response distributions to benchmark.
FAMILIES = ["gaussian", "poisson", "binomial", "gamma"]

# Accumulator: one dict per (family, formula) combination.
results = []

In [5]:
# Run benchmarks

# Reset the accumulator so re-running this cell replaces earlier rows instead
# of appending duplicates, which would skew the overall averages.
results = []

for family in FAMILIES:
    print(f"\n{'='*50}")
    print(f"Family: {family.upper()}")
    print('='*50)

    df_pl, df_pd = generate_data(N_ROWS, family=family)

    # Build design matrix for glum (one-hot encode categoricals).
    # NOTE: this encoding happens outside the timed region, whereas rs.glm
    # receives the raw frame and formula inside its timed call. Any
    # design-matrix work RustyStats does is therefore included in rs_time.
    cat_cols = [c for c in df_pd.columns if c.startswith('c')]
    X_glum = pd.get_dummies(df_pd.drop(columns=['y']), columns=cat_cols, drop_first=True)
    y_glum = df_pd['y'].values
    X_sub = None

    for formula_name, formula in FORMULAS.items():
        print(f"\n  Formula: {formula_name}")

        # --- RustyStats ---
        def fit_rs():
            return rs.glm(formula, data=df_pl, family=family).fit()

        # Timing runs first, so the memory measurement sees a warmed-up process.
        rs_time = time_fn(fit_rs)
        rs_mem = measure_memory(fit_rs)

        # --- Glum ---
        # Select the dummy-encoded columns that correspond to the formula.
        if formula_name == "simple":
            X_sub = X_glum[['x0', 'x1', 'x2']]
        elif formula_name == "categorical":
            cols = ['x0', 'x1'] + [c for c in X_glum.columns if c.startswith(('c0_', 'c1_'))]
            X_sub = X_glum[cols]
        else:  # full
            X_sub = X_glum

        # glum names the Gaussian family 'normal'.
        glum_family = 'normal' if family == 'gaussian' else family

        def fit_glum():
            model = GeneralizedLinearRegressor(family=glum_family, alpha=0, fit_intercept=True)
            return model.fit(X_sub, y_glum)

        glum_time = time_fn(fit_glum)
        glum_mem = measure_memory(fit_glum)

        # NaN rather than 0 for an unmeasurable ratio, so it cannot drag down
        # the average speedup.
        speedup = glum_time / rs_time if rs_time > 0 else np.nan

        results.append({
            'family': family,
            'formula': formula_name,
            'rs_time': rs_time,
            'rs_mem': rs_mem,
            'glum_time': glum_time,
            'glum_mem': glum_mem,
            'speedup': speedup
        })

        print(f"    RustyStats: {rs_time:.3f}s, {rs_mem:.1f}MB")
        print(f"    Glum:       {glum_time:.3f}s, {glum_mem:.1f}MB")
        print(f"    Speedup:    {speedup:.2f}x")

    # X_sub (which is X_glum itself for the "full" formula) and y_glum also
    # reference this family's data; drop them too so it can actually be freed
    # before the next family is generated.
    del df_pl, df_pd, X_glum, X_sub, y_glum
    gc.collect()


Family: GAUSSIAN

  Formula: simple
    RustyStats: 0.068s, 0.0MB
    Glum:       0.238s, 0.0MB
    Speedup:    3.52x

  Formula: categorical
    RustyStats: 0.220s, 10.7MB
    Glum:       0.659s, 4.1MB
    Speedup:    3.00x

  Formula: full
    RustyStats: 0.453s, 73.3MB
    Glum:       0.875s, 44.8MB
    Speedup:    1.93x

Family: POISSON

  Formula: simple
    RustyStats: 0.135s, 0.0MB
    Glum:       0.420s, 0.0MB
    Speedup:    3.12x

  Formula: categorical
    RustyStats: 0.277s, 0.3MB
    Glum:       0.991s, 4.3MB
    Speedup:    3.57x

  Formula: full
    RustyStats: 0.564s, 39.5MB
    Glum:       1.664s, 45.0MB
    Speedup:    2.95x

Family: BINOMIAL

  Formula: simple
    RustyStats: 0.113s, 0.0MB
    Glum:       0.371s, 0.3MB
    Speedup:    3.29x

  Formula: categorical
    RustyStats: 0.276s, 0.0MB
    Glum:       1.114s, 4.3MB
    Speedup:    4.04x

  Formula: full
    RustyStats: 0.549s, 39.0MB
    Glum:       1.699s, 44.8MB
    Speedup:    3.09x

Family: GAMMA

  Form

In [6]:
# Summary table

results_df = pd.DataFrame(results)


def _pivot_by_family(values):
    """Return a family-by-formula table of the given result column(s)."""
    return results_df.pivot_table(
        index='family',
        columns='formula',
        values=values,
        aggfunc='first',
    )


_rule = '=' * 70
print(f"\n{_rule}")
print(f"SUMMARY - RustyStats vs Glum ({N_ROWS:,} rows)")
print(_rule)

# Fit times, one sub-table per library
print("\n### Fit Time (seconds)")
time_pivot = _pivot_by_family(['rs_time', 'glum_time'])
print(time_pivot.round(3).to_string())

# Peak memory, one sub-table per library
print("\n### Peak Memory (MB)")
mem_pivot = _pivot_by_family(['rs_mem', 'glum_mem'])
print(mem_pivot.round(1).to_string())

# Relative speed of RustyStats against glum
print("\n### Speedup (glum_time / rs_time)")
speedup_pivot = _pivot_by_family('speedup')
print(speedup_pivot.round(2).to_string())

# Column means across every family/formula combination.
# Each entry: (padded label, result column, format spec, unit suffix).
print("\n### Overall Averages")
_overall_rows = [
    ("RustyStats avg time: ", 'rs_time', ".3f", "s"),
    ("Glum avg time:       ", 'glum_time', ".3f", "s"),
    ("RustyStats avg mem:  ", 'rs_mem', ".1f", "MB"),
    ("Glum avg mem:        ", 'glum_mem', ".1f", "MB"),
    ("Average speedup:     ", 'speedup', ".2f", "x"),
]
for _label, _column, _spec, _unit in _overall_rows:
    print(f"{_label}{results_df[_column].mean():{_spec}}{_unit}")


SUMMARY - RustyStats vs Glum (100,000 rows)

### Fit Time (seconds)
           glum_time                   rs_time              
formula  categorical   full simple categorical   full simple
family                                                      
binomial       1.114  1.699  0.371       0.276  0.549  0.113
gamma          1.012  1.667  0.352       0.345  0.730  0.144
gaussian       0.659  0.875  0.238       0.220  0.453  0.068
poisson        0.991  1.664  0.420       0.277  0.564  0.135

### Peak Memory (MB)
            glum_mem                   rs_mem             
formula  categorical  full simple categorical  full simple
family                                                    
binomial         4.3  44.8    0.3         0.0  39.0    0.0
gamma            4.3  44.8    0.0         0.0  39.4    0.0
gaussian         4.1  44.8    0.0        10.7  73.3    0.0
poisson          4.3  45.0    0.0         0.3  39.5    0.0

### Speedup (glum_time / rs_time)
formula   categorical  full  simpl