In [1]:
import polars as pl

import rustystats as rs

data = pl.read_parquet("https://raw.githubusercontent.com/PricingFrontier/pricing-data-example/917c853e256df8d5814721ab56f72889a908bb08/data/processed/frequency_set.parquet")

In [2]:
train_data = data.filter(pl.col("Group") != "5")
test_data = data.filter(pl.col("Group") == "5")

In [3]:
#creates analysis/exploration.json
exploration = rs.explore_data(
    data=train_data,
    response="ClaimCount",
    categorical_factors=["Region", "Area", "VehBrand", "VehGas"],
    continuous_factors=["VehPower", "VehAge", "DrivAge", "BonusMalus" ],
    exposure="Exposure",
)

In [4]:
# CV-based regularization selection (recommended)
result = rs.glm(
    "ClaimCount ~ VehAge + BonusMalus + TE(Region)", 
    data, 
    family="negbinomial", 
    offset="Exposure").fit(
    regularization="elastic_net",
)

print(f"Selected alpha: {result.alpha}")
print(f"CV deviance: {result.cv_deviance}")

Selected alpha: 1e-06
CV deviance: 0.2757213187955567


In [5]:
print(result.summary())

                                 GLM Results                                  

Family:              NegativeBinomial(theta=1.5420) No. Observations:        678012
Link Function:       (default)       Df Residuals:            678008
Method:              IRLS + Ridge    Df Model:                     3
Scale:               0.2873          Alpha (Î»):               0.0000
L1 Ratio:            0.00            Iterations:                  10
Non-zero coefs:      3              

Log-Likelihood:         -141369.1430 Deviance:                194792.8018
AIC:                     282746.2860 Null Deviance:            70452.7426
BIC:                     282791.9937 Pearson chi2:             1663586.35
Converged:           True           

------------------------------------------------------------------------------
Variable               Coef    Std.Err        z    P>|z|                 95% CI     
------------------------------------------------------------------------------
Intercept         

In [6]:
#writes analysis/diagnostics
result.diagnostics(
    train_data=train_data,
    test_data = test_data,
    categorical_factors=["Region", "Area", "VehBrand", "VehGas"],
    continuous_factors=["VehPower", "VehAge", "DrivAge", "BonusMalus" ]
)

