In [1]:
import polars as pl

import rustystats as rs

# The URL embeds a commit hash, so every run reads the same snapshot of the file.
DATA_URL = "https://raw.githubusercontent.com/PricingFrontier/pricing-data-example/917c853e256df8d5814721ab56f72889a908bb08/data/processed/frequency_set.parquet"
data = pl.read_parquet(DATA_URL)

# Group labels are compared as strings (see the "4" / "5" filters below), so an
# ordering test such as `<= "3"` is lexicographic and would also admit labels
# like "10". Select the training folds by explicit membership instead.
# NOTE(review): assumes the training folds are labelled "1", "2" and "3" — confirm against the data.
TRAIN_GROUPS = ["1", "2", "3"]

train_data = data.filter(pl.col("Group").is_in(TRAIN_GROUPS))
test_data = data.filter(pl.col("Group") == "4")
holdout_data = data.filter(pl.col("Group") == "5")

In [2]:
# Explore the training split only; per the original note this
# creates analysis/exploration.json
explore_categorical = ["Region", "Area", "VehBrand", "VehGas"]
explore_continuous = ["VehPower", "VehAge", "DrivAge", "BonusMalus"]

exploration = rs.explore_data(
    data=train_data,
    response="ClaimCount",
    categorical_factors=explore_categorical,
    continuous_factors=explore_continuous,
    exposure="Exposure",
)

In [3]:
# Fit a Poisson claim-frequency GLM on the training split.
# fit() is called with no arguments, so no regularization or CV settings are
# passed here.
# NOTE(review): confirm how rustystats treats a named offset column
# (raw Exposure vs. log(Exposure)) for the Poisson family.
result = rs.glm(
    "ClaimCount ~ TE(Region) + BonusMalus + I(BonusMalus ** 2) + ns(VehAge, df=4) + DrivAge + I(DrivAge ** 2) + TE(VehBrand) + TE(Area)",
    train_data,
    family="poisson",
    offset="Exposure",
).fit()


In [4]:
# Show the fitted model's summary report (fit statistics and coefficient table).
# print() is used so the report renders as laid-out text
# (assumes summary() returns a plain string — TODO confirm).
print(result.summary())

                                 GLM Results                                  

Family:              Poisson         No. Observations:        406439
Link Function:       (default)       Df Residuals:            406428
Method:              IRLS            Df Model:                    10
Scale:               1.0000          Iterations:                   7

Log-Likelihood:          -84729.3794 Deviance:                127911.1092
AIC:                     169480.7589 Null Deviance:           132328.3090
BIC:                     169600.8260 Pearson chi2:          4161425069.53
Converged:           True           

------------------------------------------------------------------------------
Variable                 Coef    Std.Err        z    P>|z|                 95% CI     
------------------------------------------------------------------------------
Intercept             -4.7226     0.1342  -35.197  <0.0001   [ -4.9856,  -4.4596]  ***
BonusMalus             0.0337     0.0023   14.729  

In [5]:
#writes analysis/diagnostics
# Pass the separate test split as `test_data`. Passing `train_data` for both
# arguments would compute the "test" diagnostics on the rows the model was
# fitted on.
result.diagnostics(
    train_data=train_data,
    test_data=test_data,
    categorical_factors=["Region", "Area", "VehBrand", "VehGas"],
    continuous_factors=["VehPower", "VehAge", "DrivAge", "BonusMalus"],
)

