In [1]:
import polars as pl

import rustystats as rs

# Frequency dataset, read straight from GitHub. The URL embeds a commit hash,
# so the file being read is tied to that specific revision.
FREQUENCY_SET_URL = (
    "https://raw.githubusercontent.com/PricingFrontier/pricing-data-example/"
    "917c853e256df8d5814721ab56f72889a908bb08/"
    "data/processed/frequency_set.parquet"
)
data = pl.read_parquet(FREQUENCY_SET_URL)

In [2]:
# Rows whose "Group" equals "5" form the test set; every other group is used
# for training.
group = pl.col("Group")
train_data = data.filter(group.ne("5"))
test_data = data.filter(group.eq("5"))

In [3]:
# Explore the training data against the claim-count response.
# Creates analysis/exploration.json.
explore_categoricals = ["Region", "Area", "VehBrand", "VehGas"]
explore_continuous = ["VehPower", "VehAge", "DrivAge", "BonusMalus"]

exploration = rs.explore_data(
    data=train_data,
    response="ClaimCount",
    categorical_factors=explore_categoricals,
    continuous_factors=explore_continuous,
    exposure="Exposure",
)

In [4]:
# Fit the claim-count GLM on the training rows.
# The right-hand side is listed one term per line and joined with " + " so
# individual terms are easy to read, add, or remove.
formula_terms = [
    "TE(VehBrand)",
    "bs(BonusMalus, df=4)",
    "bs(VehAge, df=4)",
    "bs(DrivAge, df=4)",
    "C(Area)",
    "bs(VehPower, df=4)",
    "TE(VehGas)",
    # Interactions of Area with the VehAge and DrivAge spline terms.
    "C(Area):bs(VehAge, df=4)",
    "C(Area):bs(DrivAge, df=4)",
]
model_formula = "ClaimCount ~ " + " + ".join(formula_terms)

model_spec = rs.glm(
    formula=model_formula,
    data=train_data,
    family="negbinomial",
    offset="Exposure",
)
model = model_spec.fit()

In [5]:
# Print the fitted model's summary report.
summary_text = model.summary()
print(summary_text)

                                 GLM Results                                  

Family:              NegativeBinomial No. Observations:        542055
Link Function:       (default)       Df Residuals:            542005
Method:              IRLS + Ridge    Df Model:                    49
Scale:               0.2899          Alpha (λ):               0.0000
L1 Ratio:            0.00            Iterations:                   9
Non-zero coefs:      49             

Log-Likelihood:         -112458.0469 Deviance:                157110.4632
AIC:                     225016.0939 Null Deviance:           149116.5514
BIC:                     225576.2500 Pearson chi2:             1402623.48
Converged:           True           

------------------------------------------------------------------------------
Variable                         Coef    Std.Err        z    P>|z|                 95% CI     
------------------------------------------------------------------------------
Intercept             

In [6]:
# Run model diagnostics on the train and test sets over the listed factors.
# Writes analysis/diagnostics.
diagnostic_categoricals = ["Region", "Area", "VehBrand", "VehGas"]
diagnostic_continuous = ["VehPower", "VehAge", "DrivAge", "BonusMalus"]

model.diagnostics(
    train_data=train_data,
    test_data=test_data,
    categorical_factors=diagnostic_categoricals,
    continuous_factors=diagnostic_continuous,
)

