In [1]:
import polars as pl

import rustystats as rs

# Load the frequency modelling set. The URL embeds a 40-character hash segment
# (presumably a commit SHA — verify), so reruns should read the same file.
data = pl.read_parquet(
    "https://raw.githubusercontent.com/PricingFrontier/pricing-data-example/"
    "917c853e256df8d5814721ab56f72889a908bb08/"
    "data/processed/frequency_set.parquet"
)

In [2]:
# Hold out the rows whose Group is "5" as the test split; every other row
# goes into the training split.
train_data = data.filter(pl.col("Group").ne("5"))
test_data = data.filter(pl.col("Group").eq("5"))

In [3]:
# Explore the training split before modelling; per the original note this
# creates analysis/exploration.json.
exploration = rs.explore_data(
    data=train_data,
    response="ClaimCount",
    exposure="Exposure",
    categorical_factors=[
        "Region",
        "Area",
        "VehBrand",
        "VehGas",
    ],
    continuous_factors=[
        "VehPower",
        "VehAge",
        "DrivAge",
        "BonusMalus",
    ],
)

In [4]:
# Show the distinct Region values in the full dataset (training and test rows).
data.select(pl.col("Region")).unique()

Region
str
"""Haute-Normandie"""
"""Limousin"""
"""Franche-Comte"""
"""Rhone-Alpes"""
"""Champagne-Ardenne"""
…
"""Poitou-Charentes"""
"""Bourgogne"""
"""Languedoc-Roussillon"""
"""Corse"""


In [5]:
# Fit two negative-binomial models on the training split. The two fits differ
# only in the formula: `model` includes TE(VehBrand), `model2` leaves it out.
# NOTE(review): "Exposure" is passed as the offset column as-is — confirm how
# rustystats transforms it (a log-link offset is normally log(exposure)).
model = rs.glm(
    data=train_data,
    family="negbinomial",
    offset="Exposure",
    formula="ClaimCount ~ TE(Region) + TE(Area) + TE(VehBrand) + VehAge + BonusMalus + DrivAge",
).fit()

# Same specification without the TE(VehBrand) term.
model2 = rs.glm(
    data=train_data,
    family="negbinomial",
    offset="Exposure",
    formula="ClaimCount ~ TE(Region) + TE(Area) + VehAge + BonusMalus + DrivAge",
).fit()

In [6]:
# Print the fit summary for `model` (the fit whose formula includes TE(VehBrand)).
# NOTE(review): the recorded output reports Deviance (157458.87) above
# Null Deviance (58099.85) — confirm this is expected for this fit.
print(model.summary())

                                 GLM Results                                  

Family:              NegativeBinomial(theta=1.8927) No. Observations:        542055
Link Function:       (default)       Df Residuals:            542048
Method:              IRLS + Ridge    Df Model:                     6
Scale:               0.2905          Alpha (λ):               0.0000
L1 Ratio:            0.00            Iterations:                   6
Non-zero coefs:      6              

Log-Likelihood:         -112675.1525 Deviance:                157458.8692
AIC:                     225364.3049 Null Deviance:            58099.8532
BIC:                     225442.7268 Pearson chi2:             1312678.64
Converged:           True           

------------------------------------------------------------------------------
Variable               Coef    Std.Err        z    P>|z|                 95% CI     
------------------------------------------------------------------------------
Intercept          

In [7]:
# Run diagnostics for `model` against both splits; per the original note this
# writes analysis/diagnostics.
model.diagnostics(
    train_data=train_data,
    test_data=test_data,
    categorical_factors=[
        "Region",
        "Area",
        "VehBrand",
        "VehGas",
    ],
    continuous_factors=[
        "VehPower",
        "VehAge",
        "DrivAge",
        "BonusMalus",
    ],
)



In [8]:
# Run the same diagnostics for `model2`; per the original note this writes
# analysis/diagnostics.
# NOTE(review): the diagnostics cell for `model` carries the same note —
# confirm whether this run replaces that output.
model2.diagnostics(
    train_data=train_data,
    test_data=test_data,
    categorical_factors=[
        "Region",
        "Area",
        "VehBrand",
        "VehGas",
    ],
    continuous_factors=[
        "VehPower",
        "VehAge",
        "DrivAge",
        "BonusMalus",
    ],
)

