In [3]:
import polars as pl

import rustystats as rs

import numpy as np


In [4]:

# Severity dataset, pinned to a specific commit of the example-data repo so
# the notebook keeps reading the same file on every run.
SEVERITY_URL = "https://raw.githubusercontent.com/PricingFrontier/pricing-data-example/917c853e256df8d5814721ab56f72889a908bb08/data/processed/severity_set.parquet"

# Keep only rows with a strictly positive claim amount below 50,000.
data = (
    pl.read_parquet(SEVERITY_URL)
    .filter(pl.col("ClaimAmount") > 0)
    .filter(pl.col("ClaimAmount") < 50000)
)

# Split on the "Group" column. The comparisons are against string literals,
# so `<= "3"` is a string comparison.
# NOTE(review): assumes "Group" is a string column holding single-character
# labels ("1".."5") - confirm against the dataset schema.
group = pl.col("Group")
train_data = data.filter(group <= "3")
test_data = data.filter(group == "4")
holdout_data = data.filter(group == "5")

In [5]:
# Pre-fit exploration of the response against the candidate rating factors.
# Each factor is listed exactly once: the original list named "Region" twice,
# which asked for the same factor to be explored twice.
# NOTE(review): the diagnostics call later in this notebook also passes
# "Density" as a continuous factor; it is not included here - confirm whether
# that omission is intentional.
exploration = rs.explore_data(
    data=data,
    response="ClaimAmount",
    categorical_factors=["Region", "Area", "VehBrand", "VehGas"],
    continuous_factors=["VehPower", "VehAge", "DrivAge", "BonusMalus"],
)

In [6]:
# Every modelled factor uses the same term specification, so build the
# term dictionary from the list of factor names.
terms = {
    factor: {'type': 'target_encoding'}
    for factor in ('Region', 'Area', 'VehBrand')
}

# Specify a gamma-family GLM for the claim amount, then fit it.
# NOTE(review): the model is fit on the full `data` frame, not on
# `train_data` - confirm this is intended.
glm_spec = rs.glm_dict(
    response='ClaimAmount',
    terms=terms,
    data=data,
    family='gamma',
)
result = glm_spec.fit()

In [7]:
# Print the fitted model's summary; `print` keeps the table's line breaks
# intact rather than showing the value's repr.
summary_text = result.summary()
print(summary_text)

                                 GLM Results                                  

Family:              Gamma           No. Observations:         26354
Link Function:       (default)       Df Residuals:             26350
Method:              IRLS            Df Model:                     3
Scale:               3.0079          Iterations:                   6

Log-Likelihood:         -229185.3493 Deviance:                 28445.3518
AIC:                     458378.6986 Null Deviance:            28513.7001
BIC:                     458411.4161 Pearson chi2:               79258.52
Converged:           True           

------------------------------------------------------------------------------
Variable               Coef    Std.Err        z    P>|z|                 95% CI     
------------------------------------------------------------------------------
Intercept            6.6055     0.2787   23.702  <0.0001   [  6.0593,   7.1518]  ***
TE(Region)           0.0003     0.0001    4.210  <0.000

In [8]:
# Factors to break the diagnostics down by, grouped by how they are passed
# to the diagnostics call.
diag_categorical = ["Region", "Area", "VehBrand", "VehGas"]
diag_continuous = ["VehPower", "VehAge", "DrivAge", "BonusMalus", "Density"]

# Run the fitted model's diagnostics over the same frame it was fit on.
diagnostics = result.diagnostics(
    data,
    categorical_factors=diag_categorical,
    continuous_factors=diag_continuous,
)