In [1]:
import polars as pl

import rustystats as rs

import numpy as np


In [2]:

# Load the severity dataset from a commit-pinned URL and keep only rows whose
# ClaimAmount lies strictly between 0 and 50000.
data = pl.read_parquet(
    "https://raw.githubusercontent.com/PricingFrontier/pricing-data-example/917c853e256df8d5814721ab56f72889a908bb08/data/processed/severity_set.parquet"
).filter(
    (pl.col("ClaimAmount") > 0) & (pl.col("ClaimAmount") < 50000)
)

# Split on the "Group" column: groups up to "3" train the model, "4" is the
# test split and "5" is the holdout split.
# NOTE(review): the comparisons use string literals, so `<= "3"` is presumably
# lexicographic -- fine for single-character labels, confirm the column dtype.
holdout_data = data.filter(pl.col("Group") == "5")
test_data = data.filter(pl.col("Group") == "4")
train_data = data.filter(pl.col("Group") <= "3")

In [3]:
# Explore the training split before modelling: ClaimAmount is the response,
# examined against the categorical and continuous factor columns listed below.
# Each factor is listed exactly once ("Region" was previously passed twice).
exploration = rs.explore_data(
    data=train_data,
    response="ClaimAmount",
    categorical_factors=["Region", "Area", "VehBrand", "VehGas"],
    continuous_factors=["VehPower", "VehAge", "DrivAge", "BonusMalus"],
)

In [4]:
# Term specification keyed by column name; each entry names the term type to
# use for that column ('ns' for VehAge and BonusMalus, 'target_encoding' for
# Area -- see the rustystats docs for the meaning of each type).
terms = {
    column: {'type': term_type}
    for column, term_type in (
        ('VehAge', 'ns'),
        ('BonusMalus', 'ns'),
        ('Area', 'target_encoding'),
    )
}

# Build a gamma-family GLM for ClaimAmount on the training split and fit it.
result = rs.glm_dict(
    data=train_data,
    response='ClaimAmount',
    family='gamma',
    terms=terms,
).fit()


In [5]:
# Print the fitted model's text summary (fit statistics and coefficient table).
print(result.summary())

                                 GLM Results                                  

Family:              Gamma           No. Observations:         15829
Link Function:       (default)       Df Residuals:             15809
Method:              IRLS + Ridge    Df Model:                    19
Scale:               2.9243          Alpha (λ):             111.0518
L1 Ratio:            0.00            Iterations:                   6
Non-zero coefs:      19             

Log-Likelihood:         -137490.1116 Deviance:                 16892.9494
AIC:                     275020.2233 Null Deviance:            16985.6583
BIC:                     275173.6152 Pearson chi2:               46230.93
Converged:           True           

------------------------------------------------------------------------------
Variable                  Coef    Std.Err        z    P>|z|                 95% CI     
------------------------------------------------------------------------------
Intercept               7.2595

In [6]:
# Run the fitted model's diagnostics, supplying both the training and test
# splits together with the factor columns to analyse.
diagnostics = result.diagnostics(
    train_data=train_data,
    test_data=test_data,
    categorical_factors=[
        "Region",
        "Area",
        "VehBrand",
        "VehGas",
    ],
    continuous_factors=[
        "VehPower",
        "VehAge",
        "DrivAge",
        "BonusMalus",
        "Density",
    ],
)

In [7]:
# Score every row of the holdout split with the fitted model.
result.predict(holdout_data)

array([1771.54629005, 1584.20190807, 1681.94235846, ..., 1654.6041677 ,
       1633.03067019, 1734.15067297], shape=(5326,))

In [8]:
# Score only the first five holdout rows as a quick spot check.
result.predict(holdout_data.head(5))

array([1771.54629005, 1584.20190807, 1681.94235846, 1855.81613443,
       1642.93403728])

In [9]:
# Number of rows in the holdout split.
holdout_data.height

5326