In [1]:
import polars as pl

import rustystats as rs

import numpy as np


In [2]:

# Load the severity dataset (URL pinned to a specific commit) and keep only
# claims with a strictly positive amount below 50000.
data = (
    pl.read_parquet("https://raw.githubusercontent.com/PricingFrontier/pricing-data-example/917c853e256df8d5814721ab56f72889a908bb08/data/processed/severity_set.parquet")
    .filter(pl.col("ClaimAmount") > 0)
    .filter(pl.col("ClaimAmount") < 50000)
)

# Split on the "Group" column: up to "3" for training, "4" for testing and
# "5" for the final holdout.
# NOTE(review): Group is compared against string literals, so `<= "3"` is a
# lexicographic comparison — confirm the labels are single-digit strings.
holdout_data = data.filter(pl.col("Group") == "5")
test_data = data.filter(pl.col("Group") == "4")
train_data = data.filter(pl.col("Group") <= "3")

In [3]:
# Explore the training split against the ClaimAmount response.
# Each factor is listed exactly once ("Region" must not be repeated in the
# categorical list).
# NOTE(review): the diagnostics call later in this notebook also passes
# "Density" as a continuous factor — confirm whether it belongs here too.
exploration = rs.explore_data(
    data=train_data,
    response="ClaimAmount",
    categorical_factors=["Region", "Area", "VehBrand", "VehGas"],
    continuous_factors=["VehPower", "VehAge", "DrivAge", "BonusMalus"],
)

In [4]:
# Term specification for the GLM: VehAge and BonusMalus use the 'ns' term type
# (presumably natural splines — confirm in the rustystats docs) and Area is
# target-encoded. Insertion order matches VehAge, BonusMalus, Area.
terms = {factor: {"type": "ns"} for factor in ("VehAge", "BonusMalus")}
terms["Area"] = {"type": "target_encoding"}

# Build the gamma-family GLM on the training split and fit it immediately.
result = rs.glm_dict(
    data=train_data,
    response="ClaimAmount",
    family="gamma",
    terms=terms,
).fit()


In [5]:
# Print the fitted model's text summary (fit statistics and coefficient table).
print(result.summary())

                                 GLM Results                                  

Family:              Gamma           No. Observations:         15829
Link Function:       (default)       Df Residuals:             15809
Method:              IRLS + Smooth   Df Model:                    19
Scale:               2.9291          Alpha (λ):               0.0000
L1 Ratio:            0.00            Iterations:                   9
Non-zero coefs:      19             

Log-Likelihood:         -137503.2732 Deviance:                 16895.4774
AIC:                     275046.5464 Null Deviance:            16985.6583
BIC:                     275199.9384 Pearson chi2:               46306.45
Converged:           True           

------------------------------------------------------------------------------
Variable                  Coef    Std.Err        z    P>|z|                 95% CI     
------------------------------------------------------------------------------
Intercept               7.3019

In [6]:
# Run the fitted model's diagnostics, passing the train and test splits
# together with the categorical and continuous factor lists.
diagnostics = result.diagnostics(
    train_data=train_data,
    test_data=test_data,
    categorical_factors=[
        "Region",
        "Area",
        "VehBrand",
        "VehGas",
    ],
    continuous_factors=[
        "VehPower",
        "VehAge",
        "DrivAge",
        "BonusMalus",
        "Density",
    ],
)

In [7]:
# Predict with the fitted model on the full holdout split.
result.predict(holdout_data)

array([1764.60251387, 1570.93540223, 1686.71900596, ..., 1659.02445064,
       1635.14949393, 1728.14959537], shape=(5326,))

In [8]:
# Predict on just the first five holdout rows.
result.predict(holdout_data.head(5))

array([1764.60251387, 1570.93540223, 1686.71900596, 1830.61858252,
       1665.23865902])

In [9]:
# Number of rows in the holdout split.
holdout_data.height

5326