In [1]:
import numpy as np
import polars as pl

import rustystats as rs

# Source parquet; the URL embeds a fixed commit hash of the data repository.
FREQUENCY_SET_URL = "https://raw.githubusercontent.com/PricingFrontier/pricing-data-example/917c853e256df8d5814721ab56f72889a908bb08/data/processed/frequency_set.parquet"
data = pl.read_parquet(FREQUENCY_SET_URL)

# Split rows on the "Group" column (compared against string literals):
# groups up to "3" -> train, "4" -> test, "5" -> holdout.
group_col = pl.col("Group")
train_data = data.filter(group_col <= "3")
test_data = data.filter(group_col == "4")
holdout_data = data.filter(group_col == "5")

In [2]:

# Dict API: a "bs"-type term on DrivAge plus a target-encoded Region.
# NOTE(review): this fit uses the full `data` frame (not `train_data`) and
# passes no offset -- confirm that is intended for this demo.
smooth_demo_terms = {
    'DrivAge': {'type': 'bs'},
    'Region': {'type': 'target_encoding'},
}
smooth_demo_spec = rs.glm_dict(
    response='ClaimCount',
    terms=smooth_demo_terms,
    data=data,
    family='poisson',
)
result = smooth_demo_spec.fit()

# Report smooth-term statistics only when the fitted model has smooth terms.
if result.has_smooth_terms():
    print(f"Total EDF: {result.total_edf:.2f}")
    print(f"GCV: {result.gcv:.4f}")
    # One line per smooth term: variable name, its EDF and its lambda.
    for st in result.smooth_terms:
        print(f"  {st.variable}: EDF={st.edf:.2f}, lambda={st.lambda_:.2f}")

Total EDF: 10.66
GCV: 0.3136
  DrivAge: EDF=8.66, lambda=0.94


In [3]:
# Build the fitted model's summary, then print it as plain text.
smooth_summary_text = result.summary()
print(smooth_summary_text)

                                 GLM Results                                  

Family:              poisson         No. Observations:        678012
Link Function:       (default)       Df Residuals:            678001
Method:              IRLS            Df Model:                    10
Scale:               1.0000          Iterations:                   7

Log-Likelihood:         -106321.1293 Deviance:                212642.2587
AIC:                     212664.2587 Null Deviance:           214041.4441
BIC:                     212789.9548 Pearson chi2:              714203.13
Converged:           True           

------------------------------------------------------------------------------
Variable               Coef    Std.Err        z    P>|z|                 95% CI     
------------------------------------------------------------------------------
Intercept           -3.2014     0.0752  -42.560  <0.0001   [ -3.3488,  -3.0539]  ***
bs(DrivAge, 1/9)    -0.1801     0.1156   -1.558   0.119

In [4]:
# Score the holdout split; the bare trailing expression displays the result.
holdout_scores = result.predict(holdout_data)
holdout_scores

array([0.04395678, 0.05124573, 0.08705769, ..., 0.06700595, 0.05215035,
       0.0609028 ], shape=(135957,))

In [5]:
# Model spec before interaction: main effects only.
terms = {
    "BonusMalus": {"type": "linear"},
    "VehAge": {"type": "ns", "df": 5},
    "DrivAge": {"type": "linear"},
    "Region": {"type": "target_encoding"},
    "VehBrand": {"type": "target_encoding"},
    "Area": {"type": "target_encoding"},
}
# Interaction to add (DrivAge x BonusMalus).
# "include_main" is False because both main effects are already listed in `terms`.
interactions = [
    {
        "DrivAge": {"type": "linear"},
        "BonusMalus": {"type": "linear"},
        "include_main": False,
    }
]
# Fit with interaction.
# Fit on the training split only: this model is later scored on `holdout_data`
# and diagnosed with `train_data` as its training set, so fitting on the full
# `data` frame would leak holdout rows into the fit.
result = rs.glm_dict(
    response="ClaimCount",
    terms=terms,
    interactions=interactions,
    data=train_data,
    family="poisson",
    offset="Exposure",
).fit()


In [None]:
# Build the fitted model's summary, then print it as plain text.
interaction_summary_text = result.summary()
print(interaction_summary_text)

In [None]:
# creates analysis/exploration.json
# Factor lists handed to the exploration, grouped by how they are declared.
exploration_categoricals = ["Region", "Area", "VehBrand", "VehGas"]
exploration_continuous = ["VehPower", "VehAge", "DrivAge", "BonusMalus"]

# Explore the training split only, with ClaimCount as response and Exposure as exposure.
exploration = rs.explore_data(
    data=train_data,
    response="ClaimCount",
    categorical_factors=exploration_categoricals,
    continuous_factors=exploration_continuous,
    exposure="Exposure",
)

In [5]:
# writes analysis/diagnostics
# Pass the dedicated test split (Group == "4") as `test_data`; previously the
# training frame was passed for both arguments, so the "test" side of the
# diagnostics was computed in-sample and `test_data` was never used.
result.diagnostics(
    train_data=train_data,
    test_data=test_data,
    categorical_factors=["Region", "Area", "VehBrand", "VehGas"],
    continuous_factors=["VehPower", "VehAge", "DrivAge", "BonusMalus"],
)



In [6]:
# Score the holdout split with the in-memory fitted model; these predictions
# are later compared against those of the model reloaded from disk.
predictions = result.predict(holdout_data)

In [7]:
# Serialize the fitted model to bytes and persist them to model.bin.
model_bytes = result.to_bytes()
with open("model.bin", mode="wb") as model_file:
    model_file.write(model_bytes)

In [8]:
# Read the serialized bytes back from model.bin ...
with open("model.bin", mode="rb") as model_file:
    stored_model_bytes = model_file.read()

# ... and rebuild a model object from them.
loaded = rs.GLMModel.from_bytes(stored_model_bytes)

In [9]:
# Score the same holdout split with the model reloaded from model.bin.
predictions2 = loaded.predict(holdout_data)

In [10]:
# Round-trip check: the reloaded model must reproduce the original model's
# holdout predictions element for element (numpy is imported in the top cell).
print(np.array_equal(predictions2, predictions))

True


In [None]:
# writes analysis/diagnostics
# Diagnostics for the model reloaded from model.bin. Pass the dedicated test
# split (Group == "4") as `test_data`; previously the training frame was passed
# for both arguments, so the "test" side was computed in-sample.
loaded.diagnostics(
    train_data=train_data,
    test_data=test_data,
    categorical_factors=["Region", "Area", "VehBrand", "VehGas"],
    continuous_factors=["VehPower", "VehAge", "DrivAge", "BonusMalus"],
)

In [None]:
# Term spec: target-encoded Region, linear BonusMalus, and a squared BonusMalus
# supplied through an "expression" term.
penalized_terms = {
    "Region": {"type": "target_encoding"},
    "BonusMalus": {"type": "linear"},
    "BonusMalus2": {"type": "expression", "expr": "BonusMalus ** 2"},
}

# Poisson model on the training split with Exposure passed as the offset.
penalized_spec = rs.glm_dict(
    response="ClaimCount",
    terms=penalized_terms,
    data=train_data,
    family="poisson",
    offset="Exposure",
)

# Fit with the "elastic_net" regularization option.
result2 = penalized_spec.fit(regularization="elastic_net")

In [None]:
# writes analysis/diagnostics
# Diagnostics for the regularized model. Pass the dedicated test split
# (Group == "4") as `test_data`; previously the training frame was passed for
# both arguments, so the "test" side was computed in-sample.
result2.diagnostics(
    train_data=train_data,
    test_data=test_data,
    categorical_factors=["Region", "Area", "VehBrand", "VehGas"],
    continuous_factors=["VehPower", "VehAge", "DrivAge", "BonusMalus"],
)