In [1]:
import numpy as np
import polars as pl

import rustystats as rs

# Source parquet for the frequency data set; the URL embeds a commit hash.
DATA_URL = (
    "https://raw.githubusercontent.com/PricingFrontier/pricing-data-example/"
    "917c853e256df8d5814721ab56f72889a908bb08/data/processed/frequency_set.parquet"
)
data = pl.read_parquet(DATA_URL)

# Split rows on the "Group" column: up to "3" -> train, "4" -> test, "5" -> holdout.
# NOTE(review): the comparison values are strings, so `<=` is presumably
# lexicographic — confirm the dtype of "Group".
group_col = pl.col("Group")
train_data = data.filter(group_col <= "3")
test_data = data.filter(group_col == "4")
holdout_data = data.filter(group_col == "5")

In [2]:

# Dict API: a `bs` term on DrivAge plus a target-encoded Region, Poisson family.
# NOTE(review): this fits on the full `data` frame rather than `train_data` —
# confirm that is intended.
spline_model = rs.glm_dict(
    response="ClaimCount",
    terms={
        "DrivAge": {"type": "bs", "k": 5},
        "Region": {"type": "target_encoding"},
    },
    data=data,
    family="poisson",
)
result = spline_model.fit()

# Report the smooth-term statistics only when the fit actually has smooth terms.
if result.has_smooth_terms():
    print(f"Total EDF: {result.total_edf:.2f}")
    print(f"GCV: {result.gcv:.4f}")
    for st in result.smooth_terms:
        print(f"  {st.variable}: EDF={st.edf:.2f}, lambda={st.lambda_:.2f}")

Total EDF: 5.99
GCV: 0.3138
  DrivAge: EDF=3.99, lambda=0.10


In [3]:
# Print the text summary of the model fitted in the previous cell.
print(result.summary())

                                 GLM Results                                  

Family:              poisson         No. Observations:        678012
Link Function:       (default)       Df Residuals:            678006
Method:              IRLS            Df Model:                     5
Scale:               1.0000          Iterations:                   6

Log-Likelihood:         -106380.1230 Deviance:                212760.2461
AIC:                     212772.2461 Null Deviance:           214041.4441
BIC:                     212840.8076 Pearson chi2:              712756.10
Converged:           True           

------------------------------------------------------------------------------
Variable               Coef    Std.Err        z    P>|z|                 95% CI     
------------------------------------------------------------------------------
Intercept           -2.9916     0.0518  -57.704  <0.0001   [ -3.0932,  -2.8900]  ***
bs(DrivAge, 1/4)    -1.4231     0.0556  -25.585  <0.000

In [4]:
# Predict on the holdout split; as the cell's last expression, the result is displayed.
result.predict(holdout_data)

array([0.04634741, 0.05223384, 0.08438057, ..., 0.06279015, 0.05192852,
       0.0580746 ], shape=(135957,))

In [5]:
# Main-effect specification (the model before the interaction is added).
terms = {
    "BonusMalus": {"type": "linear"},
    "VehAge": {"type": "ns", "df": 5},
    "DrivAge": {"type": "linear"},
    # Each categorical factor gets its own target-encoding spec dict.
    **{factor: {"type": "target_encoding"} for factor in ("Region", "VehBrand", "Area")},
}

# Interaction to add: DrivAge x BonusMalus, with "include_main" set to False.
drivage_x_bonusmalus = {
    "DrivAge": {"type": "linear"},
    "BonusMalus": {"type": "linear"},
    "include_main": False,
}
interactions = [drivage_x_bonusmalus]

# Fit the Poisson model with the interaction, using "Exposure" as the offset.
# NOTE(review): this fits on the full `data` frame, which also contains the
# rows later used as test/holdout — confirm that is intended.
interaction_model = rs.glm_dict(
    response="ClaimCount",
    terms=terms,
    interactions=interactions,
    data=data,
    family="poisson",
    offset="Exposure",
)
result = interaction_model.fit()


In [6]:
# Print the text summary of the interaction model fitted in the previous cell.
print(result.summary())

                                 GLM Results                                  

Family:              Poisson         No. Observations:        678012
Link Function:       (default)       Df Residuals:            678001
Method:              IRLS            Df Model:                    10
Scale:               1.0000          Iterations:                   6

Log-Likelihood:         -139947.7402 Deviance:                210895.3342
AIC:                     279917.4804 Null Deviance:           220244.0296
BIC:                     280043.1765 Pearson chi2:             1534257.93
Converged:           True           

------------------------------------------------------------------------------
Variable                 Coef    Std.Err        z    P>|z|                 95% CI     
------------------------------------------------------------------------------
Intercept             -3.5055     0.0698  -50.228  <0.0001   [ -3.6423,  -3.3687]  ***
BonusMalus             0.0236     0.0008   29.308  

In [7]:
# Explore the training split (per the original note, this creates
# analysis/exploration.json).
categorical_factors = ["Region", "Area", "VehBrand", "VehGas"]
continuous_factors = ["VehPower", "VehAge", "DrivAge", "BonusMalus"]

exploration = rs.explore_data(
    data=train_data,
    response="ClaimCount",
    categorical_factors=categorical_factors,
    continuous_factors=continuous_factors,
    exposure="Exposure",
)

In [8]:
# Write model diagnostics (per the original note, under analysis/diagnostics).
# The test split is passed as `test_data`: handing the training frame to both
# arguments would make any train-vs-test comparison compare the data with itself.
result.diagnostics(
    train_data=train_data,
    test_data=test_data,
    categorical_factors=["Region", "Area", "VehBrand", "VehGas"],
    continuous_factors=["VehPower", "VehAge", "DrivAge", "BonusMalus"],
)



In [9]:
# Holdout predictions from the in-memory model, kept for the round-trip comparison below.
predictions = result.predict(holdout_data)

In [10]:
# Serialise the fitted model to bytes and write them to model.bin.
model_bytes = result.to_bytes()
with open("model.bin", mode="wb") as model_file:
    model_file.write(model_bytes)

In [11]:
# Read the saved bytes back and rebuild the results object from them.
with open("model.bin", mode="rb") as model_file:
    serialized_model = model_file.read()
loaded = rs.FormulaGLMResults.from_bytes(serialized_model)

In [12]:
# Holdout predictions from the model reloaded from model.bin.
predictions2 = loaded.predict(holdout_data)

In [17]:
# Round-trip check: prints True when the reloaded model's holdout predictions
# are element-wise identical to the original model's. numpy is imported in the
# notebook's top import cell.
print(np.array_equal(predictions2, predictions))

True


In [None]:
# Second model, fitted on the training split only: target-encoded Region, a
# linear BonusMalus term and an expression term for BonusMalus squared.
regularized_terms = {
    "Region": {"type": "target_encoding"},
    "BonusMalus": {"type": "linear"},
    "BonusMalus2": {"type": "expression", "expr": "BonusMalus ** 2"},
}
regularized_model = rs.glm_dict(
    response="ClaimCount",
    terms=regularized_terms,
    data=train_data,
    family="poisson",
    offset="Exposure",
)
# Fit with the "elastic_net" regularization option.
result2 = regularized_model.fit(regularization="elastic_net")

In [7]:
# Write diagnostics for the regularised model (per the original note, under
# analysis/diagnostics). The test split is passed as `test_data`: handing the
# training frame to both arguments would make any train-vs-test comparison
# compare the data with itself.
result2.diagnostics(
    train_data=train_data,
    test_data=test_data,
    categorical_factors=["Region", "Area", "VehBrand", "VehGas"],
    continuous_factors=["VehPower", "VehAge", "DrivAge", "BonusMalus"],
)

