In [5]:
import polars as pl

import rustystats as rs

# Frequency dataset fetched over HTTP. The URL embeds a commit hash, presumably to pin
# the file revision so that re-runs read the same data.
FREQUENCY_SET_URL = "https://raw.githubusercontent.com/PricingFrontier/pricing-data-example/917c853e256df8d5814721ab56f72889a908bb08/data/processed/frequency_set.parquet"
data = pl.read_parquet(FREQUENCY_SET_URL)

# Partition rows by their "Group" label into train / test / holdout frames.
# The bounds are string literals, so `<=` is a string comparison rather than a numeric
# one -- presumably the labels are single digits; TODO confirm against the dataset.
group_label = pl.col("Group")
train_data = data.filter(group_label <= "3")
test_data = data.filter(group_label == "4")
holdout_data = data.filter(group_label == "5")

In [6]:
# Creates analysis/exploration.json.
# Candidate rating factors, grouped by how they are handed to the exploration call.
categorical_columns = ["Region", "Area", "VehBrand", "VehGas"]
continuous_columns = ["VehPower", "VehAge", "DrivAge", "BonusMalus"]

# Explore the training split only; the claim count is the response and "Exposure"
# is supplied as the exposure column.
exploration = rs.explore_data(
    data=train_data,
    response="ClaimCount",
    exposure="Exposure",
    categorical_factors=categorical_columns,
    continuous_factors=continuous_columns,
)

In [7]:
# Model formula for the claim-count response: terms are joined with `+`, and the final
# term carries no trailing separator.
# Term wrappers are rustystats formula syntax -- presumably TE(...) is target encoding,
# ms(...) a monotonic spline and pos(...) a positivity constraint; TODO confirm in the
# rustystats documentation.
formula = """
    ClaimCount ~
    TE(Region) +
    BonusMalus +
    I(BonusMalus ** 2) +
    ms(VehAge, df=4) +
    pos(DrivAge) +
    pos(I(DrivAge ** 2)) +
    TE(VehBrand) +
    TE(Area)
"""

# Build a Poisson GLM on the training split, passing the "Exposure" column as the offset.
# NOTE(review): the output saved with this notebook reports "Converged: False" after 25
# iterations and a deviance larger than the null deviance -- investigate before relying
# on these coefficients.
# NOTE(review): confirm how rustystats transforms a named offset column (raw vs. log)
# for this family.
model = rs.glm(
    formula,
    train_data,
    family="poisson",
    offset="Exposure",
)
result = model.fit()

# summary() is printed (not left as a bare expression) so the table renders as plain text.
print(result.summary())

                                 GLM Results                                  

Family:              Poisson         No. Observations:        406439
Link Function:       (default)       Df Residuals:            406427
Method:              IRLS            Df Model:                    11
Scale:               1.0000          Iterations:                  25

Log-Likelihood:         -118837.9676 Deviance:                196128.2855
AIC:                     237699.9352 Null Deviance:           132328.3090
BIC:                     237830.9175 Pearson chi2:              390442.38
Converged:           False          

------------------------------------------------------------------------------
Variable                   Coef    Std.Err        z    P>|z|                 95% CI     
------------------------------------------------------------------------------
Intercept               -4.2916     0.0746  -57.526  <0.0001   [ -4.4378,  -4.1454]  ***
BonusMalus               0.0320     0.0013   25

In [8]:
# Writes analysis/diagnostics.
# The test side uses the separate test split (Group "4") rather than the training rows,
# so the test diagnostics are not simply a repeat of the training fit.
result.diagnostics(
    train_data=train_data,
    test_data=test_data,
    categorical_factors=["Region", "Area", "VehBrand", "VehGas"],
    continuous_factors=["VehPower", "VehAge", "DrivAge", "BonusMalus"],
)



In [9]:
# Score the training rows with the fitted model; the bare expression is what the cell
# displays (one predicted value per training row in the saved output).
result.predict(
    train_data,
)

array([0.04535699, 0.47292428, 0.01271514, ..., 0.01360539, 0.21669988,
       0.18785975], shape=(406439,))