In [1]:
import polars as pl

import rustystats as rs

# Frequency dataset, read from a raw GitHub URL that includes a commit hash.
DATA_URL = (
    "https://raw.githubusercontent.com/PricingFrontier/pricing-data-example/"
    "917c853e256df8d5814721ab56f72889a908bb08/data/processed/frequency_set.parquet"
)
data = pl.read_parquet(DATA_URL)

# Split the rows on the "Group" column, which is compared against string literals.
# NOTE(review): `<= "3"` is a lexicographic string comparison — confirm that the
# group labels are single characters so the ordering matches the intended split.
group = pl.col("Group")
train_data = data.filter(group <= "3")
test_data = data.filter(group == "4")
holdout_data = data.filter(group == "5")

In [2]:
# Explore the training split against the ClaimCount response, with Exposure
# passed as the exposure column.
# Per the original note, this creates analysis/exploration.json.
categorical_factors = ["Region", "Area", "VehBrand", "VehGas"]
continuous_factors = ["VehPower", "VehAge", "DrivAge", "BonusMalus"]

exploration = rs.explore_data(
    data=train_data,
    response="ClaimCount",
    categorical_factors=categorical_factors,
    continuous_factors=continuous_factors,
    exposure="Exposure",
)

In [3]:
# Single-term model: ClaimCount against an ms(...) term on VehAge with df=4 and
# increasing=True.
formula = """
    ClaimCount ~ 
    ms(VehAge, df=4, increasing=True)
"""

# Build the Poisson model on the training split, then fit it as a separate step.
# NOTE(review): the recorded summary for this cell reports "Converged: False"
# after 25 iterations — confirm the fit before relying on it downstream.
poisson_model = rs.glm(formula, train_data, family="poisson", offset="Exposure")
result = poisson_model.fit()

# Print the summary so it is rendered as plain text.
summary_text = result.summary()
print(summary_text)

                                 GLM Results                                  

Family:              Poisson         No. Observations:        406439
Link Function:       (default)       Df Residuals:            406434
Method:              IRLS            Df Model:                     4
Scale:               1.0000          Iterations:                  25

Log-Likelihood:         -117115.3232 Deviance:                192682.9966
AIC:                     234240.6463 Null Deviance:           132328.3090
BIC:                     234295.2223 Pearson chi2:              385012.02
Converged:           False          

------------------------------------------------------------------------------
Variable                 Coef    Std.Err        z    P>|z|                 95% CI     
------------------------------------------------------------------------------
Intercept             -1.1961     0.0181  -65.909  <0.0001   [ -1.2317,  -1.1605]  ***
ms(VehAge, 1/4, +)     0.0000     0.0202    0.000  

In [None]:
# Writes analysis/diagnostics (per the original note) for the first model.
# The test split (Group == "4") is passed as test_data instead of re-passing the
# training frame, so the train and test diagnostics are computed on different rows.
result.diagnostics(
    train_data=train_data,
    test_data=test_data,
    categorical_factors=["Region", "Area", "VehBrand", "VehGas"],
    continuous_factors=["VehPower", "VehAge", "DrivAge", "BonusMalus"],
)

In [None]:
# Predict with the first fitted model on the training split and display the result.
# The recorded output is an array of shape (406439,), matching the observation
# count shown in the model summary.
result.predict(train_data)

array([0.03094064, 0.34222329, 0.0200687 , ..., 0.01368893, 0.3853874 ,
       0.35045853], shape=(406439,))

In [None]:
# Term specification for the second model, keyed by term name. Each value is the
# option dict handed to rs.glm_dict for that term.
model_terms = {
    # Categorical factors, all using the "target_encoding" term type.
    "Region": {"type": "target_encoding"},
    # BonusMalus enters linearly and through a squared expression.
    "BonusMalus": {"type": "linear"},
    "BonusMalus2": {"type": "expression", "expr": "BonusMalus ** 2"},
    # VehAge uses the "ms" term type with an increasing monotonicity option.
    "VehAge": {"type": "ms", "df": 4, "monotonicity": "increasing"},
    # DrivAge enters linearly and squared, both flagged as increasing.
    "DrivAge": {"type": "linear", "monotonicity": "increasing"},
    "DrivAge2": {"type": "expression", "expr": "DrivAge ** 2", "monotonicity": "increasing"},
    "VehBrand": {"type": "target_encoding"},
    "Area": {"type": "target_encoding"},
}

# Build the Poisson model on the training split, then fit it as a separate step.
multi_term_model = rs.glm_dict(
    response="ClaimCount",
    terms=model_terms,
    data=train_data,
    family="poisson",
    offset="Exposure",
)
result2 = multi_term_model.fit()

In [None]:
# Writes analysis/diagnostics (per the original note) for the second model.
# The test split (Group == "4") is passed as test_data instead of re-passing the
# training frame, so the train and test diagnostics are computed on different rows.
result2.diagnostics(
    train_data=train_data,
    test_data=test_data,
    categorical_factors=["Region", "Area", "VehBrand", "VehGas"],
    continuous_factors=["VehPower", "VehAge", "DrivAge", "BonusMalus"],
)

