In [None]:
import polars as pl

import rustystats as rs

# Source parquet, pinned to a specific commit of the pricing-data-example repo
# so that re-running the notebook always reads the same file.
DATA_URL = (
    "https://raw.githubusercontent.com/PricingFrontier/pricing-data-example/"
    "917c853e256df8d5814721ab56f72889a908bb08/data/processed/frequency_set.parquet"
)
data = pl.read_parquet(DATA_URL)

In [None]:
# Pre-modelling exploration of ClaimCount against two categorical factors
# (Region, Area) and two continuous factors (VehPower, VehAge), with the
# Exposure column supplied as the exposure measure.
exploration = rs.explore_data(
    data=data,
    response="ClaimCount",
    categorical_factors=["Region", "Area"],
    continuous_factors=["VehPower", "VehAge"],
    exposure="Exposure",
)

In [None]:
# Show the exploration results serialised via to_json() as the cell output.
exploration.to_json()

In [None]:
# Baseline Poisson GLM: ClaimCount on VehPower and VehAge plus Area and Region
# wrapped in C(...), with the Exposure column passed as the offset.
# NOTE(review): confirm whether rustystats log-transforms the offset column
# for this family or expects it to be pre-logged.
model = rs.glm(
    formula="ClaimCount ~ VehPower + VehAge + C(Area) + C(Region)",
    data=data,
    family="poisson",
    offset="Exposure"
).fit()

In [None]:
# Print the summary of the model fitted in the previous cell.
print(model.summary())

In [None]:
# Score the training data with the fitted model, passing the Exposure column
# as the offset. The notebook treats these values as expected claim counts
# (they are summed against ClaimCount and divided by Exposure to get rates in
# plot_ae_chart), so the printed label says "count" rather than "rate".
predictions = model.predict(data, offset="Exposure")
print(f"Predictions shape: {predictions.shape}")
print(f"Mean predicted claim count: {predictions.mean():.6f}")
print(f"Min: {predictions.min():.6f}, Max: {predictions.max():.6f}")

In [None]:
# Display the predictions object as the cell output.
predictions

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Build the plotting frame: the original columns plus the model output as
# "Predicted" and a copy of ClaimCount under the name "Actual".
plot_data = data.with_columns(
    Predicted=pl.lit(predictions),
    Actual=pl.col("ClaimCount"),
)

def _bin_lower_bound(label) -> float:
    """Return the numeric lower edge parsed from an interval label such as "(0, 2]".

    Missing (None) or unparseable labels map to +inf so that they sort after
    every real bin.
    """
    if label is None:
        return float("inf")
    try:
        return float(str(label).split(",")[0].strip("(["))
    except ValueError:
        # Only float() can fail here; anything else should surface normally.
        return float("inf")


def plot_ae_chart(
    data: pl.DataFrame,
    factor: str,
    bins: list = None,
    use_rate: bool = False,
    figsize: tuple = (12, 4),
    title: str = None,
) -> pl.DataFrame:
    """
    Plot Actual vs Expected chart with exposure bars and A/E ratio.

    The data is grouped by ``factor`` (binned first when ``bins`` is given) and
    the 'Actual', 'Predicted' and 'Exposure' columns are summed per group. Two
    panels are drawn: exposure bars overlaid with actual/expected lines, and a
    bar chart of the A/E ratio with a reference line at 1.0.

    Parameters
    ----------
    data : pl.DataFrame
        Data with 'Actual', 'Predicted', and 'Exposure' columns.
    factor : str
        Column name to group by.
    bins : list, optional
        For continuous factors, bin edges. If None, treats factor as categorical.
    use_rate : bool
        If True, plot rates (per exposure) instead of counts.
    figsize : tuple
        Figure size.
    title : str, optional
        Custom title. Defaults to factor name.

    Returns
    -------
    pl.DataFrame
        One row per group with the group key (``factor``, or ``f"{factor}_bin"``
        when binned) and the columns 'Actual', 'Expected', 'Exposure', 'A/E',
        'Actual_Rate' and 'Expected_Rate'.
    """
    # Aggregations shared by the binned and the categorical path.
    totals = [
        pl.col("Actual").sum().alias("Actual"),
        pl.col("Predicted").sum().alias("Expected"),
        pl.col("Exposure").sum().alias("Exposure"),
    ]
    # Decide "binned" once, by identity, so tick styling agrees with the
    # grouping branch even for an empty list of edges.
    binned = bins is not None

    if binned:
        group_col = f"{factor}_bin"
        grouped = (
            data.with_columns(pl.col(factor).cut(bins).alias(group_col))
            .group_by(group_col)
            .agg(totals)
        )
        # Order bins by their numeric lower edge rather than alphabetically
        # by label.
        bin_labels = grouped[group_col].to_list()
        sort_order = sorted(
            range(len(bin_labels)),
            key=lambda i: _bin_lower_bound(bin_labels[i]),
        )
        grouped = grouped[sort_order]
    else:
        group_col = factor
        grouped = data.group_by(factor).agg(totals).sort(factor)

    labels = [str(v) for v in grouped[group_col].to_list()]

    # Calculate A/E and rates
    grouped = grouped.with_columns([
        (pl.col("Actual") / pl.col("Expected")).alias("A/E"),
        (pl.col("Actual") / pl.col("Exposure")).alias("Actual_Rate"),
        (pl.col("Expected") / pl.col("Exposure")).alias("Expected_Rate"),
    ])

    axis_label = title or factor
    tick_rotation = 45 if binned else 0
    tick_align = "right" if binned else "center"

    fig, axes = plt.subplots(1, 2, figsize=figsize)
    x = np.arange(len(labels))

    # Left: Exposure bar with Actual/Expected lines on a secondary y-axis
    ax1 = axes[0]
    ax2 = ax1.twinx()

    ax1.bar(x, grouped["Exposure"].to_numpy(), alpha=0.3, color="gray", label="Exposure")
    ax1.set_ylabel("Exposure", color="gray")
    ax1.tick_params(axis="y", labelcolor="gray")
    ax1.set_ylim(bottom=0)

    if use_rate:
        y_actual = grouped["Actual_Rate"].to_numpy()
        y_expected = grouped["Expected_Rate"].to_numpy()
        y_label = "Claim Rate"
    else:
        y_actual = grouped["Actual"].to_numpy()
        y_expected = grouped["Expected"].to_numpy()
        y_label = "Claim Count"

    ax2.plot(x, y_actual, "o-", color="steelblue", linewidth=2, markersize=8, label="Actual")
    ax2.plot(x, y_expected, "s--", color="coral", linewidth=2, markersize=8, label="Expected")
    ax2.set_ylabel(y_label, color="black")
    ax2.set_ylim(bottom=0)

    ax1.set_xticks(x)
    ax1.set_xticklabels(labels, rotation=tick_rotation, ha=tick_align)
    ax1.set_xlabel(axis_label)
    ax1.set_title(f"Actual vs Expected by {axis_label}")
    # Merge handles from both y-axes so the labelled Exposure bars appear in
    # the legend alongside the Actual/Expected lines.
    bar_handles, bar_labels = ax1.get_legend_handles_labels()
    line_handles, line_labels = ax2.get_legend_handles_labels()
    ax2.legend(bar_handles + line_handles, bar_labels + line_labels, loc="upper left")

    # Right: A/E ratio, coloured by distance from 1.0
    # (within 5% -> green, within 10% -> orange, otherwise teal).
    ae_vals = grouped["A/E"].to_numpy()
    colors = [
        "green" if 0.95 <= v <= 1.05 else "orange" if 0.9 <= v <= 1.1 else "teal"
        for v in ae_vals
    ]
    # Plot on numeric positions so bars and ticks share one coordinate system.
    axes[1].bar(x, ae_vals, color=colors)
    axes[1].axhline(y=1.0, color="red", linestyle="--", label="Perfect calibration")
    axes[1].set_xticks(x)
    axes[1].set_xticklabels(labels, rotation=tick_rotation, ha=tick_align)
    axes[1].set_xlabel(axis_label)
    axes[1].set_ylabel("A/E Ratio")
    axes[1].set_title(f"A/E Ratio by {axis_label}")
    axes[1].legend()

    plt.tight_layout()
    plt.show()

    return grouped

In [None]:
# Actual vs Expected by Area, plotted as per-exposure rates (use_rate=True)
plot_ae_chart(plot_data, "Area", use_rate=True)

In [None]:
# Actual vs Expected by Region, plotted as per-exposure rates (use_rate=True)
plot_ae_chart(plot_data, "Region", use_rate=True)

In [None]:
# Actual vs Expected by VehAge, binned at the given edges and shown as rates
plot_ae_chart(plot_data, "VehAge", bins=[0, 2, 4, 6, 8, 10, 15, 20], use_rate=True, title="Vehicle Age")

In [None]:
# Actual vs Expected by VehPower, binned at the given edges and shown as rates
plot_ae_chart(plot_data, "VehPower", bins=[4, 5, 6, 7, 8, 9, 10, 12, 15], use_rate=True, title="Vehicle Power")

In [None]:
# Build the diagnostics object for the currently bound `model`, using the
# same categorical/continuous factor lists as the exploration step.
diag = model.diagnostics(
    data=data,
    categorical_factors=["Region", "Area"],
    continuous_factors=["VehPower", "VehAge"]
)

# Export as JSON for LLM consumption.
# NOTE(review): json_output is not referenced again in the visible cells.
json_output = diag.to_json()

In [None]:
# Refit the same formula with the "quasipoisson" family.
# NOTE(review): the next cell rebinds `model` before this fit is summarised
# or used, so this result is discarded when the notebook is run top to bottom.
model = rs.glm(
    formula="ClaimCount ~ VehPower + VehAge + C(Area) + C(Region)",
    data=data,
    family="quasipoisson",
    offset="Exposure",
).fit()

In [None]:
# Refit the same formula with the "negbinomial" family; this rebinds `model`.
model = rs.glm(
    formula="ClaimCount ~ VehPower + VehAge + C(Area) + C(Region)",
    data=data,
    family="negbinomial",
    offset="Exposure"
).fit()

In [None]:
# Print the summary of whichever model is currently bound to `model`.
print(model.summary())

In [None]:
# List the dataset's column names.
data.columns

In [None]:
# Richer Poisson model: ns(..., df=4) terms for the continuous columns and
# TE(...) terms for the categorical columns.
# NOTE(review): ns is presumably a natural spline basis and TE a target
# encoding -- confirm against the rustystats formula documentation.
formula = """
    ClaimCount ~
    ns(VehPower, df=4) +
    ns(VehAge, df=4) +
    ns(DrivAge, df=4) +
    ns(BonusMalus, df=4) +
    TE(VehBrand) +
    TE(VehGas) +
    TE(Area) +
    ns(Density, df=4) +
    TE(Region)
"""

# Rebinds `model` to the fit of the formula above, with Exposure as offset.
model = rs.glm(
    formula=formula,
    data=data,
    family="poisson",
    offset="Exposure"
).fit()

In [None]:
# Print the summary of the model fitted in the previous cell.
print(model.summary())

In [None]:
# Score the training data with the currently bound `model`, passing the
# Exposure column as the offset. The notebook treats these values as expected
# claim counts (they are summed against ClaimCount and divided by Exposure in
# plot_ae_chart), so the printed label says "count" rather than "rate".
predictions = model.predict(data, offset="Exposure")
print(f"Predictions shape: {predictions.shape}")
print(f"Mean predicted claim count: {predictions.mean():.6f}")
print(f"Min: {predictions.min():.6f}, Max: {predictions.max():.6f}")

# Rebuild the plotting frame so the A/E charts reflect this model.
plot_data = data.with_columns([
    pl.lit(predictions).alias("Predicted"),
    pl.col("ClaimCount").alias("Actual")
])

In [None]:
# Actual vs Expected by Area, plotted as per-exposure rates (use_rate=True)
plot_ae_chart(plot_data, "Area", use_rate=True)

In [None]:
# Actual vs Expected by Region, plotted as per-exposure rates (use_rate=True)
plot_ae_chart(plot_data, "Region", use_rate=True)

In [None]:
# Actual vs Expected by VehAge, binned at the given edges and shown as rates
plot_ae_chart(plot_data, "VehAge", bins=[0, 2, 4, 6, 8, 10, 15, 20], use_rate=True, title="Vehicle Age")

In [None]:
# Actual vs Expected by VehPower, binned at the given edges and shown as rates
plot_ae_chart(plot_data, "VehPower", bins=[4, 5, 6, 7, 8, 9, 10, 12, 15], use_rate=True, title="Vehicle Power")

In [None]:
# Continuous × Continuous interaction (main effects + interaction)
result = rs.glm(
    "ClaimCount ~ VehAge*VehPower",  # Equivalent to VehAge + VehPower + VehAge:VehPower
    data, family="poisson", offset="Exposure"
).fit()

print(result.summary())

In [None]:
# Categorical × Continuous interaction (main effects + interaction)
result = rs.glm(
    "ClaimCount ~ C(Area)*VehAge",  # Each Area level gets its own VehAge effect
    data, family="poisson", offset="Exposure"
).fit()

print(result.summary())

In [None]:
# Categorical × Categorical interaction (Area crossed with VehBrand via `*`)
result = rs.glm(
    "ClaimCount ~ C(Area)*C(VehBrand)",
    data, family="poisson", offset="Exposure"
).fit()

print(result.summary())

In [None]:
# Interaction-only term via `:` -- unlike `*`, no separate Area or VehPower
# main effects are added; VehAge is still included as a main effect.
result = rs.glm(
    "ClaimCount ~ VehAge + C(Area):VehPower",  # Area-specific VehPower slopes
    data, family="poisson", offset="Exposure"
).fit()

In [None]:
# Print the summary of the model fitted in the previous cell.
print(result.summary())

In [None]:
# Combined model: a bs(VehAge, df=5) term, a Region main effect, Area-specific
# VehPower slopes, and the full Area x VehBrand interaction.
# The Exposure offset is passed here as in every other count model in this
# notebook; without it the fit would ignore differing exposure periods.
result = rs.glm(
    "ClaimCount ~ bs(VehAge, df=5) + C(Region) + C(Area):VehPower + C(Area)*C(VehBrand)",
    data,
    family="poisson",
    offset="Exposure",
).fit()

# Score with THIS model before building the plotting frame. Reusing the
# `predictions` left over from an earlier cell would chart a different model.
predictions = result.predict(data, offset="Exposure")

plot_data = data.with_columns([
    pl.lit(predictions).alias("Predicted"),
    pl.col("ClaimCount").alias("Actual")
])

In [None]:
# Print the summary of the model fitted in the previous cell.
print(result.summary())