# Pandera + Ibis vs Kontra: Side-by-Side Comparison

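This notebook demonstrates the key differences between Pandera and Kontra for data validation. Note that the Pandera examples below run through the `pandera.polars` API on an in-memory Polars DataFrame; the Ibis backend itself is not exercised in this notebook.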

## Setup

In [None]:
# Install if needed (uncomment). `%pip` installs into the running kernel's environment.
# %pip install pandera ibis-framework kontra polars

In [None]:
import polars as pl
import pandas as pd
import time

# Pandera
import pandera as pa
import pandera.polars as pap

# Kontra
import kontra
from kontra import rules

# Show which library releases produced the outputs below.
for label, module in (("Pandera", pa), ("Kontra", kontra)):
    print(f"{label} version: {module.__version__}")

## Create Test Data

1M rows with intentional data quality issues

In [None]:
import random

n_rows = 1_000_000

# Seed the stdlib RNG so the randomly drawn "age" and "status" values are the
# same on every Restart & Run All. The injected defects below are placed by
# modulo arithmetic, so their counts do not depend on the seed.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Create data with known issues
df = pl.DataFrame({
    "user_id": list(range(n_rows)),
    "email": [f"user{i}@example.com" if i % 100 != 0 else None for i in range(n_rows)],  # 1% null
    "age": [random.randint(18, 80) if i % 200 != 0 else -5 for i in range(n_rows)],  # 0.5% negative
    "status": [random.choice(["active", "inactive", "pending"]) if i % 500 != 0 else "INVALID" for i in range(n_rows)],  # 0.2% invalid
    "start_date": ["2024-01-01"] * n_rows,
    "end_date": ["2024-12-31" if i % 1000 != 0 else "2023-12-31" for i in range(n_rows)],  # 0.1% bad dates
    "shipping_date": ["2024-02-01" if i % 3 == 0 else None for i in range(n_rows)],
})

# Add the order_status column for the conditional test: a row is "shipped"
# exactly when it has a shipping_date, so at this point the rule holds everywhere.
df = df.with_columns(
    pl.when(pl.col("shipping_date").is_not_null())
    .then(pl.lit("shipped"))
    .otherwise(pl.lit("pending"))
    .alias("order_status")
)

# Introduce failures for the conditional test (shipped but no shipping_date).
# Every 2000th row is forced to "shipped"; those whose user_id is not a
# multiple of 3 have no shipping_date and therefore violate the rule.
df = df.with_columns(
    pl.when(pl.col("user_id") % 2000 == 0)
    .then(pl.lit("shipped"))  # Force shipped status
    .otherwise(pl.col("order_status"))
    .alias("order_status")
)

print(f"Created {len(df):,} rows")
df.head()

---

# Test 1: Basic Validation (not_null, range, allowed_values)

Compare how each tool handles basic validation rules.

## Pandera

In [None]:
# Pandera schema for the basic checks: one column spec per rule being compared.
allowed_statuses = ["active", "inactive", "pending"]

user_id_spec = pap.Column(int, nullable=False, unique=True)
email_spec = pap.Column(str, nullable=False)  # Will fail - we have nulls
age_spec = pap.Column(int, pa.Check.ge(0))  # Will fail - we have negatives
status_spec = pap.Column(str, pa.Check.isin(allowed_statuses))  # Will fail

pandera_schema = pap.DataFrameSchema(
    {
        "user_id": user_id_spec,
        "email": email_spec,
        "age": age_spec,
        "status": status_spec,
    }
)

print("Pandera schema defined", pandera_schema, sep="\n")

In [None]:
# Run Pandera validation and time it. The elapsed time is reported on both
# outcomes so the figure can always be set against the Kontra cell below.
start = time.time()
try:
    pandera_schema.validate(df, lazy=True)
except pa.errors.SchemaErrors as e:
    # Stop the clock before rendering the error text so that formatting cost
    # is not counted as validation time.
    elapsed = time.time() - start
    # Render the message once and reuse it for both the size and the preview.
    error_text = str(e)
    print(f"Pandera: FAILED in {elapsed:.2f}s")
    print(f"\nNumber of schema errors: {len(e.schema_errors)}")
    print(f"\nError message size: {len(error_text):,} characters")
    print(f"\nFirst 500 chars of error:\n{error_text[:500]}...")
else:
    elapsed = time.time() - start
    print(f"Pandera: PASSED in {elapsed:.2f}s")

## Kontra

In [None]:
# Kontra: the same five checks, written as a list of rule objects.
# The rule list is built after the timer starts, so the timed span is unchanged.
start = time.time()
basic_rules = [
    rules.not_null("user_id"),
    rules.unique("user_id"),
    rules.not_null("email"),
    rules.range("age", min=0),
    rules.allowed_values("status", ["active", "inactive", "pending"]),
]
result = kontra.validate(df, rules=basic_rules)
elapsed = time.time() - start

outcome = "PASSED" if result.passed else "FAILED"
print(f"Kontra: {outcome} in {elapsed:.2f}s")
print("\nResult summary:")
print(result)

In [None]:
# Per-rule breakdown of the Kontra result: pass/fail mark, rule, column, failure count.
print("\nDetailed rule results:")
for rule_result in result.rule_results:
    mark = "✗" if not rule_result.passed else "✓"
    rule_label = f"{rule_result.rule_name}({rule_result.column})"
    print(f"  {mark} {rule_label}: {rule_result.fail_count:,} failures")

---

# Test 2: Error Output Comparison

See how each tool reports failures.

In [None]:
# Five-row frame so each tool's complete error output can be read in full.
small_records = {
    "id": [1, 2, 3, 4, 5],
    "email": ["a@b.com", None, "c@d.com", None, "e@f.com"],
    "age": [25, 30, -5, 40, 150],
}
small_df = pl.DataFrame(small_records)
print("Small test data:", small_df, sep="\n")

In [None]:
# Pandera on the small frame: print the raised SchemaErrors as-is.
small_email_spec = pap.Column(str, nullable=False)
small_age_spec = pap.Column(int, pa.Check.in_range(0, 120))
small_schema = pap.DataFrameSchema({"email": small_email_spec, "age": small_age_spec})

print("=== PANDERA ERROR OUTPUT ===")
try:
    small_schema.validate(small_df, lazy=True)
except pa.errors.SchemaErrors as schema_errors:
    print(schema_errors)

In [None]:
# Kontra on the small frame: summary first, then sample rows for each failed rule.
print("=== KONTRA ERROR OUTPUT ===")
small_rules = [
    rules.not_null("email"),
    rules.range("age", min=0, max=120),
]
result = kontra.validate(small_df, rules=small_rules)
print(result)

print("\n--- Sample failures ---")
for rule_result in result.rule_results:
    # Skip rules that passed or that carry no sample rows.
    if rule_result.passed or not rule_result.sample_failures:
        continue
    print(f"\n{rule_result.rule_name}({rule_result.column}):")
    print(rule_result.sample_failures)

---

# Test 3: Conditional Validation

`shipping_date` must not be null when `order_status == 'shipped'`

In [None]:
# Ground truth for the conditional rule: rows marked shipped that lack a shipping_date.
is_shipped = pl.col("order_status") == "shipped"
lacks_shipping_date = pl.col("shipping_date").is_null()
failures = df.filter(is_shipped & lacks_shipping_date)

print(f"Expected failures: {len(failures):,} rows")
print("\nSample:")
print(failures.head())

## Pandera: Custom Check Required

In [None]:
# Pandera requires a custom check for conditional validation.
# The rule reads two columns, so it is registered as a dataframe-level check
# (the `checks=` argument of DataFrameSchema) instead of on a single column.
# Note: `@pa.check` is the decorator for methods of a DataFrameModel class; it
# does not produce a Check object, so the function is wrapped in `pap.Check`.

def check_shipping_date(df: "pap.PolarsData") -> pl.LazyFrame:
    """shipping_date required when order_status is 'shipped'.

    Args:
        df: The container Pandera's Polars backend passes to custom checks;
            ``df.lazyframe`` is the frame under validation.

    Returns:
        A one-column boolean LazyFrame, True for rows that satisfy the rule.
    """
    violates_rule = (pl.col("order_status") == "shipped") & pl.col("shipping_date").is_null()
    return df.lazyframe.select(~violates_rule)


conditional_schema = pap.DataFrameSchema(
    {
        "order_status": pap.Column(str),
        "shipping_date": pap.Column(str, nullable=True),
    },
    checks=[pap.Check(check_shipping_date, name="shipping_date_when_shipped")],
)

print("Pandera conditional schema (requires a custom check function):")
print(conditional_schema)

In [None]:
# Run Pandera conditional validation and time it. The elapsed time is printed
# on both outcomes so it can always be compared with the Kontra run.
start = time.time()
try:
    conditional_schema.validate(df, lazy=True)
except pa.errors.SchemaErrors as e:
    # Stop the clock before rendering the error text.
    elapsed = time.time() - start
    print(f"Pandera: FAILED in {elapsed:.2f}s")
    print(f"Error size: {len(str(e)):,} characters")
else:
    elapsed = time.time() - start
    print(f"Pandera: PASSED in {elapsed:.2f}s")

## Kontra: Built-in Rule

In [None]:
# Kontra expresses the same condition with its conditional_not_null rule.
start = time.time()
conditional_rules = [
    rules.conditional_not_null("shipping_date", when="order_status == 'shipped'"),
]
result = kontra.validate(df, rules=conditional_rules)
elapsed = time.time() - start

outcome = "PASSED" if result.passed else "FAILED"
print(f"Kontra: {outcome} in {elapsed:.2f}s")
print(f"\nFailures: {result.rule_results[0].fail_count:,}")
print("\nSample failures:")
print(result.rule_results[0].sample_failures)

---

# Test 4: Cross-Column Comparison

`end_date >= start_date`

In [None]:
# Ground truth for the cross-column rule: rows whose end_date sorts before start_date.
end_before_start = pl.col("end_date") < pl.col("start_date")
date_failures = df.filter(end_before_start)
print(f"Expected failures: {len(date_failures):,} rows")

## Pandera: Custom Check Required

In [None]:
# Pandera requires a custom dataframe-level check for cross-column rules.
# `@pa.check` is the decorator for methods of a DataFrameModel class and does
# not produce a Check object, so the function is wrapped in `pap.Check` and
# passed through the schema's `checks=` argument.

def check_dates(df: "pap.PolarsData") -> pl.LazyFrame:
    """end_date must not be earlier than start_date.

    Both columns are built as "YYYY-MM-DD" strings in this notebook, so the
    string comparison orders them the same way the dates would.

    Args:
        df: The container Pandera's Polars backend passes to custom checks;
            ``df.lazyframe`` is the frame under validation.

    Returns:
        A one-column boolean LazyFrame, True for rows that satisfy the rule.
    """
    return df.lazyframe.select(pl.col("end_date") >= pl.col("start_date"))


date_schema = pap.DataFrameSchema(
    {
        "start_date": pap.Column(str),
        "end_date": pap.Column(str),
    },
    checks=[pap.Check(check_dates, name="end_after_start")],
)

# The elapsed time is printed on both outcomes so it can always be compared
# with the Kontra run.
start = time.time()
try:
    date_schema.validate(df, lazy=True)
except pa.errors.SchemaErrors as e:
    elapsed = time.time() - start
    print(f"Pandera: FAILED in {elapsed:.2f}s")
    # NOTE(review): this counts entries in e.schema_errors, which is probably
    # not the number of failing rows - confirm before comparing it with
    # Kontra's fail_count.
    print(f"Failures: {len(e.schema_errors)}")
else:
    elapsed = time.time() - start
    print(f"Pandera: PASSED in {elapsed:.2f}s")

## Kontra: Built-in Rule

In [None]:
# Kontra expresses the cross-column rule with its compare rule.
start = time.time()
compare_rules = [
    rules.compare("end_date", "start_date", op=">="),
]
result = kontra.validate(df, rules=compare_rules)
elapsed = time.time() - start

outcome = "PASSED" if result.passed else "FAILED"
print(f"Kontra: {outcome} in {elapsed:.2f}s")
print(f"Failures: {result.rule_results[0].fail_count:,}")

---

# Test 5: Severity Levels

Kontra supports `blocking`, `warning`, `info` severity levels.

In [None]:
# Pandera side of the severity comparison: nothing to run, only the notebook's summary.
pandera_severity_notes = (
    "=== PANDERA ===",
    "Pandera has binary pass/fail. No severity levels.",
    "All failures are treated equally.",
)
print(*pandera_severity_notes, sep="\n")

In [None]:
# Kontra side of the severity comparison: one rule per severity level.
print("=== KONTRA ===")
severity_rules = [
    rules.not_null("user_id", severity="blocking"),      # Critical - fails validation
    rules.not_null("email", severity="warning"),          # Warning - doesn't fail
    rules.range("age", min=0, max=120, severity="info"),  # Info - just informational
]
result = kontra.validate(df, rules=severity_rules)

print(f"Overall passed: {result.passed}")
print("\nBy severity:")
for rule_result in result.rule_results:
    icon = "✗" if not rule_result.passed else "✓"
    rule_label = f"{rule_result.rule_name}({rule_result.column})"
    print(f"  {icon} [{rule_result.severity}] {rule_label}: {rule_result.fail_count:,} failures")

---

# Test 6: Profiling / Data Discovery

Kontra can profile data to understand it before writing rules.

In [None]:
# Pandera side of the profiling comparison: nothing to run, only the notebook's summary.
pandera_profiling_notes = (
    "=== PANDERA ===",
    "Pandera does not have built-in profiling.",
    "You must know your data schema upfront.",
)
print(*pandera_profiling_notes, sep="\n")

In [None]:
# Kontra side of the profiling comparison: profile the five-row frame and show the result.
kontra_banner = "=== KONTRA ==="
print(kontra_banner)

profile = kontra.profile(small_df)
print(profile)

In [None]:
# Ask Kontra to draft rule suggestions from the five-row frame and show them.
suggestions_banner = "\n=== SUGGESTED RULES ==="
print(suggestions_banner)

suggestions = kontra.draft(small_df)
print(suggestions)

---

# Test 7: LLM-Friendly Output

Kontra can format results for AI/LLM consumption.

In [None]:
# Pandera side of the LLM-output comparison: nothing to run, only the notebook's summary.
pandera_llm_notes = (
    "=== PANDERA ===",
    "Pandera has no LLM-specific output format.",
    "Error messages can be very large (millions of characters).",
)
print(*pandera_llm_notes, sep="\n")

In [None]:
# Kontra side of the LLM-output comparison: validate the small frame, then print result.to_llm().
print("=== KONTRA ===")
llm_demo_rules = [
    rules.not_null("email"),
    rules.range("age", min=0, max=120),
]
result = kontra.validate(small_df, rules=llm_demo_rules)

print("result.to_llm() output:")
llm_text = result.to_llm()
print(llm_text)

---

# Test 8: Context Metadata

Kontra supports metadata like `owner`, `tags`, `fix_hint`.

In [None]:
# Pandera side of the metadata comparison: nothing to run, only the notebook's summary.
pandera_metadata_notes = (
    "=== PANDERA ===",
    "Pandera does not support rule-level metadata.",
    "No owner, tags, or fix hints.",
)
print(*pandera_metadata_notes, sep="\n")

In [None]:
# Kontra side of the metadata comparison.
# The contract is only held as text and printed; nothing in this cell parses or runs it.
print("=== KONTRA ===")

# Example YAML contract (simulated) showing a rule with a `context` block.
contract_yaml = """
name: users_quality
rules:
  - name: not_null
    params: { column: email }
    severity: blocking
    context:
      owner: data-team@company.com
      tier: 1
      fix_hint: "Check ETL pipeline for null injection"
      tags: [pii, critical]
"""
print("Example contract with metadata:", contract_yaml, sep="\n")

---

# Summary Table

| Feature | Pandera + Ibis | Kontra |
|---------|----------------|--------|
| **Profiling / Draft** | ✗ | ✓ |
| **CLI** | ✗ | ✓ |
| **State / Diff** | ✗ | ✓ |
| **Conditional checks** | Custom code | Built-in |
| **Cross-column compare** | Custom code | Built-in |
| **Severity levels** | Binary only | blocking/warning/info |
| **Error output** | ALL failures | Counts + samples |
| **LLM output** | ✗ | .to_llm() |
| **Context metadata** | ✗ | owner, tags, fix_hint |
| **Backends** | 20+ via Ibis | Postgres, SQL Server, DuckDB |
| **Type strictness** | Schema-focused | Rule-focused |

In [None]:
# Closing marker printed at the end of the notebook.
closing_message = "Done! Run each cell to see the differences."
print(closing_message)