# 🦆 DuckGuard — Getting Started

Data quality validation in 3 lines of Python. Works with CSV, Parquet, S3, Snowflake, Databricks, BigQuery, and 15+ sources.

[![GitHub](https://img.shields.io/github/stars/XDataHubAI/duckguard?style=social)](https://github.com/XDataHubAI/duckguard)
[![PyPI](https://img.shields.io/pypi/v/duckguard.svg)](https://pypi.org/project/duckguard/)
[![Docs](https://img.shields.io/badge/docs-GitHub%20Pages-blue)](https://xdatahubai.github.io/duckguard/)

In [None]:
!pip install -q duckguard

In [None]:
# Create sample data with intentional quality issues: a missing customer_id,
# email and phone, a negative and an outsized quantity, an unknown status
# ("INVALID") and a malformed email address ("grace@example").
ORDERS_CSV = """order_id,customer_id,product_name,quantity,unit_price,subtotal,tax,shipping,total_amount,status,country,email,phone,created_at,ship_date
ORD001,CUST001,Widget Pro,2,29.99,59.98,5.40,4.99,70.37,shipped,US,alice@example.com,555-0101,2024-01-15,2024-01-17
ORD002,CUST002,Gadget Plus,1,49.99,49.99,4.50,0.00,54.49,delivered,US,bob@example.com,555-0102,2024-01-15,2024-01-18
ORD003,,Widget Pro,-3,29.99,-89.97,-8.10,4.99,-93.08,pending,UK,charlie@example.com,+44-20-7946-0958,2024-01-16,
ORD004,CUST004,Super Gizmo,1,199.99,199.99,18.00,0.00,217.99,shipped,US,,555-0104,2024-01-16,2024-01-19
ORD005,CUST005,Widget Pro,500,29.99,14995.00,1349.55,4.99,16349.54,pending,CA,eve@example.com,555-0105,2024-01-17,
ORD006,CUST006,Gadget Plus,2,49.99,99.98,9.00,4.99,113.97,INVALID,US,frank@example.com,555-0106,2024-01-17,2024-01-20
ORD007,CUST007,Basic Widget,1,9.99,9.99,0.90,4.99,15.88,delivered,US,grace@example,555-0107,2024-01-18,2024-01-20
ORD008,CUST008,Premium Bundle,3,99.99,299.97,27.00,0.00,326.97,shipped,DE,hans@example.de,+49-30-12345678,2024-01-18,2024-01-22
ORD009,CUST009,Widget Pro,1,29.99,29.99,2.70,4.99,37.68,delivered,US,ivan@example.com,,2024-01-19,2024-01-21
ORD010,CUST010,Super Gizmo,2,199.99,399.98,36.00,0.00,435.98,pending,JP,jun@example.jp,+81-3-1234-5678,2024-01-19,
"""
# Write with an explicit encoding so the file does not depend on the platform
# default; strip() drops the trailing newline left before the closing quotes.
with open("orders.csv", "w", encoding="utf-8") as f:
    f.write(ORDERS_CSV.strip())
print("✅ Sample data created")

## Connect & Validate

In [None]:
from duckguard import connect

# Open the sample CSV as a dataset. The same connect() call accepts other
# source URIs as well, for example:
#   connect("s3://bucket/orders.parquet")
#   connect("snowflake://account/db", table="orders")
#   connect("databricks://host", table="orders")
orders = connect("orders.csv")

# Quick sanity check that the file loaded: row count and number of columns.
print(f"Rows: {orders.row_count}, Columns: {len(orders.columns)}")

In [None]:
# Validate — just like pytest assertions.
# Each entry pairs a human-readable label with the result of one column check.
checks = [
    ("order_id not null", orders.order_id.is_not_null()),
    ("order_id unique", orders.order_id.is_unique()),
    ("customer_id not null", orders.customer_id.is_not_null()),
    ("quantity in [1, 100]", orders.quantity.between(1, 100)),
    ("total_amount positive", orders.total_amount.greater_than(0)),
    ("status valid", orders.status.isin(["pending", "shipped", "delivered", "cancelled"])),
]

# Print one pass/fail line per check; failed checks also get their summary.
for name, result in checks:
    icon = "✅" if result.passed else "❌"
    print(f"{icon} {name}")
    if not result.passed:
        print(f"   → {result.summary()}")

## Quality Score & Profile

In [None]:
# Score the dataset, then print the grade with the overall value followed by
# the four dimension values, one per line.
score = orders.score()
print(
    f"Grade: {score.grade} ({score.overall:.1f}/100)",
    f"  Completeness: {score.completeness:.1f}%",
    f"  Uniqueness:   {score.uniqueness:.1f}%",
    f"  Validity:     {score.validity:.1f}%",
    f"  Consistency:  {score.consistency:.1f}%",
    sep="\n",
)

In [None]:
from duckguard import AutoProfiler, SemanticAnalyzer

# Profile the dataset and print a fixed-width table with one row per column:
# name, null percentage, unique percentage and quality grade.
profile = AutoProfiler().profile(orders)
print(f"{'Column':<20} {'Nulls %':<10} {'Unique %':<10} {'Grade'}")
print("-" * 50)
for col in profile.columns:
    print(f"{col.name:<20} {col.null_percent:<10.1f} {col.unique_percent:<10.1f} {col.quality_grade}")

# PII detection: only print the warning when the analysis lists PII columns.
analysis = SemanticAnalyzer().analyze(orders)
if analysis.pii_columns:
    print(f"\n⚠️  PII detected in: {analysis.pii_columns}")

## Anomaly Detection

In [None]:
from duckguard import detect_anomalies

# Run z-score anomaly detection on the two numeric columns of interest and
# print one line per reported entry, flagged with an alarm icon when anomalous.
report = detect_anomalies(orders, method="zscore", columns=["quantity", "total_amount"])
for a in report.anomalies:
    status = "🚨" if a.is_anomaly else "✅"
    print(f"{status} {a.column}: score={a.score:.2f}")

## Next Steps

- 📚 [Full Documentation](https://xdatahubai.github.io/duckguard/)
- 🔌 [Snowflake/Databricks/BigQuery connectors](https://xdatahubai.github.io/duckguard/connectors/overview/)
- 🤖 [AI-powered features](https://xdatahubai.github.io/duckguard/guide/ai-features/) — explain, suggest, and fix data quality issues
- ⭐ [Star on GitHub](https://github.com/XDataHubAI/duckguard)