## Real-World Case Studies

### Finance - Fraud Detection Models:
**Description**: Analyze a financial dataset, define SLAs for data accuracy and
completeness, and ensure high data quality for fraud detection models.

In [1]:
import pandas as pd
import numpy as np

# -----------------------------
# Step 1: Create a sample financial transactions dataset
# -----------------------------
def create_transaction_data():
    """Return a five-row sample DataFrame of financial transactions.

    The sample deliberately contains data-quality problems: one missing
    amount (NaN), one negative amount, and one missing customer id.

    Columns, in order: transaction_amount, timestamp, ingestion_time,
    customer_id, merchant_id.
    """
    # One tuple per transaction:
    # (amount, event time, ingestion time, customer, merchant)
    records = [
        (100, '2025-05-20 12:00:00', '2025-05-20 12:10:00', 'C1', 'M1'),
        (250, '2025-05-20 12:05:00', '2025-05-20 12:10:00', 'C2', 'M2'),
        (np.nan, '2025-05-20 12:10:00', '2025-05-20 12:15:00', 'C1', 'M1'),  # missing amount
        (75, '2025-05-20 12:15:00', '2025-05-20 12:30:00', None, 'M2'),  # missing customer
        (-20, '2025-05-20 12:20:00', '2025-05-20 12:40:00', 'C4', 'M3'),  # -20 is invalid
    ]

    # Transpose the row-wise records into one list per column.
    amounts, event_times, ingest_times, customers, merchants = (
        list(column) for column in zip(*records)
    )

    return pd.DataFrame({
        'transaction_amount': amounts,
        'timestamp': pd.to_datetime(event_times),
        'ingestion_time': pd.to_datetime(ingest_times),
        'customer_id': customers,
        'merchant_id': merchants,
    })

# -----------------------------
# Step 2: Data Quality Checks
# -----------------------------
def _is_valid_amount(x):
    """Return True when x is a real, non-boolean number that is >= 0."""
    # bool subclasses int, so it must be excluded explicitly; True/False are
    # flags, not monetary amounts.
    if isinstance(x, bool):
        return False
    # np.integer / np.floating cover NumPy scalars such as np.int64 and
    # np.float32, which are not subclasses of the builtin int / float.
    if not isinstance(x, (int, float, np.integer, np.floating)):
        return False
    # NaN compares False against everything, so missing amounts are invalid.
    return bool(x >= 0)


def check_data_quality(df):
    """Compute and print accuracy and completeness scores for a transactions frame.

    Accuracy is the fraction of rows whose 'transaction_amount' is a real
    number (builtin or NumPy scalar, excluding booleans) that is >= 0.
    Missing (NaN) and non-numeric values count as inaccurate.

    Completeness is the mean, over the critical columns 'transaction_amount'
    and 'customer_id', of each column's fraction of non-null values.

    Args:
        df: DataFrame with at least the columns 'transaction_amount' and
            'customer_id'.

    Returns:
        Tuple (accuracy_score, completeness_score), each a fraction in [0, 1].

    Raises:
        KeyError: if either critical column is absent from df.
    """
    # Check 1: Accuracy - transaction_amount should be numeric and >= 0
    accuracy_check = df['transaction_amount'].apply(_is_valid_amount)
    accuracy_score = accuracy_check.mean()

    # Check 2: Completeness - no missing values in critical fields.
    # First mean: per-column non-null fraction; second mean: average of those.
    completeness_score = df[['transaction_amount', 'customer_id']].notnull().mean().mean()

    print("✅ Accuracy Score (valid transaction amounts):", round(accuracy_score * 100, 2), "%")
    print("✅ Completeness Score (non-missing critical fields):", round(completeness_score * 100, 2), "%")

    return accuracy_score, completeness_score

# -----------------------------
# Step 3: Error Handling
# -----------------------------
def safe_run(min_accuracy=0.6, min_completeness=0.8):
    """Build the sample dataset, run the quality checks, and report the result.

    Any failure (missing column, score below its threshold, or an unexpected
    error in the helpers) is caught and printed rather than propagated, so
    the function never raises.

    Args:
        min_accuracy: minimum acceptable accuracy score as a fraction.
        min_completeness: minimum acceptable completeness score as a fraction.

    Returns:
        None. The outcome is reported on stdout.
    """
    required_columns = ['transaction_amount', 'customer_id', 'timestamp', 'ingestion_time']

    try:
        df = create_transaction_data()

        # Check if required columns exist; report the first one missing.
        for col in required_columns:
            if col not in df.columns:
                raise ValueError(f"Missing required column: {col}")

        accuracy_score, completeness_score = check_data_quality(df)

        # Enforce thresholds with explicit raises instead of `assert`, which
        # is stripped when Python runs with -O and would disable the gate.
        # `not (score >= limit)` also fails when the score is NaN.
        if not accuracy_score >= min_accuracy:
            raise ValueError("Accuracy too low")
        if not completeness_score >= min_completeness:
            raise ValueError("Completeness too low")

        print("✅ All checks passed.")

    except Exception as e:
        # Top-level boundary: report the problem instead of crashing.
        print("❌ Error during data quality validation:", str(e))

# -----------------------------
# Execute
# -----------------------------
# Run the validation only when executed as a script or notebook cell,
# not as a side effect of importing this module.
if __name__ == "__main__":
    safe_run()


✅ Accuracy Score (valid transaction amounts): 60.0 %
✅ Completeness Score (non-missing critical fields): 80.0 %
✅ All checks passed.
