## Check Accuracy & Completeness

**Objective**: Learn to assess data quality by checking for accuracy and completeness using Python.

For this, you will use a sample dataset students.csv that contains the following
columns: ID , Name , Age , Grade , Email .

**Steps**:
1. Check Accuracy
    - Verify Numerical Data Accuracy
    - Validate Email Format
    - Integer Accuracy Check for Age
2. Check Completeness
    - Identify Missing Values
    - Rows with Missing Data
    - Column Specific Missing Value Check

In [1]:
import pandas as pd
import re
import unittest

# ---------------------------
# 1. Sample Dataset Creation
# ---------------------------

data = {
    'ID': [1, 2, 3, 4, 5, 6],
    'Name': ['John Doe', 'Jane Smith', 'Bob Johnson', 'Alice White', 'Charlie Brown', 'David Lee'],
    'Age': [20, 22, 19, 25, 121, None],  # 121 and None are invalid
    'Grade': [85, 90, 78, 88, 95, None],  # One missing grade
    'Email': [
        'johndoe@example.com',
        'janesmith@example.com',
        'bobjohnson@example.com',
        'alicewhite@example.com',
        'charliebrown@example.com',
        'invalid-email'  # Invalid
    ]
}
df = pd.DataFrame(data)

# ---------------------------
# 2. Data Quality Functions
# ---------------------------

def check_null_values(df):
    """Return rows with any missing values."""
    if not isinstance(df, pd.DataFrame):
        raise ValueError("Input must be a pandas DataFrame.")
    return df[df.isnull().any(axis=1)]

def check_numerical_accuracy(df, column, min_value, max_value):
    """Return rows where numerical values fall outside the specified range."""
    if column not in df.columns:
        raise KeyError(f"Column '{column}' not found in DataFrame.")
    if df[column].dtype not in ['int64', 'float64']:
        raise TypeError(f"Column '{column}' must be numeric.")
    return df[(df[column] < min_value) | (df[column] > max_value)]

def validate_email_format(df, column):
    """Return rows where email format is invalid."""
    if column not in df.columns:
        raise KeyError(f"Column '{column}' not found in DataFrame.")
    if not df[column].dtype == 'object':
        raise TypeError(f"Column '{column}' must be of type string.")
    email_regex = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
    return df[~df[column].str.match(email_regex, na=False)]

def check_age_validity(df, column, min_age=0, max_age=120):
    """Return rows with invalid age values."""
    return check_numerical_accuracy(df, column, min_age, max_age)

def check_column_missing(df, column):
    """Return rows where the specific column has missing values."""
    if column not in df.columns:
        raise KeyError(f"Column '{column}' not found in DataFrame.")
    return df[df[column].isnull()]

# ---------------------------
# 3. Run Checks and Show Output
# ---------------------------

print("\n---- Rows with Missing Values (Completeness) ----")
print(check_null_values(df), "\n")

print("---- Invalid Age Values (Accuracy) ----")
print(check_age_validity(df, 'Age'), "\n")

print("---- Invalid Email Formats (Accuracy) ----")
print(validate_email_format(df, 'Email'), "\n")

print("---- Missing Grade Column Values (Completeness) ----")
print(check_column_missing(df, 'Grade'), "\n")

# ---------------------------
# 4. Unit Tests
# ---------------------------

class TestDataQualityFunctions(unittest.TestCase):

    def setUp(self):
        self.df = df.copy()

    def test_check_null_values(self):
        result = check_null_values(self.df)
        self.assertEqual(len(result), 2, "Should detect 2 rows with missing values.")

    def test_check_numerical_accuracy(self):
        result = check_numerical_accuracy(self.df, 'Age', 0, 120)
        self.assertEqual(len(result), 1, "Should detect 1 invalid age (121).")

    def test_validate_email_format(self):
        result = validate_email_format(self.df, 'Email')
        self.assertEqual(len(result), 1, "Should detect 1 invalid email.")

    def test_check_age_validity(self):
        result = check_age_validity(self.df, 'Age')
        self.assertEqual(len(result), 1, "Should detect 1 invalid age using age checker.")

    def test_check_column_missing(self):
        result = check_column_missing(self.df, 'Grade')
        self.assertEqual(len(result), 1, "Should detect 1 missing grade.")

# ---------------------------
# 5. Run Tests
# ---------------------------

if __name__ == "__main__":
    unittest.main(argv=[''], verbosity=2, exit=False)


test_check_age_validity (__main__.TestDataQualityFunctions) ... ok
test_check_column_missing (__main__.TestDataQualityFunctions) ... ok
test_check_null_values (__main__.TestDataQualityFunctions) ... FAIL
test_check_numerical_accuracy (__main__.TestDataQualityFunctions) ... ok
test_validate_email_format (__main__.TestDataQualityFunctions) ... ok

FAIL: test_check_null_values (__main__.TestDataQualityFunctions)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/tmp/ipykernel_23619/2577655478.py", line 89, in test_check_null_values
    self.assertEqual(len(result), 2, "Should detect 2 rows with missing values.")
AssertionError: 1 != 2 : Should detect 2 rows with missing values.

----------------------------------------------------------------------
Ran 5 tests in 0.010s

FAILED (failures=1)



---- Rows with Missing Values (Completeness) ----
   ID       Name  Age  Grade          Email
5   6  David Lee  NaN    NaN  invalid-email 

---- Invalid Age Values (Accuracy) ----
   ID           Name    Age  Grade                     Email
4   5  Charlie Brown  121.0   95.0  charliebrown@example.com 

---- Invalid Email Formats (Accuracy) ----
   ID       Name  Age  Grade          Email
5   6  David Lee  NaN    NaN  invalid-email 

---- Missing Grade Column Values (Completeness) ----
   ID       Name  Age  Grade          Email
5   6  David Lee  NaN    NaN  invalid-email 

