## Check Accuracy & Completeness

**Objective**: Learn to assess data quality by checking for accuracy and completeness using Python.

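For this, you will use a sample dataset, `students.csv`, that contains the following
columns: `ID`, `Name`, `Age`, `Grade`, `Email`. (The code cell below builds a small in-memory sample with these columns instead of reading the file.)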

**Steps**:
1. Check Accuracy
    - Verify Numerical Data Accuracy
    - Validate Email Format
    - Integer Accuracy Check for Age
2. Check Completeness
    - Identify Missing Values
    - List Rows with Missing Data
    - Column-Specific Missing Value Check

In [None]:
import pandas as pd
import re
# Sample student records, one list per column. The sample deliberately
# contains defects for the checks to find: missing values (None), a
# fractional age, and two malformed e-mail addresses.
student_ids = [1, 2, 3, 4, 5, 6, 7]
student_names = [
    'John Doe',
    'Jane Smith',
    'Bob Johnson',
    'Alice Brown',
    'Tom Wilson',
    None,
    'Mike Davis',
]
student_ages = [20, 21, None, 22, 23.5, 24, 25]
student_grades = ['A', 'B', 'C', None, 'D', 'E', 'F']
student_emails = [
    'john@example.com',
    'jane@example.com',
    'bob@example',
    'alice@example.com',
    'tom@example.com',
    'not-an-email',
    'mike@example.com',
]

# Column order in the dict determines column order in the DataFrame.
data = {
    'ID': student_ids,
    'Name': student_names,
    'Age': student_ages,
    'Grade': student_grades,
    'Email': student_emails,
}
df = pd.DataFrame(data)
def _is_non_integer_age(value):
    """Return True when a non-missing age value is not a whole number.

    Missing values (None/NaN) return False. Values that cannot be
    converted to float (e.g. the string 'abc') return True, so a
    malformed age is reported instead of aborting the report.
    """
    if not pd.notna(value):
        return False
    try:
        return not float(value).is_integer()
    except (TypeError, ValueError):
        return True


def check_accuracy(df):
    """Print accuracy diagnostics for the 'Age' and 'Email' columns of df.

    Three reports are written to stdout:
      1. descriptive statistics of the 'Age' column;
      2. how many 'Email' values match a simple address pattern, followed
         by the values that do not match;
      3. how many non-missing 'Age' values are not whole numbers, followed
         by the 'ID' and 'Age' of those rows.

    Args:
        df: DataFrame with 'ID', 'Age' and 'Email' columns.

    Returns:
        None. All results are printed.
    """
    print("\n=== DATA ACCURACY CHECKS ===")

    print("\nNumerical Data Statistics:")
    print(df[['Age']].describe())

    print("\nEmail Format Validation:")
    # Compile once instead of handing the pattern string to re for every row.
    email_pattern = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')
    # Values are stringified first, so a missing e-mail becomes 'None'/'nan',
    # fails the pattern and is listed as invalid.
    # astype(bool) keeps the mask boolean even when the column is empty.
    valid_emails = df['Email'].apply(
        lambda x: bool(email_pattern.fullmatch(str(x)))
    ).astype(bool)
    print(f"Valid emails: {valid_emails.sum()}/{len(df)}")
    print("Invalid emails:")
    print(df[~valid_emails]['Email'])

    print("\nAge Integer Check:")
    non_integer_ages = df['Age'].apply(_is_non_integer_age).astype(bool)
    print(f"Non-integer ages: {non_integer_ages.sum()}")
    print(df[non_integer_ages][['ID', 'Age']])
def check_completeness(df):
    """Print completeness diagnostics for every column of df.

    Three reports are written to stdout: the number of missing values per
    column, the rows that contain at least one missing value, and the
    percentage of missing values per column rounded to two decimals.

    Args:
        df: DataFrame to inspect.

    Returns:
        None. All results are printed.
    """
    # One boolean mask (True where a cell is missing) feeds all three reports.
    null_mask = df.isnull()
    rows_with_gaps = null_mask.any(axis=1)
    missing_pct = (null_mask.mean() * 100).round(2)

    print("\n=== DATA COMPLETENESS CHECKS ===")

    print("\nMissing Values per Column:")
    print(null_mask.sum())

    print("\nRows with Missing Data:")
    print(df.loc[rows_with_gaps])

    print("\nMissing Value Percentages:")
    print(missing_pct)
# Run both quality reports on the sample DataFrame: accuracy first, then completeness.
for run_check in (check_accuracy, check_completeness):
    run_check(df)