## Check Accuracy & Completeness

**Objective**: Learn to assess data quality by checking for accuracy and completeness using Python.

For this, you will use a sample dataset students.csv that contains the following
columns: ID , Name , Age , Grade , Email .

**Steps**:
1. Check Accuracy
    - Verify Numerical Data Accuracy
    - Validate Email Format
    - Integer Accuracy Check for Age
2. Check Completeness
    - Identify Missing Values
    - Rows with Missing Data
    - Column Specific Missing Value Check

In [1]:
import pandas as pd
import re

def check_accuracy_and_completeness(csv_file):
    """
    Checks the accuracy and completeness of data in a CSV file.

    Args:
        csv_file (str): Path to the CSV file.

    Returns:
        dict: A dictionary containing the results of the accuracy and completeness checks.
              The dictionary has the following keys:
                'numerical_accuracy': dict,  # Results of numerical data accuracy check
                'email_format_validity': dict,  # Results of email format validation
                'age_integer_accuracy': dict,  # Results of age integer accuracy check
                'missing_values': dict,  # Results of missing values identification
                'rows_with_missing_data': pandas.DataFrame,  # Rows with missing data
                'column_missing_value_check': dict # Missing value check per column
              Returns None if there are errors.
    """
    try:
        # Read the CSV file into a Pandas DataFrame
        df = pd.read_csv(csv_file)
    except FileNotFoundError:
        print(f"Error: File not found at {csv_file}")
        return None
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return None

    results = {}  # Dictionary to store the results

    # 1. Check Accuracy
    #    - Verify Numerical Data Accuracy (for Grade)
    results['numerical_accuracy'] = {}
    if 'Grade' in df.columns:
        numerical_accuracy = df[df['Grade'].apply(lambda x: isinstance(x, (int, float)))]
        results['numerical_accuracy']['valid_grades'] = numerical_accuracy['Grade'].tolist()
        results['numerical_accuracy']['invalid_grades'] = df[~df['Grade'].apply(lambda x: isinstance(x, (int, float)))]['Grade'].tolist()
    else:
        results['numerical_accuracy']['error'] = "Column 'Grade' not found."

    #    - Validate Email Format
    results['email_format_validity'] = {}
    if 'Email' in df.columns:
        email_pattern = r"^[^\s@]+@[^\s@]+\.[^\s@]{2,}$"
        valid_emails = df[df['Email'].str.match(email_pattern, na=False)]['Email'].tolist()
        invalid_emails = df[~df['Email'].str.match(email_pattern, na=False)]['Email'].tolist()
        results['email_format_validity']['valid_emails'] = valid_emails
        results['email_format_validity']['invalid_emails'] = invalid_emails
    else:
        results['email_format_validity']['error'] = "Column 'Email' not found."

    #    - Integer Accuracy Check for Age
    results['age_integer_accuracy'] = {}
    if 'Age' in df.columns:
        integer_ages = df[df['Age'].apply(lambda x: isinstance(x, int))]['Age'].tolist()
        non_integer_ages = df[~df['Age'].apply(lambda x: isinstance(x, int))]['Age'].tolist()
        results['age_integer_accuracy']['integer_ages'] = integer_ages
        results['age_integer_accuracy']['non_integer_ages'] = non_integer_ages
    else:
        results['age_integer_accuracy']['error'] = "Column 'Age' not found."

    # 2. Check Completeness
    #    - Identify Missing Values
    results['missing_values'] = {}
    missing_value_counts = df.isnull().sum()
    for column, count in missing_value_counts.items():
        if count > 0:
            results['missing_values'][column] = count

    #    - Rows with Missing Data
    results['rows_with_missing_data'] = df[df.isnull().any(axis=1)]

    #    - Column Specific Missing Value Check
    results['column_missing_value_check'] = {}
    for col in df.columns:
        results['column_missing_value_check'][col] = df[col].isnull().sum()

    return results



def main():
    """
    Main function to run the accuracy and completeness check and print the results.
    """
    # Provide the path to your CSV file
    csv_file = 'students.csv'  # Replace with your actual file path

    # Create a dummy CSV file for demonstration
    try:
        with open(csv_file, 'w') as f:
            f.write("ID,Name,Age,Grade,Email\n1,Alice,30,A,alice@example.com\n2,Bob,25,B,bob\n3,Charlie,30.5,C,charlie@test.org\n4,David,22,D,david@example.com\n5,Eve,,A,eve@sub.domain.com\n6,Frank,28,B,\n")
    except FileExistsError:
        pass

    # Check accuracy and completeness
    results = check_accuracy_and_completeness(csv_file)

    # Print the results
    if results is not None:
        print("Data Accuracy and Completeness Check Results:")

        print("\n1. Check Accuracy")
        print("\n   - Verify Numerical Data Accuracy (for Grade):")
        if 'error' in results['numerical_accuracy']:
            print(f"     {results['numerical_accuracy']['error']}")
        else:
            print(f"     Valid Grades: {results['numerical_accuracy']['valid_grades']}")
            print(f"     Invalid Grades: {results['numerical_accuracy']['invalid_grades']}")

        print("\n   - Validate Email Format:")
        if 'error' in results['email_format_validity']:
            print(f"     {results['email_format_validity']['error']}")
        else:
            print(f"     Valid Emails: {results['email_format_validity']['valid_emails']}")
            print(f"     Invalid Emails: {results['email_format_validity']['invalid_emails']}")

        print("\n   - Integer Accuracy Check for Age:")
        if 'error' in results['age_integer_accuracy']:
            print(f"     {results['age_integer_accuracy']['error']}")
        else:
            print(f"     Integer Ages: {results['age_integer_accuracy']['integer_ages']}")
            print(f"     Non-Integer Ages: {results['age_integer_accuracy']['non_integer_ages']}")

        print("\n2. Check Completeness")
        print("\n   - Identify Missing Values:")
        if results['missing_values']:
            for column, count in results['missing_values'].items():
                print(f"     Column '{column}': {count} missing values")
        else:
            print("     No missing values found.")

        print("\n   - Rows with Missing Data:")
        if not results['rows_with_missing_data'].empty:
            print(results['rows_with_missing_data'].to_string(index=False))
        else:
            print("     No rows with missing data.")

        print("\n   - Column Specific Missing Value Check:")
        for column, count in results['column_missing_value_check'].items():
            print(f"     Column '{column}': {count} missing values")
    else:
        print("An error occurred during the data quality check.")



if __name__ == "__main__":
    main()


Data Accuracy and Completeness Check Results:

1. Check Accuracy

   - Verify Numerical Data Accuracy (for Grade):
     Valid Grades: []
     Invalid Grades: ['A', 'B', 'C', 'D', 'A', 'B']

   - Validate Email Format:
     Valid Emails: ['alice@example.com', 'charlie@test.org', 'david@example.com', 'eve@sub.domain.com']
     Invalid Emails: ['bob', nan]

   - Integer Accuracy Check for Age:
     Integer Ages: []
     Non-Integer Ages: [30.0, 25.0, 30.5, 22.0, nan, 28.0]

2. Check Completeness

   - Identify Missing Values:
     Column 'Age': 1 missing values
     Column 'Email': 1 missing values

   - Rows with Missing Data:
 ID  Name  Age Grade              Email
  5   Eve  NaN     A eve@sub.domain.com
  6 Frank 28.0     B                NaN

   - Column Specific Missing Value Check:
     Column 'ID': 0 missing values
     Column 'Name': 0 missing values
     Column 'Age': 1 missing values
     Column 'Grade': 0 missing values
     Column 'Email': 1 missing values
