## Check Uniqueness & Validity

**Objective**: Evaluate data quality by checking for uniqueness and validity of data entries.

For this activity, you will use a sample dataset, `students.csv`, which contains the following
columns: `ID`, `Name`, `Age`, `Grade`, and `Email`.

**Steps**:
1. Check Uniqueness
    - Unique IDs
    - Unique Email Addresses
    - Unique Combination (ID + Email)

2. Check Validity
    - Validate Age Range
    - Validate Grade Scale
    - Validate Name Format

In [2]:
import pandas as pd
import re

def check_uniqueness_and_validity(csv_file):
    """
    Check the uniqueness and validity of data entries in a CSV file.

    Args:
        csv_file (str): Path to the CSV file.

    Returns:
        dict: A dictionary containing the results of the uniqueness and validity checks.
              The dictionary has the following keys:
                'unique_ids': bool,  # True if IDs are unique, False otherwise
                'unique_emails': bool,  # True if email addresses are unique, False otherwise
                'unique_combinations': bool,  # True if ID-Email combinations are unique, False otherwise
                'age_validity': dict,  # 'valid_ages' / 'invalid_ages' lists
                'grade_validity': dict,  # 'valid_grades' / 'invalid_grades' lists
                'name_format_validity': dict   # 'valid_names' / 'invalid_names' lists
              When a required column is missing, the uniqueness keys hold an
              explanatory string instead of a bool, and the validity dicts
              hold a single 'error' key instead of the two lists.
              Returns None if the file cannot be read (a message is printed).
    """
    try:
        # Read the CSV file into a Pandas DataFrame
        df = pd.read_csv(csv_file)
    except FileNotFoundError:
        print(f"Error: File not found at {csv_file}")
        return None
    except Exception as e:
        # Deliberately broad: any parse/IO failure is reported and mapped to
        # the documented "None on error" contract.
        print(f"Error reading CSV file: {e}")
        return None

    results = {}  # Dictionary to store the results

    has_id = 'ID' in df.columns
    has_email = 'Email' in df.columns

    # 1. Check Uniqueness
    #    - Unique IDs
    results['unique_ids'] = bool(df['ID'].is_unique) if has_id else "Column 'ID' not found"

    #    - Unique Email Addresses
    results['unique_emails'] = bool(df['Email'].is_unique) if has_email else "Column 'Email' not found"

    #    - Unique ID-Email Combinations
    if has_id and has_email:
        # duplicated().any() is True when a repeat exists, so negate it to
        # get "combinations are unique", consistent with the other two keys.
        results['unique_combinations'] = not df.duplicated(subset=['ID', 'Email']).any()
    else:
        results['unique_combinations'] = "Columns 'ID' or 'Email' not found"

    # 2. Check Validity
    #    - Validate Age Range (0 to 120 inclusive)
    results['age_validity'] = {}
    if 'Age' in df.columns:
        ages = df['Age']
        # Coerce so that non-numeric entries become NaN instead of raising;
        # between() is False for NaN, so missing/non-numeric ages are
        # reported as invalid rather than silently dropped from both lists.
        age_ok = pd.to_numeric(ages, errors='coerce').between(0, 120)
        results['age_validity']['valid_ages'] = ages[age_ok].tolist()
        results['age_validity']['invalid_ages'] = ages[~age_ok].tolist()
    else:
        results['age_validity']['error'] = "Column 'Age' not found"

    #    - Validate Grade Scale
    results['grade_validity'] = {}
    if 'Grade' in df.columns:
        grades = df['Grade']
        grade_ok = grades.isin(['A', 'B', 'C', 'D', 'F'])
        results['grade_validity']['valid_grades'] = grades[grade_ok].tolist()
        results['grade_validity']['invalid_grades'] = grades[~grade_ok].tolist()
    else:
        results['grade_validity']['error'] = "Column 'Grade' not found"

    #    - Validate Name Format
    results['name_format_validity'] = {}
    if 'Name' in df.columns:
        name_pattern = re.compile(r"^[a-zA-Z\s]+$")  # Allows only letters and spaces
        names = df['Name']
        # Test each value individually so that a column that is not
        # string-typed (e.g. entirely empty or numeric) is reported as
        # invalid instead of raising on the .str accessor.
        name_ok = names.map(
            lambda value: isinstance(value, str) and name_pattern.match(value) is not None
        ).astype(bool)
        results['name_format_validity']['valid_names'] = names[name_ok].tolist()
        results['name_format_validity']['invalid_names'] = names[~name_ok].tolist()
    else:
        results['name_format_validity']['error'] = "Column 'Name' not found"

    return results



def main():
    """
    Run the uniqueness and validity check on 'students.csv' and print the results.

    If 'students.csv' does not exist yet, a small demonstration file is
    created first; an existing file is left untouched and checked as-is.
    """
    # Provide the path to your CSV file
    csv_file = 'students.csv'  # Replace with your actual file path

    # Demonstration rows; they deliberately include an out-of-range age (121),
    # an off-scale grade (E) and a repeated email address.
    sample_rows = [
        "ID,Name,Age,Grade,Email",
        "1,Alice,30,A,alice@example.com",
        "2,Bob,25,B,bob@example.com",
        "3,Charlie,30,C,charlie@test.org",
        "4,David,121,D,david@example.com",
        "5,Eve,22,E,eve@sub.domain.com",
        "6,Frank,28,B,alice@example.com",
        "7,Grace,22,A,grace smith@email.com",
    ]

    # Create a dummy CSV file for demonstration.
    # Mode 'x' (exclusive creation) raises FileExistsError when the file is
    # already present, so real data at this path is never overwritten.
    try:
        with open(csv_file, 'x') as f:
            f.write("\n".join(sample_rows) + "\n")
    except FileExistsError:
        pass

    # Check uniqueness and validity
    results = check_uniqueness_and_validity(csv_file)

    # Print the results
    if results is None:
        print("An error occurred during the data quality check.")
        return

    print("Data Uniqueness and Validity Check Results:")

    print("\n1. Check Uniqueness")
    print(f"   - Unique IDs: {results['unique_ids']}")
    print(f"   - Unique Email Addresses: {results['unique_emails']}")
    print(f"   - Unique ID-Email Combinations: {results['unique_combinations']}")

    print("\n2. Check Validity")
    # (section title, results key, printed label, suffix of the list keys)
    validity_sections = (
        ("Validate Age Range", 'age_validity', "Ages", 'ages'),
        ("Validate Grade Scale", 'grade_validity', "Grades", 'grades'),
        ("Validate Name Format", 'name_format_validity', "Names", 'names'),
    )
    for title, key, label, suffix in validity_sections:
        print(f"\n   - {title}:")
        section = results[key]
        if 'error' in section:
            print(f"     {section['error']}")
        else:
            print(f"     Valid {label}: {section['valid_' + suffix]}")
            print(f"     Invalid {label}: {section['invalid_' + suffix]}")
