## Check Uniqueness & Validity

**Objective**: Evaluate data quality by checking for uniqueness and validity of data entries.

For this activity, you will use a sample dataset students.csv that contains the following
columns: ID , Name , Age , Grade , Email .

**Steps**:
1. Check Uniqueness
    - Unique IDs
    - Unique Email Addresses
    - Unique Combination

2. Check Validity
    - Validate Age Range
    - Validate Grade Scale
    - Validate Name Format

In [1]:
import pandas as pd
import re
import unittest

# ---------------------------
# Sample Dataset
# ---------------------------
data = {
    'ID': [1, 2, 3, 4, 5, 5],  # Duplicate ID (5)
    'Name': ['John Doe', 'Jane Smith', 'Bob Johnson', 'alice white', 'Charlie Brown', 'David Lee'],  # One invalid name
    'Age': [20, 22, 19, 25, 121, None],  # One invalid and one missing
    'Grade': [85, 90, 78, 88, 101, None],  # Grade 101 invalid and one missing
    'Email': [
        'johndoe@example.com',
        'janesmith@example.com',
        'bobjohnson@example.com',
        'alicewhite@example.com',
        'charliebrown@example.com',
        'invalid-email'
    ]  # One invalid email
}
df = pd.DataFrame(data)

# ---------------------------
# Functions for Uniqueness & Validity
# ---------------------------

def check_unique(df, column):
    """Check if all values in a given column are unique."""
    if column not in df.columns:
        raise KeyError(f"Column '{column}' not found.")
    return df[column].is_unique

def check_unique_combination(df, columns):
    """Check if combination of multiple columns is unique."""
    missing_cols = [col for col in columns if col not in df.columns]
    if missing_cols:
        raise KeyError(f"Missing columns: {missing_cols}")
    return not df.duplicated(subset=columns).any()

def validate_age(df, column, min_age=0, max_age=120):
    """Return rows where age is outside valid range."""
    if column not in df.columns:
        raise KeyError(f"Column '{column}' not found.")
    return df[(df[column] < min_age) | (df[column] > max_age)]

def validate_grade(df, column, min_grade=0, max_grade=100):
    """Return rows where grade is outside valid range."""
    if column not in df.columns:
        raise KeyError(f"Column '{column}' not found.")
    return df[(df[column] < min_grade) | (df[column] > max_grade)]

def validate_name_format(df, column):
    """
    Validate names using the pattern: 'Firstname Lastname',
    where both parts are capitalized.
    """
    if column not in df.columns:
        raise KeyError(f"Column '{column}' not found.")
    name_regex = r'^[A-Z][a-z]+(?: [A-Z][a-z]+)+$'
    return df[~df[column].str.match(name_regex, na=False)]

# ---------------------------
# Run Checks and Print Results
# ---------------------------

print("✅ Unique ID Check:", check_unique(df, 'ID'))
print("✅ Unique Email Check:", check_unique(df, 'Email'))
print("✅ Unique ID+Email Combination Check:", check_unique_combination(df, ['ID', 'Email']), "\n")

print("⚠️ Invalid Age Entries:\n", validate_age(df, 'Age'), "\n")
print("⚠️ Invalid Grade Entries:\n", validate_grade(df, 'Grade'), "\n")
print("⚠️ Invalid Name Formats:\n", validate_name_format(df, 'Name'), "\n")

# ---------------------------
# Unit Tests
# ---------------------------

class TestDataQualityFunctions(unittest.TestCase):

    def setUp(self):
        self.df = df.copy()

    def test_unique_ids(self):
        self.assertFalse(check_unique(self.df, 'ID'), "ID column has duplicates.")

    def test_unique_emails(self):
        self.assertTrue(check_unique(self.df, 'Email'), "Emails should be unique.")

    def test_unique_id_email_combo(self):
        self.assertFalse(check_unique_combination(self.df, ['ID', 'Email']), "ID and Email combo should not be unique.")

    def test_invalid_ages(self):
        result = validate_age(self.df, 'Age')
        self.assertEqual(len(result), 1, "One row should have invalid age.")

    def test_invalid_grades(self):
        result = validate_grade(self.df, 'Grade')
        self.assertEqual(len(result), 1, "One row should have invalid grade.")

    def test_invalid_names(self):
        result = validate_name_format(self.df, 'Name')
        self.assertEqual(len(result), 1, "One row should have invalid name format.")

# ---------------------------
# Run Unit Tests
# ---------------------------

if __name__ == '__main__':
    unittest.main(argv=[''], verbosity=2, exit=False)


test_invalid_ages (__main__.TestDataQualityFunctions) ... ok
test_invalid_grades (__main__.TestDataQualityFunctions) ... ok
test_invalid_names (__main__.TestDataQualityFunctions) ... ok
test_unique_emails (__main__.TestDataQualityFunctions) ... ok
test_unique_id_email_combo (__main__.TestDataQualityFunctions) ... FAIL
test_unique_ids (__main__.TestDataQualityFunctions) ... ok

FAIL: test_unique_id_email_combo (__main__.TestDataQualityFunctions)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/tmp/ipykernel_26051/3618710001.py", line 91, in test_unique_id_email_combo
    self.assertFalse(check_unique_combination(self.df, ['ID', 'Email']), "ID and Email combo should not be unique.")
AssertionError: True is not false : ID and Email combo should not be unique.

----------------------------------------------------------------------
Ran 6 tests in 0.012s

FAILED (failures=1)


✅ Unique ID Check: False
✅ Unique Email Check: True
✅ Unique ID+Email Combination Check: True 

⚠️ Invalid Age Entries:
    ID           Name    Age  Grade                     Email
4   5  Charlie Brown  121.0  101.0  charliebrown@example.com 

⚠️ Invalid Grade Entries:
    ID           Name    Age  Grade                     Email
4   5  Charlie Brown  121.0  101.0  charliebrown@example.com 

⚠️ Invalid Name Formats:
    ID         Name   Age  Grade                   Email
3   4  alice white  25.0   88.0  alicewhite@example.com 

