In [None]:
# Activity 3: Data Standardization & Validation

# Task A: Enforcing Data Formats & Constraints

# 13. Date Format Standardization:
# - Convert all date entries into a uniform format (e.g., YYYY-MM-DD).
# 14. Numeric Constraints Enforcement:
# - Check and enforce numeric constraints (e.g., age > 0).
# 15. String Format Checks:
# - Ensure text fields meet certain constraints (e.g., valid email format)
import pandas as pd
import numpy as np
import re
def create_sample_dataframe():
    data = {
        'ID': [1, 2, 3, 4, 5],
        'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
        'Age': [25, -30, 28, 22, 120],
        'DOB': ['2000-01-01', '1995/05/15', '1988-12-25', '1978-03-10', '2002-07-20'],
        'Email': ['alice@example.com', 'bob.invalid', 'charlie@test.org', 'david@123.com', 'eve@example.co.in'],
        'Phone': ['123-456-7890', '9876543210', '555-123-4567', 'N/A', '111 222 3333']
    }
    return pd.DataFrame(data)
def standardize_date_format(df, date_column='DOB', target_format='%Y-%m-%d'):
    df_formatted = df.copy()
    df_formatted[date_column] = pd.to_datetime(df_formatted[date_column], errors='coerce').dt.strftime(target_format)
    print(f"\nDataFrame after date format standardization ({target_format}):")
    print(df_formatted)
    return df_formatted
def enforce_numeric_constraints(df, numeric_column='Age', min_value=0, max_value=100):
    df_constrained = df.copy()
    df_constrained[numeric_column] = np.where(
        (df_constrained[numeric_column] < min_value) | (df_constrained[numeric_column] > max_value),
        np.nan, 
        df_constrained[numeric_column]
    )
    print(f"\nDataFrame after enforcing numeric constraints ({numeric_column} between {min_value} and {max_value}):")
    print(df_constrained)
    return df_constrained
def check_string_format(df, string_column='Email', regex=r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'):
    df_checked = df.copy()
    df_checked['Email_Valid'] = df_checked[string_column].str.match(regex, na=False) # na=False important
    print(f"\nDataFrame after checking string format ({string_column} with regex '{regex}'):")
    print(df_checked)
    return df_checked
def main():
    df = create_sample_dataframe()
    print("\nOriginal DataFrame:")
    print(df)
    df_dates_standardized = standardize_date_format(df)
    df_numeric_constrained = enforce_numeric_constraints(df_dates_standardized)
    df_string_checked = check_string_format(df_numeric_constrained)
    print("\nReturned DataFrame from standardize_date_format():")
    print(df_dates_standardized)
    print("\nReturned DataFrame from enforce_numeric_constraints():")
    print(df_numeric_constrained)
    print("\nReturned DataFrame from check_string_format():")
    print(df_string_checked)
if __name__ == "__main__":
    main()


Original DataFrame:
   ID     Name  Age         DOB              Email         Phone
0   1    Alice   25  2000-01-01  alice@example.com  123-456-7890
1   2      Bob  -30  1995/05/15        bob.invalid    9876543210
2   3  Charlie   28  1988-12-25   charlie@test.org  555-123-4567
3   4    David   22  1978-03-10      david@123.com           N/A
4   5      Eve  120  2002-07-20  eve@example.co.in  111 222 3333

DataFrame after date format standardization (%Y-%m-%d):
   ID     Name  Age         DOB              Email         Phone
0   1    Alice   25  2000-01-01  alice@example.com  123-456-7890
1   2      Bob  -30         NaN        bob.invalid    9876543210
2   3  Charlie   28  1988-12-25   charlie@test.org  555-123-4567
3   4    David   22  1978-03-10      david@123.com           N/A
4   5      Eve  120  2002-07-20  eve@example.co.in  111 222 3333

DataFrame after enforcing numeric constraints (Age between 0 and 100):
   ID     Name   Age         DOB              Email         Phone
0   

In [2]:
# Task B: Addressing Inconsistent Representations

# 16. Standardizing Date Formats:
# - Identify and correct inconsistent date formats within the dataset.

# 17. Pattern Matching for Consistency:
# - Standardize phone numbers to a specific pattern (e.g., (123) 456-7890).

# 18. Handling Mixed Case Text:
# - Convert all text entries to a consistent case (e.g., all uppercase).
import pandas as pd
import numpy as np
import re
def create_sample_dataframe():
    data = {
        'ID': [1, 2, 3, 4, 5],
        'Name': ['Alice', 'bOb', 'Charlie', 'David', 'Eve'],
        'Age': [25, 30, 28, 22, 120],
        'DOB': ['2000-01-01', '1995/05/15', '1988.12.25', '1978-03-10', '2002/07/20'],
        'Email': ['alice@example.com', 'bob.invalid', 'charlie@test.org', 'david@123.com', 'eve@example.co.in'],
        'Phone': ['123-456-7890', '9876543210', '(555) 123-4567', 'N/A', '111 222 3333']
    }
    return pd.DataFrame(data)
def standardize_date_formats(df, date_column='DOB', target_format='%Y-%m-%d'):
    """
    Standardizes various date formats in a DataFrame column to a uniform format.

    Args:
        df (pd.DataFrame): The input DataFrame.
        date_column (str, optional): The name of the date column. Defaults to 'DOB'.
        target_format (str, optional): The desired date format. Defaults to '%Y-%m-%d'.

    Returns:
        pd.DataFrame: A new DataFrame with the date column standardized.
    """
    df_formatted = df.copy()
    formats = ['%Y-%m-%d', '%Y/%m/%d', '%Y.%m.%d', '%Y-%d-%m', '%Y/%d/%m', '%Y.%d.%m']
    for fmt in formats:
        df_formatted[date_column] = pd.to_datetime(df_formatted[date_column], format=fmt, errors='coerce')
    df_formatted[date_column] = df_formatted[date_column].dt.strftime(target_format)
    print(f"\nDataFrame after standardizing date formats to {target_format}:")
    print(df_formatted)
    return df_formatted


def standardize_phone_numbers(df, phone_column='Phone', target_pattern=r'(\d{3}) \d{3}-\d{4}'):
    """
    Standardizes phone numbers in a DataFrame column to a specific pattern.

    Args:
        df (pd.DataFrame): The input DataFrame.
        phone_column (str, optional): The name of the phone number column. Defaults to 'Phone'.
        target_pattern (str, optional): The desired phone number pattern (regex).
            Defaults to r'(\d{3}) \d{3}-\d{4}'.

    Returns:
        pd.DataFrame: A new DataFrame with standardized phone numbers.
    """
    df_standardized = df.copy()
    def format_phone(phone):
        if isinstance(phone, str):
            numeric_phone = re.sub(r'\D', '', phone)
            if len(numeric_phone) == 10:
                return f'({numeric_phone[:3]}) {numeric_phone[3:6]}-{numeric_phone[6:]}'
            elif numeric_phone.lower() == 'na':
                return 'N/A'
            else:
                return None 
        else:
            return None
    df_standardized[phone_column] = df_standardized[phone_column].apply(format_phone)

    print(f"\nDataFrame after standardizing phone numbers to {target_pattern}:")
    print(df_standardized)
    return df_standardized
def standardize_text_case(df, text_column='Name', case='upper'):
    """
    Converts text entries in a DataFrame column to a consistent case.

    Args:
        df (pd.DataFrame): The input DataFrame.
        text_column (str, optional): The name of the text column. Defaults to 'Name'.
        case (str, optional): The desired case ('upper', 'lower', or 'title').
            Defaults to 'upper'.

    Returns:
        pd.DataFrame: A new DataFrame with standardized text case.
    """
    df_cased = df.copy()
    if case == 'upper':
        df_cased[text_column] = df_cased[text_column].str.upper()
    elif case == 'lower':
        df_cased[text_column] = df_cased[text_column].str.lower()
    elif case == 'title':
        df_cased[text_column] = df_cased[text_column].str.title()
    else:
        print("Error: Invalid case argument.  Please choose 'upper', 'lower', or 'title'.")
        return df 
    print(f"\nDataFrame after converting text case in column '{text_column}' to {case}:")
    print(df_cased)
    return df_cased
def main():
    df = create_sample_dataframe()
    print("\nOriginal DataFrame:")
    print(df)
    df_dates_standardized = standardize_date_formats(df)
    df_phones_standardized = standardize_phone_numbers(df_dates_standardized)
    df_text_upper = standardize_text_case(df_phones_standardized, text_column='Name', case='upper')
    print("\nReturned DataFrame from standardize_date_formats():")
    print(df_dates_standardized)
    print("\nReturned DataFrame from standardize_phone_numbers():")
    print(df_phones_standardized)
    print("\nReturned DataFrame from standardize_text_case():")
    print(df_text_upper)
if __name__ == "__main__":
    main()



Original DataFrame:
   ID     Name  Age         DOB              Email           Phone
0   1    Alice   25  2000-01-01  alice@example.com    123-456-7890
1   2      bOb   30  1995/05/15        bob.invalid      9876543210
2   3  Charlie   28  1988.12.25   charlie@test.org  (555) 123-4567
3   4    David   22  1978-03-10      david@123.com             N/A
4   5      Eve  120  2002/07/20  eve@example.co.in    111 222 3333

DataFrame after standardizing date formats to %Y-%m-%d:
   ID     Name  Age         DOB              Email           Phone
0   1    Alice   25  2000-01-01  alice@example.com    123-456-7890
1   2      bOb   30         NaN        bob.invalid      9876543210
2   3  Charlie   28         NaN   charlie@test.org  (555) 123-4567
3   4    David   22  1978-03-10      david@123.com             N/A
4   5      Eve  120         NaN  eve@example.co.in    111 222 3333

DataFrame after standardizing phone numbers to (\d{3}) \d{3}-\d{4}:
   ID     Name  Age         DOB              Emai