In [1]:
# Activity 3: Data Standardization & Validation

# Task A: Enforcing Data Formats & Constraints

# 13. Date Format Standardization:
# - Convert all date entries into a uniform format (e.g., YYYY-MM-DD).





# 14. Numeric Constraints Enforcement:
# - Check and enforce numeric constraints (e.g., age > 0).






# 15. String Format Checks:
# - Ensure text fields meet certain constraints (e.g., valid email format).

In [2]:
import pandas as pd
import numpy as np
import re

# Toy records that deliberately mix date layouts, non-positive ages, and
# malformed email addresses so every check below has something to catch.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David'],
    JoinDate=['12/31/2022', '2023-01-15', '15-02-2023', 'March 5, 2023'],
    Age=[25, -5, 30, 0],
    Email=['alice@example.com', 'bob_at_example.com', 'charlie@example.com', 'david@example'],
)

# Column order follows the keyword order above.
df = pd.DataFrame(data)
print("Original DataFrame:", df, sep="\n")

# --- 13. Date Format Standardization (to YYYY-MM-DD) ---
def standardize_date(date_str):
    """Normalize a single date value to a ``YYYY-MM-DD`` string.

    Parameters
    ----------
    date_str : str or datetime-like
        One date in any layout that ``pd.to_datetime`` can infer
        (e.g. ``'12/31/2022'``, ``'2023-01-15'``, ``'March 5, 2023'``).

    Returns
    -------
    str or float
        The date formatted as ``YYYY-MM-DD``, or ``np.nan`` when the value is
        missing or cannot be parsed.
    """
    try:
        # errors='coerce' turns unparseable text into NaT instead of raising.
        parsed = pd.to_datetime(date_str, errors='coerce')
    except (TypeError, ValueError, OverflowError):
        # Inputs that to_datetime rejects outright are treated as invalid,
        # without also swallowing KeyboardInterrupt/SystemExit.
        return np.nan
    # Check for a missing result (None or NaT) explicitly rather than relying
    # on NaT.strftime raising an exception.
    if pd.isna(parsed):
        return np.nan
    return parsed.strftime('%Y-%m-%d')

# Overwrite each raw JoinDate entry with its normalized form, then show the
# whole frame under a heading.
df['JoinDate'] = df['JoinDate'].map(standardize_date)
print("\nAfter Date Format Standardization:", df, sep="\n")

# --- 14. Numeric Constraints Enforcement (Age > 0) ---
# Keep ages that satisfy Age > 0 and blank out the rest as NaN in a separate
# column, leaving the raw Age column untouched. Series.where is the
# vectorized form of the per-row check and yields the same float column.
df['Age_valid'] = df['Age'].where(df['Age'] > 0)
print("\nAfter Enforcing Age > 0 Constraint:")
print(df)

# --- 15. String Format Checks: Validate Email Format ---
def is_valid_email(email):
    """Return ``email`` unchanged when it looks like an address, else ``np.nan``.

    The check is a basic shape test (``local@domain.tld``), not full RFC
    validation.

    Parameters
    ----------
    email : object
        Candidate address. Non-string values (``None``, NaN, numbers) are
        treated as invalid.

    Returns
    -------
    str or float
        The original string when it matches the pattern, otherwise ``np.nan``.
    """
    # Missing or non-string cells can never be valid; guarding here avoids a
    # TypeError from the regex engine.
    if not isinstance(email, str):
        return np.nan
    # Basic regex for email validation
    pattern = r'^[\w\.-]+@[\w\.-]+\.\w+$'
    # fullmatch must consume the entire string, so an address followed by a
    # trailing newline (which '$' alone would tolerate) is rejected.
    return email if re.fullmatch(pattern, email) else np.nan

# Store the validated address (or NaN) in its own column so the raw Email
# values remain available for comparison.
df['Email_valid'] = df['Email'].map(is_valid_email)
print("\nAfter Email Validation:", df, sep="\n")


Original DataFrame:
      Name       JoinDate  Age                Email
0    Alice     12/31/2022   25    alice@example.com
1      Bob     2023-01-15   -5   bob_at_example.com
2  Charlie     15-02-2023   30  charlie@example.com
3    David  March 5, 2023    0        david@example

After Date Format Standardization:
      Name    JoinDate  Age                Email
0    Alice  2022-12-31   25    alice@example.com
1      Bob  2023-01-15   -5   bob_at_example.com
2  Charlie  2023-02-15   30  charlie@example.com
3    David  2023-03-05    0        david@example

After Enforcing Age > 0 Constraint:
      Name    JoinDate  Age                Email  Age_valid
0    Alice  2022-12-31   25    alice@example.com       25.0
1      Bob  2023-01-15   -5   bob_at_example.com        NaN
2  Charlie  2023-02-15   30  charlie@example.com       30.0
3    David  2023-03-05    0        david@example        NaN

After Email Validation:
      Name    JoinDate  Age                Email  Age_valid  \
0    Alice  20

  return pd.to_datetime(date_str, errors='coerce').strftime('%Y-%m-%d')


In [3]:
# Task B: Addressing Inconsistent Representations

# 16. Standardizing Date Formats:
# - Identify and correct inconsistent date formats within the dataset.








# 17. Pattern Matching for Consistency:
# - Standardize phone numbers to a specific pattern (e.g., (123) 456-7890).





# 18. Handling Mixed Case Text:
# - Convert all text entries to a consistent case (e.g., all uppercase).











In [4]:
import pandas as pd
import re

# Toy records that deliberately mix date layouts, phone-number punctuation,
# and name casing so each standardization step below has work to do.
data = dict(
    Name=['Alice Smith', 'bob JONES', 'Charlie Brown', 'david JOHNSON'],
    JoinDate=['12/31/2022', '2023-01-15', '15-02-2023', 'March 5, 2023'],
    Phone=['1234567890', '(123) 456-7890', '123-456-7890', '123.456.7890'],
)

# Column order follows the keyword order above.
df = pd.DataFrame(data)
print("Original DataFrame:", df, sep="\n")

# --- 16. Standardizing Date Formats to YYYY-MM-DD ---
def standardize_date(date_str):
    """Normalize a single date value to a ``YYYY-MM-DD`` string.

    Parameters
    ----------
    date_str : str or datetime-like
        One date in any layout that ``pd.to_datetime`` can infer.

    Returns
    -------
    str or NaTType
        The date formatted as ``YYYY-MM-DD``, or ``pd.NaT`` when the value is
        missing or cannot be parsed.
    """
    try:
        # errors='coerce' turns unparseable text into NaT instead of raising.
        parsed = pd.to_datetime(date_str, errors='coerce')
    except (TypeError, ValueError, OverflowError):
        # Inputs that to_datetime rejects outright are treated as invalid,
        # without also swallowing KeyboardInterrupt/SystemExit.
        return pd.NaT
    # Check for a missing result (None or NaT) explicitly rather than relying
    # on NaT.strftime raising an exception.
    if pd.isna(parsed):
        return pd.NaT
    return parsed.strftime('%Y-%m-%d')

# Overwrite each raw JoinDate entry with its normalized form, then show the
# whole frame under a heading.
df['JoinDate'] = df['JoinDate'].map(standardize_date)
print("\nAfter Date Format Standardization:", df, sep="\n")

# --- 17. Pattern Matching for Phone Numbers to format (123) 456-7890 ---
def standardize_phone(phone):
    """Format a phone number as ``(123) 456-7890``.

    Parameters
    ----------
    phone : object
        Phone number in any punctuation style. Non-string values (``None``,
        NaN, numbers) are returned unchanged.

    Returns
    -------
    object
        The formatted number when the input holds exactly ten digits (or
        eleven digits beginning with the country code ``1``); otherwise the
        input as given.
    """
    # Missing or non-string cells pass through untouched; re.sub would raise
    # TypeError on them.
    if not isinstance(phone, str):
        return phone
    # Remove all non-digit characters
    digits = re.sub(r'\D', '', phone)
    # Drop a leading "1" country code (e.g. "+1 123-456-7890") so such
    # numbers are normalized too instead of being left as-is.
    if len(digits) == 11 and digits.startswith('1'):
        digits = digits[1:]
    if len(digits) != 10:
        return phone  # Not a recognizable 10-digit number: leave as given
    return f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"

# Rewrite every Phone entry in the target pattern, then show the whole frame
# under a heading.
df['Phone'] = df['Phone'].map(standardize_phone)
print("\nAfter Phone Number Standardization:", df, sep="\n")

# --- 18. Handling Mixed Case Text: Convert all Names to Uppercase ---
# Upper-case every name so casing is uniform across the column; the .str
# accessor leaves missing values as they are.
df['Name'] = df['Name'].str.upper()
print("\nAfter Converting Names to Uppercase:", df, sep="\n")


Original DataFrame:
            Name       JoinDate           Phone
0    Alice Smith     12/31/2022      1234567890
1      bob JONES     2023-01-15  (123) 456-7890
2  Charlie Brown     15-02-2023    123-456-7890
3  david JOHNSON  March 5, 2023    123.456.7890

After Date Format Standardization:
            Name    JoinDate           Phone
0    Alice Smith  2022-12-31      1234567890
1      bob JONES  2023-01-15  (123) 456-7890
2  Charlie Brown  2023-02-15    123-456-7890
3  david JOHNSON  2023-03-05    123.456.7890

After Phone Number Standardization:
            Name    JoinDate           Phone
0    Alice Smith  2022-12-31  (123) 456-7890
1      bob JONES  2023-01-15  (123) 456-7890
2  Charlie Brown  2023-02-15  (123) 456-7890
3  david JOHNSON  2023-03-05  (123) 456-7890

After Converting Names to Uppercase:
            Name    JoinDate           Phone
0    ALICE SMITH  2022-12-31  (123) 456-7890
1      BOB JONES  2023-01-15  (123) 456-7890
2  CHARLIE BROWN  2023-02-15  (123) 456-7890

  return pd.to_datetime(date_str, errors='coerce').strftime('%Y-%m-%d')
