In [4]:
# Activity 3: Data Standardization & Validation

# Task A: Enforcing Data Formats & Constraints

# 13. Date Format Standardization:
# - Convert all date entries into a uniform format (e.g., YYYY-MM-DD).





# 14. Numeric Constraints Enforcement:
# - Check and enforce numeric constraints (e.g., age > 0).






# 15. String Format Checks:
# - Ensure text fields meet certain constraints (e.g., valid email format).

In [5]:
import pandas as pd
import numpy as np

data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'DOB': ['01-02-1990', '1992/03/04', 'March 5, 1995', '07-06-1993'],
    'Age': [25, -3, 130, 40],
    'Email': ['alice@example.com', 'bob[at]example.com', 'charlie@example', 'david@domain.com']
}

df = pd.DataFrame(data)
# Convert DOB to datetime
df['DOB'] = pd.to_datetime(df['DOB'], errors='coerce')

# Format as YYYY-MM-DD
df['DOB'] = df['DOB'].dt.strftime('%Y-%m-%d')

print("Standardized Dates:")
print(df[['Name', 'DOB']])
# Replace invalid ages with NaN
df['Age'] = df['Age'].apply(lambda x: x if 0 < x < 120 else np.nan)

print("\nAge after enforcing numeric constraints (0 < age < 120):")
print(df[['Name', 'Age']])
import re

def is_valid_email(email):
    pattern = r'^[\w\.-]+@[\w\.-]+\.\w+$'
    return bool(re.match(pattern, email))

df['Valid_Email'] = df['Email'].apply(is_valid_email)

print("\nEmail Validation:")
print(df[['Name', 'Email', 'Valid_Email']])


Standardized Dates:
      Name         DOB
0    Alice  1990-01-02
1      Bob         NaN
2  Charlie         NaN
3    David  1993-07-06

Age after enforcing numeric constraints (0 < age < 120):
      Name   Age
0    Alice  25.0
1      Bob   NaN
2  Charlie   NaN
3    David  40.0

Email Validation:
      Name               Email  Valid_Email
0    Alice   alice@example.com         True
1      Bob  bob[at]example.com        False
2  Charlie     charlie@example        False
3    David    david@domain.com         True


In [6]:
# Task B: Addressing Inconsistent Representations

# 16. Standardizing Date Formats:
# - Identify and correct inconsistent date formats within the dataset.








# 17. Pattern Matching for Consistency:
# - Standardize phone numbers to a specific pattern (e.g., (123) 456-7890).





# 18. Handling Mixed Case Text:
# - Convert all text entries to a consistent case (e.g., all uppercase).











In [7]:
import pandas as pd

data = {
    'Name': ['Alice', 'BOB', 'charLie', 'DAVID'],
    'JoinDate': ['01/02/2020', '2020-03-04', 'April 5, 2021', '06-07-2022'],
    'Phone': ['1234567890', '123-456-7890', '(123)4567890', '123.456.7890']
}

df = pd.DataFrame(data)
# Convert to datetime format (auto-detect formats)
df['JoinDate'] = pd.to_datetime(df['JoinDate'], errors='coerce')

# Format as YYYY-MM-DD
df['JoinDate'] = df['JoinDate'].dt.strftime('%Y-%m-%d')

print("Standardized JoinDate column:")
print(df[['Name', 'JoinDate']])
import re

def standardize_phone(phone):
    digits = re.sub(r'\D', '', phone)  # Remove non-digits
    if len(digits) == 10:
        return f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"
    else:
        return "Invalid"

df['Phone'] = df['Phone'].apply(standardize_phone)

print("\nStandardized Phone Numbers:")
print(df[['Name', 'Phone']])
df['Name'] = df['Name'].str.upper()

print("\nNames converted to uppercase:")
print(df['Name'])


Standardized JoinDate column:
      Name    JoinDate
0    Alice  2020-01-02
1      BOB         NaN
2  charLie         NaN
3    DAVID         NaN

Standardized Phone Numbers:
      Name           Phone
0    Alice  (123) 456-7890
1      BOB  (123) 456-7890
2  charLie  (123) 456-7890
3    DAVID  (123) 456-7890

Names converted to uppercase:
0      ALICE
1        BOB
2    CHARLIE
3      DAVID
Name: Name, dtype: object
