In [1]:
# Activity 3: Data Standardization & Validation

# Task A: Enforcing Data Formats & Constraints

# 13. Date Format Standardization:
# - Convert all date entries into a uniform format (e.g., YYYY-MM-DD).





# 14. Numeric Constraints Enforcement:
# - Check and enforce numeric constraints (e.g., age > 0).






# 15. String Format Checks:
# - Ensure text fields meet certain constraints (e.g., valid email format).


import pandas as pd
import numpy as np
import re

# Sample dataset
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'DOB': ['12-05-1998', '1990/01/25', '05.10.1985', '1987-07-13'],
    'Age': [25, -5, 38, 0],  # Age includes invalid values
    'Email': ['alice@example.com', 'bob[at]email.com', 'charlie@example', 'david@email.com']
}

df = pd.DataFrame(data)
print("Original Data:")
print(df)

# 13. Date Format Standardization
df['DOB'] = pd.to_datetime(df['DOB'], errors='coerce')  # convert to datetime
df['DOB'] = df['DOB'].dt.strftime('%Y-%m-%d')  # format as YYYY-MM-DD

print("\n13. Standardized Date Format (DOB):")
print(df[['Name', 'DOB']])

# 14. Numeric Constraints Enforcement (e.g., age > 0)
df['Valid_Age'] = df['Age'].apply(lambda x: x if x > 0 else np.nan)

print("\n14. Enforced Numeric Constraint on 'Age' (Age > 0):")
print(df[['Name', 'Age', 'Valid_Age']])

# 15. String Format Checks – validate email format
def is_valid_email(email):
    return bool(re.match(r'^[\w\.-]+@[\w\.-]+\.\w+$', email))

df['Valid_Email'] = df['Email'].apply(is_valid_email)

print("\n15. Email Format Validation:")
print(df[['Name', 'Email', 'Valid_Email']])

Original Data:
      Name         DOB  Age              Email
0    Alice  12-05-1998   25  alice@example.com
1      Bob  1990/01/25   -5   bob[at]email.com
2  Charlie  05.10.1985   38    charlie@example
3    David  1987-07-13    0    david@email.com

13. Standardized Date Format (DOB):
      Name         DOB
0    Alice  1998-12-05
1      Bob         NaN
2  Charlie         NaN
3    David         NaN

14. Enforced Numeric Constraint on 'Age' (Age > 0):
      Name  Age  Valid_Age
0    Alice   25       25.0
1      Bob   -5        NaN
2  Charlie   38       38.0
3    David    0        NaN

15. Email Format Validation:
      Name              Email  Valid_Email
0    Alice  alice@example.com         True
1      Bob   bob[at]email.com        False
2  Charlie    charlie@example        False
3    David    david@email.com         True


In [2]:
# Task B: Addressing Inconsistent Representations

# 16. Standardizing Date Formats:
# - Identify and correct inconsistent date formats within the dataset.








# 17. Pattern Matching for Consistency:
# - Standardize phone numbers to a specific pattern (e.g., (123) 456-7890).





# 18. Handling Mixed Case Text:
# - Convert all text entries to a consistent case (e.g., all uppercase).



import pandas as pd
import re

# Sample data with inconsistencies
data = {
    'Name': ['Alice', 'bOb', 'CHARLIE', 'david'],
    'JoinDate': ['12-05-2022', '2022/06/15', '15.07.2021', '2020-11-01'],
    'Phone': ['1234567890', '(123)456-7890', '123-456-7890', '123.456.7890']
}

df = pd.DataFrame(data)

print("Original Data:")
print(df)

# 16. Standardizing Date Formats
df['JoinDate'] = pd.to_datetime(df['JoinDate'], errors='coerce')  # Coerce invalid formats to NaT
df['JoinDate'] = df['JoinDate'].dt.strftime('%Y-%m-%d')

print("\n16. Standardized JoinDate Format:")
print(df[['Name', 'JoinDate']])

# 17. Pattern Matching for Phone Number Format: (123) 456-7890
def format_phone(phone):
    digits = re.sub(r'\D', '', phone)
    if len(digits) == 10:
        return f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"
    return phone  # return as-is if not 10 digits

df['Phone_Standard'] = df['Phone'].apply(format_phone)

print("\n17. Standardized Phone Numbers:")
print(df[['Name', 'Phone', 'Phone_Standard']])

# 18. Handling Mixed Case Text – convert 'Name' column to uppercase
df['Name_Upper'] = df['Name'].str.upper()

print("\n18. Names in Uppercase:")
print(df[['Name', 'Name_Upper']])







Original Data:
      Name    JoinDate          Phone
0    Alice  12-05-2022     1234567890
1      bOb  2022/06/15  (123)456-7890
2  CHARLIE  15.07.2021   123-456-7890
3    david  2020-11-01   123.456.7890

16. Standardized JoinDate Format:
      Name    JoinDate
0    Alice  2022-12-05
1      bOb         NaN
2  CHARLIE         NaN
3    david         NaN

17. Standardized Phone Numbers:
      Name          Phone  Phone_Standard
0    Alice     1234567890  (123) 456-7890
1      bOb  (123)456-7890  (123) 456-7890
2  CHARLIE   123-456-7890  (123) 456-7890
3    david   123.456.7890  (123) 456-7890

18. Names in Uppercase:
      Name Name_Upper
0    Alice      ALICE
1      bOb        BOB
2  CHARLIE    CHARLIE
3    david      DAVID
