In [None]:
# Activity 3: Data Standardization & Validation

# Task A: Enforcing Data Formats & Constraints

# 13. Date Format Standardization:
# - Convert all date entries into a uniform format (e.g., YYYY-MM-DD).





# 14. Numeric Constraints Enforcement:
# - Check and enforce numeric constraints (e.g., age > 0).






# 15. String Format Checks:
# - Ensure text fields meet certain constraints (e.g., valid email format).

In [1]:
import pandas as pd
import numpy as np
import re

# Sample dataset with inconsistent formats and errors
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'DOB': ['01-02-1990', '1992/03/15', '15-Apr-1988', '1985-06-20', '07.07.1995'],
    'Age': [33, -5, 37, 39, 28],
    'Email': ['alice@example.com', 'bob[at]email.com', 'charlie@email.com', 'david@domain', 'eve123@site.org']
}
df = pd.DataFrame(data)
print("Original Dataset:")
print(df)

# -------------------------------------
# 13. Date Format Standardization
# -------------------------------------
df['DOB'] = pd.to_datetime(df['DOB'], errors='coerce')  # Convert to datetime
df['DOB'] = df['DOB'].dt.strftime('%Y-%m-%d')            # Standardize format
print("\nStandardized Date Format (DOB):")
print(df[['Name', 'DOB']])

# -------------------------------------
# 14. Numeric Constraints Enforcement (Age > 0)
# -------------------------------------
# Replace invalid ages with NaN or fix them as needed
df['Age'] = df['Age'].apply(lambda x: x if x > 0 else np.nan)
print("\nAge Validation (Age > 0):")
print(df[['Name', 'Age']])

# -------------------------------------
# 15. String Format Checks (Valid Email Format)
# -------------------------------------
def is_valid_email(email):
    pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
    return bool(re.match(pattern, email))

df['Email_Valid'] = df['Email'].apply(is_valid_email)
print("\nEmail Format Check:")
print(df[['Name', 'Email', 'Email_Valid']])


Original Dataset:
      Name          DOB  Age              Email
0    Alice   01-02-1990   33  alice@example.com
1      Bob   1992/03/15   -5   bob[at]email.com
2  Charlie  15-Apr-1988   37  charlie@email.com
3    David   1985-06-20   39       david@domain
4      Eve   07.07.1995   28    eve123@site.org

Standardized Date Format (DOB):
      Name         DOB
0    Alice  1990-01-02
1      Bob         NaN
2  Charlie         NaN
3    David         NaN
4      Eve         NaN

Age Validation (Age > 0):
      Name   Age
0    Alice  33.0
1      Bob   NaN
2  Charlie  37.0
3    David  39.0
4      Eve  28.0

Email Format Check:
      Name              Email  Email_Valid
0    Alice  alice@example.com         True
1      Bob   bob[at]email.com        False
2  Charlie  charlie@email.com         True
3    David       david@domain        False
4      Eve    eve123@site.org         True


In [None]:
# Task B: Addressing Inconsistent Representations

# 16. Standardizing Date Formats:
# - Identify and correct inconsistent date formats within the dataset.








# 17. Pattern Matching for Consistency:
# - Standardize phone numbers to a specific pattern (e.g., (123) 456-7890).





# 18. Handling Mixed Case Text:
# - Convert all text entries to a consistent case (e.g., all uppercase).











In [2]:
import pandas as pd
import re

# Sample dataset with inconsistent formats
data = {
    'Name': ['alice', 'BOB', 'Charlie', 'DaVid', 'eve'],
    'JoinDate': ['2022/01/15', '15-02-2022', 'March 3, 2022', '2022.04.10', '2022-05-01'],
    'Phone': ['1234567890', '(123)4567890', '123-456-7890', '123 456 7890', '+1-123-456-7890']
}

df = pd.DataFrame(data)
print("Original Dataset:")
print(df)

# -------------------------------------
# 16. Standardizing Date Formats
# -------------------------------------
df['JoinDate'] = pd.to_datetime(df['JoinDate'], errors='coerce')
df['JoinDate'] = df['JoinDate'].dt.strftime('%Y-%m-%d')
print("\nStandardized Join Dates (YYYY-MM-DD):")
print(df[['Name', 'JoinDate']])

# -------------------------------------
# 17. Pattern Matching for Consistency (Phone Numbers)
# -------------------------------------
def standardize_phone(phone):
    # Extract digits
    digits = re.sub(r'\D', '', phone)
    # Format as (123) 456-7890 if 10 digits
    if len(digits) == 10:
        return f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"
    elif len(digits) == 11 and digits.startswith("1"):
        return f"({digits[1:4]}) {digits[4:7]}-{digits[7:]}"
    else:
        return "Invalid"

df['Phone'] = df['Phone'].apply(standardize_phone)
print("\nStandardized Phone Numbers:")
print(df[['Name', 'Phone']])

# -------------------------------------
# 18. Handling Mixed Case Text
# -------------------------------------
df['Name'] = df['Name'].str.upper()  # Convert all names to uppercase
print("\nNames Converted to Uppercase:")
print(df[['Name']])


Original Dataset:
      Name       JoinDate            Phone
0    alice     2022/01/15       1234567890
1      BOB     15-02-2022     (123)4567890
2  Charlie  March 3, 2022     123-456-7890
3    DaVid     2022.04.10     123 456 7890
4      eve     2022-05-01  +1-123-456-7890

Standardized Join Dates (YYYY-MM-DD):
      Name    JoinDate
0    alice  2022-01-15
1      BOB         NaN
2  Charlie         NaN
3    DaVid         NaN
4      eve         NaN

Standardized Phone Numbers:
      Name           Phone
0    alice  (123) 456-7890
1      BOB  (123) 456-7890
2  Charlie  (123) 456-7890
3    DaVid  (123) 456-7890
4      eve  (123) 456-7890

Names Converted to Uppercase:
      Name
0    ALICE
1      BOB
2  CHARLIE
3    DAVID
4      EVE
