In [None]:
# Activity 3: Data Standardization & Validation

# Task A: Enforcing Data Formats & Constraints

# 13. Date Format Standardization:
# - Convert all date entries into a uniform format (e.g., YYYY-MM-DD).





# 14. Numeric Constraints Enforcement:
# - Check and enforce numeric constraints (e.g., age > 0).






# 15. String Format Checks:
# - Ensure text fields meet certain constraints (e.g., valid email format).

In [1]:
import pandas as pd
import re

# Sample dataset with dates, numeric, and string fields (replace with your own CSV file)
df = pd.DataFrame({
    'employee_id': [1, 2, 3, 4],
    'name': ['John Doe', 'Jane Smith', 'Bob Johnson', 'Alice Williams'],
    'date_of_joining': ['12/31/2020', '2021-01-15', '15-02-2020', '2020/03/10'],  # Various date formats
    'age': [28, -5, 35, 40],  # Includes invalid negative age
    'email': ['john.doe@example.com', 'jane.smith@domain', 'bob.johnson@example.com', 'alice.williams@example']  # Invalid emails
})

print("Original Data:")
print(df)

# Task 13: Date Format Standardization
# Convert date_of_joining to uniform YYYY-MM-DD format
df['date_of_joining'] = pd.to_datetime(df['date_of_joining'], errors='coerce').dt.strftime('%Y-%m-%d')

print("\nAfter Date Format Standardization:")
print(df['date_of_joining'])

# Task 14: Numeric Constraints Enforcement (e.g., age > 0)
# Find invalid age entries and replace or flag them
invalid_ages = df['age'] <= 0
print("\nRows with invalid age (<= 0):")
print(df[invalid_ages])

# Replace invalid ages with NaN or another strategy
df.loc[invalid_ages, 'age'] = pd.NA

print("\nAfter enforcing numeric constraints on age:")
print(df['age'])

# Task 15: String Format Checks (valid email format)
# Simple regex for email validation
email_pattern = re.compile(r'^[\w\.-]+@[\w\.-]+\.\w+$')

def validate_email(email):
    if pd.isna(email):
        return False
    return bool(email_pattern.match(email))

df['email_valid'] = df['email'].apply(validate_email)

print("\nEmail validation results (True=valid, False=invalid):")
print(df[['email', 'email_valid']])


Original Data:
   employee_id            name date_of_joining  age                    email
0            1        John Doe      12/31/2020   28     john.doe@example.com
1            2      Jane Smith      2021-01-15   -5        jane.smith@domain
2            3     Bob Johnson      15-02-2020   35  bob.johnson@example.com
3            4  Alice Williams      2020/03/10   40   alice.williams@example

After Date Format Standardization:
0    2020-12-31
1           NaN
2           NaN
3           NaN
Name: date_of_joining, dtype: object

Rows with invalid age (<= 0):
   employee_id        name date_of_joining  age              email
1            2  Jane Smith             NaN   -5  jane.smith@domain

After enforcing numeric constraints on age:
0    28.0
1     NaN
2    35.0
3    40.0
Name: age, dtype: float64

Email validation results (True=valid, False=invalid):
                     email  email_valid
0     john.doe@example.com         True
1        jane.smith@domain        False
2  bob.johns

In [None]:
# Task B: Addressing Inconsistent Representations

# 16. Standardizing Date Formats:
# - Identify and correct inconsistent date formats within the dataset.








# 17. Pattern Matching for Consistency:
# - Standardize phone numbers to a specific pattern (e.g., (123) 456-7890).





# 18. Handling Mixed Case Text:
# - Convert all text entries to a consistent case (e.g., all uppercase).









