In [None]:
# Activity 3: Data Standardization & Validation

# Task A: Enforcing Data Formats & Constraints

# 13. Date Format Standardization:
# - Convert all date entries into a uniform format (e.g., YYYY-MM-DD).





# 14. Numeric Constraints Enforcement:
# - Check and enforce numeric constraints (e.g., age > 0).






# 15. String Format Checks:
# - Ensure text fields meet certain constraints (e.g., valid email format).

In [6]:
import pandas as pd
import re

# Sample dataset with dates, numeric, and string fields (replace with your own CSV file)
df = pd.DataFrame({
    'employee_id': [1, 2, 3, 4],
    'name': ['John Doe', 'Jane Smith', 'Bob Johnson', 'Alice Williams'],
    'date_of_joining': ['12/31/2020', '2021-01-15', '15-02-2020', '2020/03/10'],  # Various date formats
    'age': [28, -5, 35, 40],  # Includes invalid negative age
    'email': ['john.doe@example.com', 'jane.smith@domain', 'bob.johnson@example.com', 'alice.williams@example']  # Invalid emails
})

print("Original Data:")
print(df)

# Task 13: Date Format Standardization
# Convert date_of_joining to uniform YYYY-MM-DD format
df['date_of_joining'] = pd.to_datetime(df['date_of_joining'], errors='coerce').dt.strftime('%Y-%m-%d')

print("\nAfter Date Format Standardization:")
print(df['date_of_joining'])

# Task 14: Numeric Constraints Enforcement (e.g., age > 0)
# Find invalid age entries and replace or flag them
invalid_ages = df['age'] <= 0
print("\nRows with invalid age (<= 0):")
print(df[invalid_ages])

# Replace invalid ages with NaN or another strategy
df.loc[invalid_ages, 'age'] = pd.NA

print("\nAfter enforcing numeric constraints on age:")
print(df['age'])

# Task 15: String Format Checks (valid email format)
# Simple regex for email validation
email_pattern = re.compile(r'^[\w\.-]+@[\w\.-]+\.\w+$')

def validate_email(email):
    if pd.isna(email):
        return False
    return bool(email_pattern.match(email))

df['email_valid'] = df['email'].apply(validate_email)

print("\nEmail validation results (True=valid, False=invalid):")
print(df[['email', 'email_valid']])


Original Data:
   employee_id            name date_of_joining  age                    email
0            1        John Doe      12/31/2020   28     john.doe@example.com
1            2      Jane Smith      2021-01-15   -5        jane.smith@domain
2            3     Bob Johnson      15-02-2020   35  bob.johnson@example.com
3            4  Alice Williams      2020/03/10   40   alice.williams@example

After Date Format Standardization:
0    2020-12-31
1           NaN
2           NaN
3           NaN
Name: date_of_joining, dtype: object

Rows with invalid age (<= 0):
   employee_id        name date_of_joining  age              email
1            2  Jane Smith             NaN   -5  jane.smith@domain

After enforcing numeric constraints on age:
0    28.0
1     NaN
2    35.0
3    40.0
Name: age, dtype: float64

Email validation results (True=valid, False=invalid):
                     email  email_valid
0     john.doe@example.com         True
1        jane.smith@domain        False
2  bob.johns

In [7]:
from dateutil.parser import parse
import numpy as np

def parse_date_with_fallback(date_str):
    formats = ['%Y-%m-%d', '%d/%m/%Y', '%m/%d/%Y', '%d-%m-%Y', '%Y/%m/%d']
    for fmt in formats:
        try:
            return pd.to_datetime(date_str, format=fmt)
        except (ValueError, TypeError):
            continue
    # Fallback to dateutil
    try:
        return parse(date_str)
    except (ValueError, TypeError):
        return np.nan

df['date_of_joining_parsed'] = df['date_of_joining'].apply(parse_date_with_fallback)
df['date_of_joining_parsed'] = pd.to_datetime(df['date_of_joining_parsed'], errors='coerce').dt.strftime('%Y-%m-%d')

# Flag invalid dates for review
invalid_dates = df['date_of_joining_parsed'].isna()
print(f"Invalid date rows:\n{df[invalid_dates]}")


Invalid date rows:
   employee_id            name date_of_joining   age                    email  \
1            2      Jane Smith             NaN   NaN        jane.smith@domain   
2            3     Bob Johnson             NaN  35.0  bob.johnson@example.com   
3            4  Alice Williams             NaN  40.0   alice.williams@example   

   email_valid date_of_joining_parsed  
1        False                    NaN  
2         True                    NaN  
3        False                    NaN  


In [8]:
invalid_ages = df['age'] <= 0
if invalid_ages.any():
    print("Invalid age entries detected:")
    print(df.loc[invalid_ages])
    df.loc[invalid_ages, 'age'] = np.nan
    # Export invalid age rows for manual review
    df.loc[invalid_ages].to_csv('invalid_age_entries.csv', index=False)


In [9]:
import numpy as np

email_regex = re.compile(
    r"(^[-!#$%&'*+/0-9=?A-Z^_a-z`{|}~]+(\.[-!#$%&'*+/0-9=?A-Z^_a-z`{|}~]+)*"
    r'|^"([!#-[\\]-~]*|\\[ -~])*")@([A-Za-z0-9-]+\.)+[A-Za-z]{2,}$'
)

# Vectorized email check
df['email_valid'] = np.where(df['email'].str.match(email_regex), True, False)

invalid_emails = df.loc[~df['email_valid']]
if not invalid_emails.empty:
    print("Invalid email entries:")
    print(invalid_emails)
    invalid_emails.to_csv('invalid_email_entries.csv', index=False)


Invalid email entries:
   employee_id            name date_of_joining   age                   email  \
1            2      Jane Smith             NaN   NaN       jane.smith@domain   
3            4  Alice Williams             NaN  40.0  alice.williams@example   

   email_valid date_of_joining_parsed  
1        False                    NaN  
3        False                    NaN  


In [2]:
from dateutil.parser import parse
import numpy as np

def robust_date_parse(date_str):
    try:
        return parse(date_str).date()
    except (ValueError, TypeError):
        return np.nan

df['date_of_joining'] = df['date_of_joining'].apply(robust_date_parse)
df['date_of_joining'] = pd.to_datetime(df['date_of_joining'], errors='coerce').dt.strftime('%Y-%m-%d')


In [3]:
df['age_valid'] = df['age'] > 0

# Optionally, filter or export invalid rows for manual correction
invalid_ages_df = df[~df['age_valid']]
print("Invalid ages (<=0) to be reviewed:")
print(invalid_ages_df)


Invalid ages (<=0) to be reviewed:
   employee_id        name date_of_joining  age              email  \
1            2  Jane Smith             NaN  NaN  jane.smith@domain   

   email_valid  age_valid  
1        False      False  


In [4]:
!pip install email-validator


Defaulting to user installation because normal site-packages is not writeable
Collecting email-validator
  Downloading email_validator-2.2.0-py3-none-any.whl (33 kB)
Collecting dnspython>=2.0.0
  Downloading dnspython-2.7.0-py3-none-any.whl (313 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.6/313.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: dnspython, email-validator
Successfully installed dnspython-2.7.0 email-validator-2.2.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [5]:
from email_validator import validate_email, EmailNotValidError

def is_valid_email(email):
    try:
        validate_email(email)
        return True
    except EmailNotValidError:
        return False

# Vectorized application
df['email_valid'] = df['email'].map(is_valid_email)


In [None]:
# Task B: Addressing Inconsistent Representations

# 16. Standardizing Date Formats:
# - Identify and correct inconsistent date formats within the dataset.








# 17. Pattern Matching for Consistency:
# - Standardize phone numbers to a specific pattern (e.g., (123) 456-7890).





# 18. Handling Mixed Case Text:
# - Convert all text entries to a consistent case (e.g., all uppercase).









