**Task 1**: Checking Null Values for Completeness

**Description**: Verify if there are any null values in a dataset, which indicate incomplete data.

In [1]:
# Write your code from here
import pandas as pd
from io import StringIO

# Sample dataset with some missing values
data_csv = StringIO("""
id,name,email,phone
1,Alice,alice@example.com,555-1234
2,Bob,,555-5678
3,Charlie,charlie@example.com,
4,David,david@example.com,555-8765
""")

# Load dataset
df = pd.read_csv(data_csv)

# Check for null values in each column
null_counts = df.isnull().sum()

# Check if any null values exist
any_nulls = df.isnull().values.any()

# Output results
print("Null values per column:")
print(null_counts)
print("\nAny null values in dataset?", any_nulls)


Null values per column:
id       0
name     0
email    1
phone    1
dtype: int64

Any null values in dataset? True


**Task 2**: Checking Data Type Validity

**Description**: Ensure that columns contain data of expected types, e.g., ages are integers.

In [3]:
import pandas as pd
from io import StringIO

data_csv = StringIO("""
id,name,age,email
1,Alice,30,alice@example.com
2,Bob,twentyfive,bob@example.com
3,Charlie,45,charlie@example.com
4,David,40.5,david@example.com
""")

df = pd.read_csv(data_csv)

def validate_column_type(df, column, expected_type):
    invalid_rows = []
    for i, val in df[column].items():  # Updated here
        try:
            expected_type(val)
        except (ValueError, TypeError):
            invalid_rows.append((i, val))
    return invalid_rows

invalid_ages = validate_column_type(df, 'age', int)

if not invalid_ages:
    print(f"✅ All '{'age'}' values are valid {int.__name__}s.")
else:
    print(f"⚠️ Found invalid '{'age'}' values:")
    for idx, val in invalid_ages:
        print(f"  Row {idx}: {val}")


⚠️ Found invalid 'age' values:
  Row 1: twentyfive
  Row 3: 40.5


**Task 3**: Verify Uniqueness of Identifiers

**Description**: Check if a dataset has unique identifiers (e.g., emails).

In [4]:
# Write your code from here
import pandas as pd
from io import StringIO

# Sample dataset with duplicate emails
data_csv = StringIO("""
id,name,email
1,Alice,alice@example.com
2,Bob,bob@example.com
3,Charlie,alice@example.com
4,David,david@example.com
5,Eve,eve@example.com
""")

df = pd.read_csv(data_csv)

# Check for duplicates in 'email' column
duplicates = df[df.duplicated(subset=['email'], keep=False)]

if duplicates.empty:
    print("✅ All emails are unique.")
else:
    print(f"⚠️ Found {len(duplicates)} duplicate email entries:")
    print(duplicates[['id', 'name', 'email']])


⚠️ Found 2 duplicate email entries:
   id     name              email
0   1    Alice  alice@example.com
2   3  Charlie  alice@example.com


Task 4: Validate Email Format Using Regex

Description: Validate if email addresses in a dataset have the correct format.

In [5]:
# Write your code from here
import pandas as pd
import re
from io import StringIO

# Sample dataset with some invalid emails
data_csv = StringIO("""
id,name,email
1,Alice,alice@example.com
2,Bob,bob_at_example.com
3,Charlie,charlie@example.com
4,David,david.example.com
5,Eve,eve@example.com
""")

df = pd.read_csv(data_csv)

# Simple regex pattern for email validation
email_pattern = re.compile(r"^[\w\.-]+@[\w\.-]+\.\w+$")

def validate_email_format(email):
    if isinstance(email, str) and email_pattern.match(email):
        return True
    return False

# Apply validation across the email column
df['email_valid'] = df['email'].apply(validate_email_format)

# Find invalid emails
invalid_emails = df[~df['email_valid']]

print(f"⚠️ Found {len(invalid_emails)} invalid email addresses:")
print(invalid_emails[['id', 'name', 'email']])


⚠️ Found 2 invalid email addresses:
   id   name               email
1   2    Bob  bob_at_example.com
3   4  David   david.example.com


Task 5: Check for Logical Age Validity

Description: Ensure ages are within a reasonable human range (e.g., 0-120).

In [6]:
# Write your code from here
import pandas as pd
from io import StringIO

# Sample dataset with some invalid ages
data_csv = StringIO("""
id,name,age
1,Alice,30
2,Bob,-5
3,Charlie,130
4,David,45
5,Eve,twenty
""")

df = pd.read_csv(data_csv)

def is_valid_age(age):
    try:
        age_num = int(age)
        return 0 <= age_num <= 120
    except (ValueError, TypeError):
        return False

# Apply validity check
df['age_valid'] = df['age'].apply(is_valid_age)

# Find invalid ages
invalid_ages = df[~df['age_valid']]

print(f"⚠️ Found {len(invalid_ages)} invalid age entries:")
print(invalid_ages[['id', 'name', 'age']])


⚠️ Found 3 invalid age entries:
   id     name     age
1   2      Bob      -5
2   3  Charlie     130
4   5      Eve  twenty


Task 6: Identify and Handle Missing Data

Description: Identify missing values in a dataset and impute them using a simple strategy (e.g., mean).

In [7]:
# Write your code from here
import pandas as pd
from io import StringIO
import numpy as np

# Sample dataset with missing values (NaN)
data_csv = StringIO("""
id,name,age,salary
1,Alice,30,70000
2,Bob,,65000
3,Charlie,45,
4,David,40,72000
5,Eve,,68000
""")

df = pd.read_csv(data_csv)

# Identify missing values per column
missing_counts = df.isnull().sum()
print("Missing values per column:")
print(missing_counts)

# Impute missing values in 'age' and 'salary' with column mean
for col in ['age', 'salary']:
    if df[col].isnull().any():
        mean_value = df[col].mean()
        df[col].fillna(mean_value, inplace=True)
        print(f"\nImputed missing '{col}' values with mean: {mean_value:.2f}")

print("\nDataset after imputation:")
print(df)


Missing values per column:
id        0
name      0
age       2
salary    1
dtype: int64

Imputed missing 'age' values with mean: 38.33

Imputed missing 'salary' values with mean: 68750.00

Dataset after imputation:
   id     name        age   salary
0   1    Alice  30.000000  70000.0
1   2      Bob  38.333333  65000.0
2   3  Charlie  45.000000  68750.0
3   4    David  40.000000  72000.0
4   5      Eve  38.333333  68000.0


Task 7: Detect Duplicates

Description: Detect duplicate rows in the dataset.

In [8]:
# Write your code from here
import pandas as pd
from io import StringIO

# Sample dataset with duplicate rows
data_csv = StringIO("""
id,name,email
1,Alice,alice@example.com
2,Bob,bob@example.com
3,Charlie,charlie@example.com
4,Alice,alice@example.com
5,Eve,eve@example.com
""")

df = pd.read_csv(data_csv)

# Detect duplicate rows (considering all columns)
duplicates = df[df.duplicated(keep=False)]

if duplicates.empty:
    print("✅ No duplicate rows found.")
else:
    print(f"⚠️ Found {len(duplicates)} duplicate rows:")
    print(duplicates)


✅ No duplicate rows found.


Task 8: Validate Correctness of Numerical Values

Description: Ensure numerical columns are within a specified range.

In [9]:
# Write your code from here
import pandas as pd
from io import StringIO

# Sample dataset
data_csv = StringIO("""
id,temperature,humidity
1,25,45
2,150,50
3,30,-10
4,20,55
5,18,60
""")

df = pd.read_csv(data_csv)

# Define valid ranges for each numerical column
valid_ranges = {
    'temperature': (0, 100),  # degrees Celsius
    'humidity': (0, 100)      # percentage
}

def validate_range(df, column, min_val, max_val):
    invalid = df[(df[column] < min_val) | (df[column] > max_val)]
    return invalid

# Check each column and collect invalid rows
invalid_entries = pd.DataFrame()
for col, (min_val, max_val) in valid_ranges.items():
    invalid = validate_range(df, col, min_val, max_val)
    if not invalid.empty:
        print(f"⚠️ Invalid values found in '{col}' (valid range: {min_val}-{max_val}):")
        print(invalid[['id', col]])
        invalid_entries = pd.concat([invalid_entries, invalid])

if invalid_entries.empty:
    print("✅ All numerical values are within specified ranges.")


⚠️ Invalid values found in 'temperature' (valid range: 0-100):
   id  temperature
1   2          150
⚠️ Invalid values found in 'humidity' (valid range: 0-100):
   id  humidity
2   3       -10


Task 9: Custom Completeness Rule Violation Report

Description: Create a report showing which rows violate specific completeness rules, such as mandatory fields being empty.

In [10]:
# Write your code from here
import pandas as pd
from io import StringIO

# Sample dataset with missing mandatory fields
data_csv = StringIO("""
id,name,email,phone
1,Alice,alice@example.com,123-456-7890
2,Bob,,987-654-3210
3,Charlie,charlie@example.com,
4,David,, 
5,Eve,eve@example.com,555-555-5555
""")

df = pd.read_csv(data_csv)

# Define mandatory fields
mandatory_fields = ['name', 'email', 'phone']

def completeness_violations(df, mandatory_fields):
    violations = []
    for idx, row in df.iterrows():
        missing_fields = [field for field in mandatory_fields if pd.isna(row[field]) or str(row[field]).strip() == '']
        if missing_fields:
            violations.append({'row_index': idx, 'missing_fields': missing_fields})
    return violations

violations = completeness_violations(df, mandatory_fields)

if not violations:
    print("✅ No completeness violations found.")
else:
    print(f"⚠️ Found {len(violations)} rows violating completeness rules:")
    for v in violations:
        print(f"Row {v['row_index']} missing mandatory fields: {v['missing_fields']}")


⚠️ Found 3 rows violating completeness rules:
Row 1 missing mandatory fields: ['email']
Row 2 missing mandatory fields: ['phone']
Row 3 missing mandatory fields: ['email', 'phone']


Task 10: Advanced Regex for Data Validity Check

Description: Check for validity with advanced regex patterns, such as validating complex fields with multi-level rules.

In [11]:
# Write your code from here
import pandas as pd
import re
from io import StringIO

# Sample dataset with phone numbers
data_csv = StringIO("""
id,name,phone
1,Alice,+1-800-555-1234
2,Bob,(800)555-1234 ext.123
3,Charlie,800.555.1234x456
4,David,8005551234
5,Eve,555-1234
6,Frank,123-45-6789
""")

df = pd.read_csv(data_csv)

# Advanced regex pattern for US phone numbers with optional country code and extension
phone_pattern = re.compile(
    r"""^                    # start of string
    (\+1[-.\s]?)*            # optional country code +1
    \(?(\d{3})\)?[-.\s]?     # area code with optional parentheses
    (\d{3})[-.\s]?           # first 3 digits
    (\d{4})                  # last 4 digits
    (\s*(ext|x|ext.)\s*\d+)? # optional extension
    $                        # end of string
    """,
    re.VERBOSE | re.IGNORECASE
)

def validate_phone(phone):
    if isinstance(phone, str) and phone_pattern.match(phone.strip()):
        return True
    return False

df['phone_valid'] = df['phone'].apply(validate_phone)

invalid_phones = df[~df['phone_valid']]

print(f"⚠️ Found {len(invalid_phones)} invalid phone numbers:")
print(invalid_phones[['id', 'name', 'phone']])


⚠️ Found 2 invalid phone numbers:
   id   name        phone
4   5    Eve     555-1234
5   6  Frank  123-45-6789
