**Task 1**: Checking Null Values for Completeness

**Description**: Check whether a dataset contains any null values, which indicate incomplete data.

In [11]:
import pandas as pd

# Small example frame in which every column is missing exactly one entry
data = dict(
    Name=["Alice", "Bob", None, "David"],
    Age=[25, None, 30, 22],
    Email=["alice@example.com", "bob@example.com", "charlie@example.com", None],
)
df = pd.DataFrame(data)

# Boolean frame that is True wherever a cell is null; the checks below reuse it
null_mask = df.isnull()
print("Null Values in Dataset:")
print(null_mask)

# Summing the boolean mask column-wise yields the number of nulls per column
null_counts = null_mask.sum()
print("\nCount of Null Values per Column:")
print(null_counts)

# Overall verdict: a single null anywhere makes the dataset incomplete
has_nulls = null_mask.values.any()
print(
    "\nDataset contains missing values."
    if has_nulls
    else "\nDataset is complete with no missing values."
)

Null Values in Dataset:
    Name    Age  Email
0  False  False  False
1  False   True  False
2   True  False  False
3  False  False   True

Count of Null Values per Column:
Name     1
Age      1
Email    1
dtype: int64

Dataset contains missing values.


**Task 2**: Checking Data Type Validity

**Description**: Ensure that columns contain data of expected types, e.g., ages are integers.

In [12]:
import pandas as pd

# Example frame in which one Age entry ("Thirty") is text rather than a number
data = dict(
    Name=["Alice", "Bob", "Charlie", "David"],
    Age=[25, "Thirty", 30, 22],
    Email=["alice@example.com", "bob@example.com", "charlie@example.com", "david@example.com"],
)
df = pd.DataFrame(data)

# Python type that the values of each column are expected to have
expected_types = dict(Name=str, Age=int, Email=str)

def check_dtype(value, expected_type):
    """Report whether ``value`` is acceptable for ``expected_type``.

    Numeric types (``int``, ``float``) are validated by attempting the cast,
    so numeric strings such as ``"30"`` are accepted. Strings are validated
    with ``isinstance`` because ``str(value)`` succeeds for every object and
    therefore could never flag anything as invalid.

    Args:
        value: The cell value to check.
        expected_type: One of ``int``, ``float`` or ``str``.

    Returns:
        True if the value is valid for the expected type, False otherwise.
        Any other ``expected_type`` yields False.
    """
    if expected_type == str:
        return isinstance(value, str)

    if expected_type == int:
        # int() would silently truncate 25.7 to 25; a float with a fractional
        # part (or NaN / infinity) is not a valid integer.
        if isinstance(value, float) and not value.is_integer():
            return False
        caster = int
    elif expected_type == float:
        caster = float
    else:
        return False

    try:
        caster(value)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "invalid"; anything else (for example
        # KeyboardInterrupt) must propagate instead of being swallowed.
        return False
    return True

# Build the per-cell validity table, one boolean column per checked field
type_validation = pd.DataFrame(
    {
        column: df[column].apply(
            lambda cell, wanted=wanted_type: check_dtype(cell, wanted)
        )
        for column, wanted_type in expected_types.items()
    }
)

print("Data Type Validity per Cell (True=valid, False=invalid):")
print(type_validation)

# A row is invalid as soon as one of its cells fails its type check
row_is_valid = type_validation.all(axis=1)
invalid_rows = type_validation.loc[~row_is_valid]
print("\nRows with Invalid Data Types:")
print(df.loc[invalid_rows.index])

Data Type Validity per Cell (True=valid, False=invalid):
   Name    Age  Email
0  True   True   True
1  True  False   True
2  True   True   True
3  True   True   True

Rows with Invalid Data Types:
  Name     Age            Email
1  Bob  Thirty  bob@example.com


**Task 3**: Verify Uniqueness of Identifiers

**Description**: Check if a dataset has unique identifiers (e.g., emails).

In [13]:
import pandas as pd

# Two people share one address, so the uniqueness check has something to find
data = dict(
    Name=["Alice", "Bob", "Charlie", "David"],
    Email=["alice@example.com", "bob@example.com", "alice@example.com", "david@example.com"],
)
df = pd.DataFrame(data)

# Series.is_unique is True only when no value occurs more than once
is_unique = df["Email"].is_unique
print(f"Is 'Email' column unique? {is_unique}")

if is_unique:
    print("\nNo duplicates found in 'Email' column.")
else:
    # keep=False flags every occurrence of a repeated address, not only the later ones
    repeated = df["Email"].duplicated(keep=False)
    duplicates = df.loc[repeated]
    print("\nDuplicate Emails Found:")
    print(duplicates)

Is 'Email' column unique? False

Duplicate Emails Found:
      Name              Email
0    Alice  alice@example.com
2  Charlie  alice@example.com


Task 4: Validate Email Format Using Regex

Description: Validate if email addresses in a dataset have the correct format.

In [14]:
import pandas as pd
import re

# Example addresses: two well-formed, one without "@", one without a top-level domain
data = dict(
    Name=["Alice", "Bob", "Charlie", "David"],
    Email=["alice@example.com", "bob.example.com", "charlie@example", "david@example.com"],
)
df = pd.DataFrame(data)

# Simple regex pattern for email validation:
# local part, "@", domain labels, and a top-level domain of at least two letters
email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'

def validate_email(email):
    """Return True if ``email`` is a string that matches ``email_pattern``.

    Args:
        email: The value to validate. Non-string values such as None or NaN
            (how pandas represents a missing cell) are reported as invalid.

    Returns:
        bool: True when the whole string matches the pattern, else False.
    """
    # A missing cell is not a string; report it as invalid instead of
    # letting the regex call raise TypeError.
    if not isinstance(email, str):
        return False
    # fullmatch rather than match: "$" also matches just before a trailing
    # newline, so re.match would accept "user@example.com\n".
    return re.fullmatch(email_pattern, email) is not None

# Flag every address with the validator's verdict
email_column = df['Email']
df['Email_Valid'] = email_column.apply(validate_email)

print(df)

# Keep only the rows whose address failed validation
failed = ~df['Email_Valid']
invalid_emails = df.loc[failed]
print("\nInvalid Email Addresses:")
print(invalid_emails.loc[:, ['Name', 'Email']])

      Name              Email  Email_Valid
0    Alice  alice@example.com         True
1      Bob    bob.example.com        False
2  Charlie    charlie@example        False
3    David  david@example.com         True

Invalid Email Addresses:
      Name            Email
1      Bob  bob.example.com
2  Charlie  charlie@example


Task 5: Check for Logical Age Validity

Description: Ensure ages are within a reasonable human range (e.g., 0-120).

In [15]:
import pandas as pd

# Example ages including one negative value and one implausibly large value
data = dict(
    Name=["Alice", "Bob", "Charlie", "David"],
    Age=[25, -5, 130, 45],
)
df = pd.DataFrame(data)

# Inclusive bounds of the accepted age range
min_age, max_age = 0, 120

# Both comparisons are inclusive, so the bounds themselves count as valid
ages = df['Age']
df['Age_Valid'] = (ages >= min_age) & (ages <= max_age)

print(df)

# Rows whose age falls outside the accepted range
out_of_range = ~df['Age_Valid']
invalid_ages = df.loc[out_of_range]
print("\nRecords with Invalid Age:")
print(invalid_ages)

      Name  Age  Age_Valid
0    Alice   25       True
1      Bob   -5      False
2  Charlie  130      False
3    David   45       True

Records with Invalid Age:
      Name  Age  Age_Valid
1      Bob   -5      False
2  Charlie  130      False


Task 6: Identify and Handle Missing Data

Description: Identify missing values in a dataset and impute them using a simple strategy (e.g., mean).

In [16]:
import pandas as pd
import numpy as np

# Sample data with missing values
data = {
    "Name": ["Alice", "Bob", "Charlie", "David", "Eva"],
    "Age": [25, np.nan, 30, 45, np.nan],
    "Salary": [50000, 60000, np.nan, 80000, 70000]
}

df = pd.DataFrame(data)

print("Original Data:")
print(df)

# Step 1: Detect missing values
missing_counts = df.isnull().sum()
print("\nMissing values per column:")
print(missing_counts)

# Step 2: Impute missing numerical values using mean
# For simplicity, we impute only numeric columns
numeric_cols = df.select_dtypes(include='number').columns

for col in numeric_cols:
    # Series.mean() skips NaN by default, so the mean is taken over the observed values only
    mean_value = df[col].mean()
    # Assign the filled column back to the frame. Calling
    # df[col].fillna(..., inplace=True) operates on an intermediate Series:
    # pandas 2.x warns about it, and with Copy-on-Write enabled the frame
    # is left unchanged.
    df[col] = df[col].fillna(mean_value)

print("\nData after mean imputation:")
print(df)

Original Data:
      Name   Age   Salary
0    Alice  25.0  50000.0
1      Bob   NaN  60000.0
2  Charlie  30.0      NaN
3    David  45.0  80000.0
4      Eva   NaN  70000.0

Missing values per column:
Name      0
Age       2
Salary    1
dtype: int64

Data after mean imputation:
      Name        Age   Salary
0    Alice  25.000000  50000.0
1      Bob  33.333333  60000.0
2  Charlie  30.000000  65000.0
3    David  45.000000  80000.0
4      Eva  33.333333  70000.0


Task 7: Detect Duplicates

Description: Detect duplicate rows in the dataset.

In [17]:
import pandas as pd

# Example rows in which the Alice and Bob records each appear twice
data = dict(
    Name=["Alice", "Bob", "Charlie", "Alice", "Bob"],
    Age=[25, 30, 35, 25, 30],
    Salary=[50000, 60000, 70000, 50000, 60000],
)
df = pd.DataFrame(data)

print("Original Data:")
print(df)

# keep=False flags every member of a duplicate group, including its first occurrence
duplicates = df.duplicated(keep=False)

print("\nDuplicate Rows:")
print(df.loc[duplicates])

Original Data:
      Name  Age  Salary
0    Alice   25   50000
1      Bob   30   60000
2  Charlie   35   70000
3    Alice   25   50000
4      Bob   30   60000

Duplicate Rows:
    Name  Age  Salary
0  Alice   25   50000
1    Bob   30   60000
3  Alice   25   50000
4    Bob   30   60000


Task 8: Validate Correctness of Numerical Values

Description: Ensure the values in numerical columns fall within a specified range.

In [18]:
import pandas as pd

# Sample data
data = {
    "Name": ["Alice", "Bob", "Charlie", "David"],
    "Age": [25, -5, 130, 40],   # -5 and 130 are invalid ages
    "Salary": [50000, 60000, 70000, -1000]  # -1000 invalid salary
}

df = pd.DataFrame(data)

print("Original Data:")
print(df)

# Define valid ranges for numerical columns
valid_ranges = {
    "Age": (0, 120),
    "Salary": (0, None)  # None means no upper limit
}


def out_of_range_mask(values, min_val=None, max_val=None):
    """Flag the entries of ``values`` that fall outside ``[min_val, max_val]``.

    Args:
        values: pandas Series of numbers to check.
        min_val: Inclusive lower bound, or None for no lower limit.
        max_val: Inclusive upper bound, or None for no upper limit.

    Returns:
        Boolean Series sharing the index of ``values``; True marks an
        out-of-range entry.
    """
    # Start from an all-False mask built on the input's own index. A mask
    # created with a default RangeIndex would misalign with any frame whose
    # index is not 0..n-1 and could not be used to filter it.
    mask = pd.Series(False, index=values.index)
    if min_val is not None:
        mask |= values < min_val
    if max_val is not None:
        mask |= values > max_val
    return mask


# Check validity for each column
for col, (min_val, max_val) in valid_ranges.items():
    invalid = out_of_range_mask(df[col], min_val, max_val)

    print(f"\nInvalid values in column '{col}':")
    print(df[invalid])


Original Data:
      Name  Age  Salary
0    Alice   25   50000
1      Bob   -5   60000
2  Charlie  130   70000
3    David   40   -1000

Invalid values in column 'Age':
      Name  Age  Salary
1      Bob   -5   60000
2  Charlie  130   70000

Invalid values in column 'Salary':
    Name  Age  Salary
3  David   40   -1000


Task 9: Custom Completeness Rule Violation Report

Description: Create a report showing which rows violate specific completeness rules, such as mandatory fields being empty.

In [19]:
import pandas as pd

# Sample data with missing mandatory fields
data = {
    "Name": ["Alice", None, "Charlie", "David"],
    "Email": ["alice@example.com", "bob@example.com", None, "david@example.com"],
    "Phone": ["1234567890", None, "9876543210", ""],
    "Age": [25, 30, 35, 40]
}

df = pd.DataFrame(data)

# Define mandatory fields
mandatory_fields = ["Name", "Email", "Phone"]


def list_missing_fields(row, fields=None):
    """Name the fields of ``row`` that are null or an empty string.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) indexable by field name.
        fields: Field names to inspect; defaults to ``mandatory_fields``.

    Returns:
        str: The offending field names joined by ", " ("" if there are none).
    """
    if fields is None:
        fields = mandatory_fields
    return ", ".join(
        field for field in fields
        if pd.isnull(row[field]) or row[field] == ""
    )


def build_violation_report(frame, fields):
    """Return the rows of ``frame`` that miss at least one of ``fields``.

    A field counts as missing when it is null or an empty string. The result
    is a copy of the offending rows with an extra "Missing Fields" column
    listing the missing field names for each row.

    Args:
        frame: DataFrame to check.
        fields: List of column names that must be filled in.

    Returns:
        DataFrame: The violating rows plus the "Missing Fields" column; it
        has no rows when every field is filled in.
    """
    incomplete = (frame[fields].isnull() | (frame[fields] == "")).any(axis=1)
    report = frame[incomplete].copy()
    # Build the summary with a comprehension rather than
    # report.apply(list_missing_fields, axis=1): on a report with no rows,
    # apply returns a DataFrame instead of a Series and the column
    # assignment fails.
    report["Missing Fields"] = [
        list_missing_fields(row, fields) for _, row in report.iterrows()
    ]
    return report


# Create a boolean mask for rows violating completeness (missing mandatory fields)
violations_mask = df[mandatory_fields].isnull() | (df[mandatory_fields] == "")

# Identify rows where any mandatory field is missing or empty
rows_with_violations = violations_mask.any(axis=1)

# Violating rows plus a summary column listing which mandatory fields are missing
violation_report = build_violation_report(df, mandatory_fields)

print("Custom Completeness Rule Violation Report:")
print(violation_report)

Custom Completeness Rule Violation Report:
      Name              Email       Phone  Age Missing Fields
1     None    bob@example.com        None   30    Name, Phone
2  Charlie               None  9876543210   35          Email
3    David  david@example.com               40          Phone


Task 10: Advanced Regex for Data Validity Check

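Description: Validate a structured field against a stricter regex that encodes several rules at once. Here the field is a product code: three uppercase letters, a hyphen, four digits, and an optional `-X` or `-Y` suffix.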

In [20]:
import pandas as pd
import re

# Product codes covering the accepted shapes and one example of each rule violation
product_codes = [
    "ABC-1234",    # valid
    "XYZ-5678-X",  # valid
    "DEF-0000-Y",  # valid
    "abc-1234",    # invalid (lowercase letters)
    "ABCD-1234",   # invalid (4 letters instead of 3)
    "XYZ-56789",   # invalid (5 digits instead of 4)
    "XYZ-5678-Z",  # invalid (suffix Z not allowed)
    "XY-5678",     # invalid (only 2 letters)
    "ABC1234",     # invalid (missing hyphen)
]
data = {"Product_Code": product_codes}
df = pd.DataFrame(data)

# Define advanced regex pattern:
# three uppercase letters, a hyphen, four digits, then an optional "-X" or "-Y" suffix
pattern = r'^[A-Z]{3}-\d{4}(-[XY])?$'

# Function to check validity using regex
def check_product_code_validity(code):
    """Return True if ``code`` is a string that matches ``pattern``.

    Args:
        code: The value to validate. Non-string values such as None or NaN
            are reported as invalid.

    Returns:
        bool: True when the whole string matches the pattern, else False.
    """
    # A missing cell is not a string; report it as invalid instead of
    # letting the regex call raise TypeError.
    if not isinstance(code, str):
        return False
    # fullmatch rather than match: "$" also matches just before a trailing
    # newline, so re.match would accept "ABC-1234\n".
    return re.fullmatch(pattern, code) is not None

# Record the validator's verdict for every product code
df["Is_Valid"] = [
    check_product_code_validity(code) for code in df["Product_Code"]
]

print(df)

  Product_Code  Is_Valid
0     ABC-1234      True
1   XYZ-5678-X      True
2   DEF-0000-Y      True
3     abc-1234     False
4    ABCD-1234     False
5    XYZ-56789     False
6   XYZ-5678-Z     False
7      XY-5678     False
8      ABC1234     False
