**Task 1**: Checking Null Values for Completeness

**Description**: Verify if there are any null values in a dataset, which indicate incomplete data.

In [1]:
# Write your code from here
import pandas as pd

# Example: Load your dataset (replace 'your_dataset.csv' with your actual file)
# data = pd.read_csv('your_dataset.csv')

# For demonstration, let's create a sample dataframe with some nulls
data = pd.DataFrame({
    'Name': ['Alice', 'Bob', None, 'David'],
    'Age': [25, None, 30, 22],
    'Salary': [50000, 60000, 55000, None]
})

# Check for null values in each column
null_counts = data.isnull().sum()

print("Null values count per column:")
print(null_counts)

# Check if dataset has any null values overall
if data.isnull().values.any():
    print("\nThe dataset contains missing values.")
else:
    print("\nThe dataset has no missing values.")


Null values count per column:
Name      1
Age       1
Salary    1
dtype: int64

The dataset contains missing values.


**Task 2**: Checking Data Type Validity

**Description**: Ensure that columns contain data of expected types, e.g., ages are integers.

In [2]:
# Write your code from here
import pandas as pd

# Example dataset with mixed types
data = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 'Thirty', 30],   # 'Thirty' is invalid for age
    'Salary': [50000.0, 60000.0, 55000.0]
})

# Check current data types
print("Current data types:")
print(data.dtypes)

# Function to validate if a column can be converted to a specific type
def validate_column_type(df, column, expected_type):
    try:
        # Attempt conversion
        converted = df[column].astype(expected_type)
        print(f"Column '{column}' can be converted to {expected_type}.")
    except ValueError:
        print(f"Column '{column}' contains values that cannot be converted to {expected_type}.")

# Validate Age column for integer type
validate_column_type(data, 'Age', int)


Current data types:
Name       object
Age        object
Salary    float64
dtype: object
Column 'Age' contains values that cannot be converted to <class 'int'>.


**Task 3**: Verify Uniqueness of Identifiers

**Description**: Check if a dataset has unique identifiers (e.g., emails).

In [3]:
# Write your code from here
import pandas as pd

# Sample data with emails
data = pd.DataFrame({
    'Email': ['alice@example.com', 'bob@example.com', 'alice@example.com', 'charlie@example.com'],
    'Name': ['Alice', 'Bob', 'Alice2', 'Charlie']
})

# Check for duplicates in the identifier column
duplicates = data['Email'].duplicated()

print("Duplicate rows based on 'Email':")
print(data[duplicates])

# Check if all identifiers are unique
if data['Email'].is_unique:
    print("All emails are unique.")
else:
    print("There are duplicate emails in the dataset.")


Duplicate rows based on 'Email':
               Email    Name
2  alice@example.com  Alice2
There are duplicate emails in the dataset.


Task 4: Validate Email Format Using Regex

Description: Validate if email addresses in a dataset have the correct format.

In [4]:
# Write your code from here
import pandas as pd
import re

# Sample data
data = pd.DataFrame({
    'Email': ['alice@example.com', 'bob_at_example.com', 'charlie@example', 'david@example.org']
})

# Define email regex pattern
email_pattern = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')

# Function to validate email format
def validate_email(email):
    if pd.isna(email):
        return False  # or True if you consider missing emails valid
    return bool(email_pattern.match(email))

# Apply validation function to 'Email' column
data['Valid_Email'] = data['Email'].apply(validate_email)

print(data)


                Email  Valid_Email
0   alice@example.com         True
1  bob_at_example.com        False
2     charlie@example        False
3   david@example.org         True


Task 5: Check for Logical Age Validity

Description: Ensure ages are within a reasonable human range (e.g., 0-120).

In [5]:
# Write your code from here
import pandas as pd

# Sample data
data = pd.DataFrame({
    'Age': [25, 34, -5, 150, 45, None, 80]
})

# Define a function to check if age is valid
def is_age_valid(age):
    if pd.isna(age):
        return False  # or True if you consider missing age as valid
    return 0 <= age <= 120

# Apply the function to the Age column
data['Age_Valid'] = data['Age'].apply(is_age_valid)

print(data)


     Age  Age_Valid
0   25.0       True
1   34.0       True
2   -5.0      False
3  150.0      False
4   45.0       True
5    NaN      False
6   80.0       True


Task 6: Identify and Handle Missing Data

Description: Identify missing values in a dataset and impute them using a simple strategy (e.g., mean).

In [6]:
# Write your code from here
import pandas as pd
from sklearn.impute import SimpleImputer
import numpy as np

# Sample data with missing values
data = pd.DataFrame({
    'Age': [25, np.nan, 35, 40, np.nan, 60],
    'Salary': [50000, 60000, np.nan, 58000, 62000, np.nan]
})

print("Original Data with Missing Values:")
print(data)

# Identify missing values (True if missing)
missing_values = data.isnull()
print("\nMissing Values in Each Column:")
print(missing_values)

# Impute missing values using mean
imputer = SimpleImputer(strategy='mean')
data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

print("\nData after Imputation:")
print(data_imputed)


Original Data with Missing Values:
    Age   Salary
0  25.0  50000.0
1   NaN  60000.0
2  35.0      NaN
3  40.0  58000.0
4   NaN  62000.0
5  60.0      NaN

Missing Values in Each Column:
     Age  Salary
0  False   False
1   True   False
2  False    True
3  False   False
4   True   False
5  False    True

Data after Imputation:
    Age   Salary
0  25.0  50000.0
1  40.0  60000.0
2  35.0  57500.0
3  40.0  58000.0
4  40.0  62000.0
5  60.0  57500.0


Task 7: Detect Duplicates

Description: Detect duplicate rows in the dataset.

In [7]:
# Write your code from here
import pandas as pd

# Sample dataset with duplicates
data = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Alice', 'David', 'Bob'],
    'Age': [25, 30, 25, 40, 30],
    'Salary': [50000, 60000, 50000, 70000, 60000]
})

print("Original Data:")
print(data)

# Detect duplicate rows (returns a boolean Series)
duplicates = data.duplicated()

print("\nDuplicate Rows Detected (True indicates duplicate):")
print(duplicates)

# Optionally, show the duplicate rows only
duplicate_rows = data[data.duplicated()]
print("\nDuplicate Rows:")
print(duplicate_rows)


Original Data:
    Name  Age  Salary
0  Alice   25   50000
1    Bob   30   60000
2  Alice   25   50000
3  David   40   70000
4    Bob   30   60000

Duplicate Rows Detected (True indicates duplicate):
0    False
1    False
2     True
3    False
4     True
dtype: bool

Duplicate Rows:
    Name  Age  Salary
2  Alice   25   50000
4    Bob   30   60000


Task 8: Validate Correctness of Numerical Values

Description: Ensure numerical columns are within a specified range.

In [8]:
# Write your code from here
import pandas as pd

# Sample data
data = pd.DataFrame({
    'Age': [25, 30, -5, 150, 45],
    'Salary': [50000, 60000, 70000, 80000, -1000]
})

print("Original Data:")
print(data)

# Define valid ranges for columns
valid_ranges = {
    'Age': (0, 120),
    'Salary': (0, 200000)
}

# Function to check if values are within the range
def validate_column(df, col, min_val, max_val):
    invalid = ~df[col].between(min_val, max_val)
    if invalid.any():
        print(f"\nInvalid values detected in column '{col}':")
        print(df.loc[invalid, col])
    else:
        print(f"\nAll values in column '{col}' are within the valid range.")

# Validate each numerical column
for col, (min_val, max_val) in valid_ranges.items():
    validate_column(data, col, min_val, max_val)


Original Data:
   Age  Salary
0   25   50000
1   30   60000
2   -5   70000
3  150   80000
4   45   -1000

Invalid values detected in column 'Age':
2     -5
3    150
Name: Age, dtype: int64

Invalid values detected in column 'Salary':
4   -1000
Name: Salary, dtype: int64


Task 9: Custom Completeness Rule Violation Report

Description: Create a report showing which rows violate specific completeness rules, such as mandatory fields being empty.

In [9]:
# Write your code from here
import pandas as pd

# Sample dataset
data = pd.DataFrame({
    'Name': ['Alice', 'Bob', '', 'David', 'Eve'],
    'Email': ['alice@example.com', '', 'charlie@example.com', 'david@example.com', None],
    'Age': [25, 30, 22, None, 28],
    'Country': ['USA', 'USA', '', 'UK', 'Canada']
})

# Define mandatory fields
mandatory_fields = ['Name', 'Email', 'Age']

# Function to check completeness violations per row
def completeness_violations(df, mandatory_cols):
    # Create a boolean DataFrame where True indicates missing/empty values in mandatory fields
    violations = df[mandatory_cols].isnull() | (df[mandatory_cols] == '')  # check NaN or empty string
    
    # Any row with any violation?
    rows_with_violations = violations.any(axis=1)
    
    # Extract rows with violations
    violation_report = df.loc[rows_with_violations].copy()
    
    # Add columns specifying which mandatory fields are missing per row
    for col in mandatory_cols:
        violation_report[f'{col}_missing'] = violations[col]
    
    return violation_report

# Generate the report
report = completeness_violations(data, mandatory_fields)

print("Completeness Rule Violation Report:")
print(report)


Completeness Rule Violation Report:
    Name                Email   Age Country  Name_missing  Email_missing  \
1    Bob                       30.0     USA         False           True   
2         charlie@example.com  22.0                  True          False   
3  David    david@example.com   NaN      UK         False          False   
4    Eve                 None  28.0  Canada         False           True   

   Age_missing  
1        False  
2        False  
3         True  
4        False  


Task 10: Advanced Regex for Data Validity Check

Description: Check for validity with advanced regex patterns, such as validating complex fields with multi-level rules.

In [10]:
# Write your code from here
import pandas as pd
import re

# Sample dataset
data = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Phone': [
        '+1-800-555-1234 ext.123',   # valid with country code and extension
        '(800) 555-5678',            # valid with parentheses area code
        '800.555.8765',              # valid with dots separator
        '123-45-6789'                # invalid format
    ]
})

# Advanced regex pattern for US phone numbers:
# Matches optional +1 country code, optional parentheses around area code,
# separators like space, dash, dot, and optional extensions like ext, x, or extension.
phone_pattern = re.compile(
    r'^(\+1[-\s.]?)?(\(?\d{3}\)?[-\s.]?)?\d{3}[-\s.]?\d{4}(\s*(ext|x|extension)\s*\d+)?$', 
    re.IGNORECASE
)

# Function to validate phone numbers using regex
def validate_phone(phone):
    if pd.isna(phone):
        return False
    return bool(phone_pattern.match(phone.strip()))

# Apply validation to the 'Phone' column
data['Phone_Valid'] = data['Phone'].apply(validate_phone)

print(data)


      Name                    Phone  Phone_Valid
0    Alice  +1-800-555-1234 ext.123        False
1      Bob           (800) 555-5678         True
2  Charlie             800.555.8765         True
3    David              123-45-6789        False
