**Task 1**: Checking Null Values for Completeness

**Description**: Verify if there are any null values in a dataset, which indicate incomplete data.

In [11]:
import pandas as pd

def check_null_values(df):
    """
    Returns columns with their respective count of null values.
    
    Parameters:
    - df: pandas DataFrame

    Returns:
    - Dictionary of column names with the number of nulls
    """
    null_counts = df.isnull().sum()
    return null_counts[null_counts > 0].to_dict()


# Example usage
data = {
    'name': ['Alice', 'Bob', None],
    'age': [25, None, 30],
    'email': ['a@example.com', 'b@example.com', None]
}
df = pd.DataFrame(data)
nulls = check_null_values(df)
print("Null Values:", nulls)

Null Values: {'name': 1, 'age': 1, 'email': 1}


**Task 2**: Checking Data Type Validity

**Description**: Ensure that columns contain data of expected types, e.g., ages are integers.

In [12]:
# Write your code from here
def check_data_types(df, expected_types):
    """
    Checks if the columns in df match the expected data types.

    Parameters:
    - df: pandas DataFrame
    - expected_types: dictionary with column names and expected types (e.g., {'age': int})

    Returns:
    - Dictionary with columns that don't match the expected type
    """
    mismatches = {}

    for col, expected_type in expected_types.items():
        if col in df.columns:
            # Use apply to check type of each entry
            invalid_entries = df[~df[col].apply(lambda x: isinstance(x, expected_type))]
            if not invalid_entries.empty:
                mismatches[col] = len(invalid_entries)

    return mismatches


# Example usage
expected_types = {'age': int, 'name': str}
type_issues = check_data_types(df, expected_types)
print("Type Mismatches:", type_issues)

Type Mismatches: {'age': 3, 'name': 1}


**Task 3**: Verify Uniqueness of Identifiers

**Description**: Check if a dataset has unique identifiers (e.g., emails).

In [13]:
# Write your code from here
import pandas as pd

def check_unique_identifiers(df, id_column):
    """
    Check if the values in the specified identifier column are unique.

    Parameters:
    - df: pandas DataFrame
    - id_column: name of the column to check for uniqueness

    Returns:
    - Boolean indicating if identifiers are unique
    - List of duplicate values (if any)
    """
    duplicates = df[df.duplicated(subset=id_column, keep=False)]
    return duplicates.empty, duplicates[id_column].tolist()


# Example usage
data = {
    'email': ['a@example.com', 'b@example.com', 'a@example.com'],
    'name': ['Alice', 'Bob', 'Alice']
}
df = pd.DataFrame(data)
is_unique, duplicates = check_unique_identifiers(df, 'email')
print("Are emails unique?", is_unique)
print("Duplicate Emails:", duplicates)

Are emails unique? False
Duplicate Emails: ['a@example.com', 'a@example.com']


Task 4: Validate Email Format Using Regex

Description: Validate if email addresses in a dataset have the correct format.

In [14]:
# Write your code from here
import re

def validate_email_format(df, email_column):
    """
    Validates email format using regex.

    Parameters:
    - df: pandas DataFrame
    - email_column: column name containing email addresses

    Returns:
    - List of invalid emails
    """
    email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
    
    invalid_emails = df[~df[email_column].astype(str).str.match(email_pattern)]
    return invalid_emails[email_column].tolist()


# Example usage
invalids = validate_email_format(df, 'email')
print("Invalid Emails:", invalids)

Invalid Emails: []


Task 5: Check for Logical Age Validity

Description: Ensure ages are within a reasonable human range (e.g., 0-120).

In [15]:
# Write your code from here
def check_logical_age_range(df, age_column, min_age=0, max_age=120):
    """
    Check if ages fall within a reasonable human range.

    Parameters:
    - df: pandas DataFrame
    - age_column: column name for age
    - min_age: minimum valid age
    - max_age: maximum valid age

    Returns:
    - List of invalid age entries
    """
    invalid_ages = df[(df[age_column] < min_age) | (df[age_column] > max_age)]
    return invalid_ages[age_column].tolist()


# Example usage
import pandas as pd

data = {
    'name': ['Alice', 'Bob', 'Charlie'],
    'age': [25, 135, -5]
}
df = pd.DataFrame(data)

invalid_ages = check_logical_age_range(df, 'age')
print("Invalid Ages:", invalid_ages)

Invalid Ages: [135, -5]


Task 6: Identify and Handle Missing Data

Description: Identify missing values in a dataset and impute them using a simple strategy (e.g., mean).

In [16]:

def impute_missing_with_mean(df, column_name):
    """
    Imputes missing values in a specified column using the mean.

    Parameters:
    - df: pandas DataFrame
    - column_name: name of the column to impute

    Returns:
    - Updated DataFrame with imputed values
    """
    if df[column_name].isnull().any():
        mean_value = df[column_name].mean()
        df[column_name].fillna(mean_value, inplace=True)
    return df


# Example usage
df_with_nan = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie'],
    'age': [25, None, 40]
})

print("Before Imputation:\n", df_with_nan)
df_imputed = impute_missing_with_mean(df_with_nan, 'age')
print("After Imputation:\n", df_imputed)

Before Imputation:
       name   age
0    Alice  25.0
1      Bob   NaN
2  Charlie  40.0
After Imputation:
       name   age
0    Alice  25.0
1      Bob  32.5
2  Charlie  40.0


Task 7: Detect Duplicates

Description: Detect duplicate rows in the dataset.

In [17]:
# Write your code from here
import pandas as pd

def detect_duplicates(df):
    """
    Detects duplicate rows in the DataFrame.

    Parameters:
    - df: pandas DataFrame

    Returns:
    - DataFrame containing duplicate rows (if any)
    """
    duplicates = df[df.duplicated(keep=False)]
    return duplicates


# Example usage
data = {
    'name': ['Alice', 'Bob', 'Alice'],
    'age': [25, 30, 25]
}
df = pd.DataFrame(data)

duplicate_rows = detect_duplicates(df)
print("Duplicate Rows:\n", duplicate_rows)

Duplicate Rows:
     name  age
0  Alice   25
2  Alice   25


Task 8: Validate Correctness of Numerical Values

Description: Ensure numerical columns are within a specified range.

In [18]:
# Write your code from here
def validate_numerical_range(df, column, min_value, max_value):
    """
    Checks if numerical values fall within a specified range.

    Parameters:
    - df: pandas DataFrame
    - column: column name to validate
    - min_value: minimum valid value
    - max_value: maximum valid value

    Returns:
    - List of out-of-range values
    """
    out_of_range = df[(df[column] < min_value) | (df[column] > max_value)]
    return out_of_range[column].tolist()


# Example usage
df = pd.DataFrame({
    'temperature': [22, 105, -3, 37]
})

invalid_values = validate_numerical_range(df, 'temperature', 0, 100)
print("Invalid Numerical Values:", invalid_values)

Invalid Numerical Values: [105, -3]


Task 9: Custom Completeness Rule Violation Report

Description: Create a report showing which rows violate specific completeness rules, such as mandatory fields being empty.

In [19]:
# Write your code from here
import pandas as pd

def completeness_violation_report(df, mandatory_fields):
    """
    Generate a report of rows violating completeness rules (i.e., missing mandatory fields).

    Parameters:
    - df: pandas DataFrame
    - mandatory_fields: list of field names that must not be null or empty

    Returns:
    - DataFrame containing rows with missing mandatory field values
    """
    violations = df[df[mandatory_fields].isnull().any(axis=1)]
    return violations


# Example usage
data = {
    'name': ['Alice', 'Bob', None],
    'email': ['a@example.com', None, 'c@example.com'],
    'age': [25, 30, None]
}
df = pd.DataFrame(data)

mandatory_fields = ['name', 'email']
violation_df = completeness_violation_report(df, mandatory_fields)
print("Completeness Violations:\n", violation_df)

Completeness Violations:
    name          email   age
1   Bob           None  30.0
2  None  c@example.com   NaN


Task 10: Advanced Regex for Data Validity Check

Description: Check for validity with advanced regex patterns, such as validating complex fields with multi-level rules.

In [20]:
# Write your code from here
import re

def validate_advanced_regex(df, column, pattern):
    """
    Validate entries in a column using an advanced regex pattern.

    Parameters:
    - df: pandas DataFrame
    - column: column to validate
    - pattern: regex pattern string

    Returns:
    - List of invalid values
    """
    invalid_entries = df[~df[column].astype(str).str.match(pattern, na=False)]
    return invalid_entries[column].tolist()


# Example usage: Validate strong password format (at least 1 uppercase, 1 lowercase, 1 number, 8+ characters)
df = pd.DataFrame({
    'password': ['abc123', 'Secure1Password', '12345678', 'weakPass', 'Strong@2024']
})

regex_pattern = r'^(?=.*[a-z])(?=.*[A-Z])(?=.*\d).{8,}$'
invalid_passwords = validate_advanced_regex(df, 'password', regex_pattern)
print("Invalid Passwords:", invalid_passwords)

Invalid Passwords: ['abc123', '12345678', 'weakPass']


In [25]:
import pandas as pd

# Create sample dataset
data = {
    'user_id': [101, 102, 103, 104, 105, 103],  # 103 is duplicated
    'name': ['Alice', 'Bob', 'Charlie', None, 'Eve', 'Charlie'],  # Missing name
    'age': [25, 17, 130, 45, None, 130],  # 130 is out of range, None is missing
    'email': ['alice@example.com', 'bob#example.com', 'charlie@mail.com',
              'dave@domain', 'eve@example.com', 'charlie@mail.com'],  # Invalid emails
    'password': ['Password123', 'pass', 'Secret12', '12345678', None, 'Secret12'],  # Invalid/missing passwords
    'income': [55000.0, 'not available', 72000.5, 48000.0, 51000.0, 72000.5]  # Data type issue
}

sample_df = pd.DataFrame(data)

# Show the DataFrame
print(sample_df)


   user_id     name    age              email     password         income
0      101    Alice   25.0  alice@example.com  Password123        55000.0
1      102      Bob   17.0    bob#example.com         pass  not available
2      103  Charlie  130.0   charlie@mail.com     Secret12        72000.5
3      104     None   45.0        dave@domain     12345678        48000.0
4      105      Eve    NaN    eve@example.com         None        51000.0
5      103  Charlie  130.0   charlie@mail.com     Secret12        72000.5


In [26]:
import pandas as pd
import re

class DataValidator:
    def __init__(self, df):
        if not isinstance(df, pd.DataFrame):
            raise ValueError("Input must be a pandas DataFrame.")
        self.df = df

    def check_null_values(self):
        """Task 1: Returns dictionary of columns with count of null values."""
        null_counts = self.df.isnull().sum()
        return null_counts[null_counts > 0].to_dict()

    def check_data_types(self, expected_types):
        """Task 2: Returns columns that don't match expected data types."""
        mismatches = {}
        for col, expected_type in expected_types.items():
            if col in self.df.columns:
                invalid_entries = self.df[~self.df[col].apply(lambda x: isinstance(x, expected_type))]
                if not invalid_entries.empty:
                    mismatches[col] = len(invalid_entries)
        return mismatches

    def check_unique_identifiers(self, id_column):
        """Task 3: Returns (is_unique, list of duplicates) for an ID column."""
        if id_column not in self.df.columns:
            raise KeyError(f"'{id_column}' not found in DataFrame")
        duplicates = self.df[self.df.duplicated(subset=id_column, keep=False)]
        return duplicates.empty, duplicates[id_column].tolist()

    def validate_email_format(self, email_column):
        """Task 4: Returns list of invalid email addresses."""
        if email_column not in self.df.columns:
            raise KeyError(f"'{email_column}' not found in DataFrame")
        email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
        invalids = self.df[~self.df[email_column].astype(str).str.match(email_pattern, na=False)]
        return invalids[email_column].tolist()

    def check_logical_age_range(self, age_column, min_age=0, max_age=120):
        """Task 5: Returns list of ages not in logical range (0–120)."""
        if age_column not in self.df.columns:
            raise KeyError(f"'{age_column}' not found in DataFrame")
        invalid = self.df[(self.df[age_column] < min_age) | (self.df[age_column] > max_age)]
        return invalid[age_column].tolist()

    def impute_missing_with_mean(self, column_name):
        """Task 6: Fills missing values in a numeric column with the mean."""
        if column_name not in self.df.columns:
            raise KeyError(f"'{column_name}' not found in DataFrame")
        if self.df[column_name].isnull().any():
            mean_val = self.df[column_name].mean()
            self.df[column_name].fillna(mean_val, inplace=True)
        return self.df

    def detect_duplicates(self):
        """Task 7: Returns DataFrame of duplicate rows."""
        return self.df[self.df.duplicated(keep=False)]

    def validate_numerical_range(self, column, min_value, max_value):
        """Task 8: Returns list of numerical values outside given range."""
        if column not in self.df.columns:
            raise KeyError(f"'{column}' not found in DataFrame")
        invalid = self.df[(self.df[column] < min_value) | (self.df[column] > max_value)]
        return invalid[column].tolist()

    def completeness_violation_report(self, mandatory_fields):
        """Task 9: Returns rows with nulls in any of the mandatory fields."""
        missing_fields = [f for f in mandatory_fields if f not in self.df.columns]
        if missing_fields:
            raise KeyError(f"Missing columns in DataFrame: {missing_fields}")
        return self.df[self.df[mandatory_fields].isnull().any(axis=1)]

    def validate_advanced_regex(self, column, pattern):
        """Task 10: Returns list of values not matching an advanced regex pattern."""
        if column not in self.df.columns:
            raise KeyError(f"'{column}' not found in DataFrame")
        invalid = self.df[~self.df[column].astype(str).str.match(pattern, na=False)]
        return invalid[column].tolist()

# =============================
# 📌 Example usage and testing
# =============================

if __name__ == "__main__":
    # Sample DataFrame
    data = {
        'name': ['Alice', 'Bob', None],
        'age': [25, None, 135],
        'email': ['a@example.com', 'b@example.com', 'bademail@'],
        'password': ['abc123', 'Secure1Password', '12345678']
    }
    df = pd.DataFrame(data)

    # Initialize validator
    validator = DataValidator(df)

    # Task-wise example outputs
    print("1. Null Values:", validator.check_null_values())
    print("2. Data Type Mismatches:", validator.check_data_types({'age': int, 'name': str}))
    print("3. Unique Emails Check:", validator.check_unique_identifiers('email'))
    print("4. Invalid Emails:", validator.validate_email_format('email'))
    print("5. Illogical Ages:", validator.check_logical_age_range('age'))
    print("6. Imputed Age Column:\n", validator.impute_missing_with_mean('age'))
    print("7. Duplicate Rows:\n", validator.detect_duplicates())
    print("8. Out-of-Range Ages:", validator.validate_numerical_range('age', 0, 120))
    print("9. Completeness Violations:\n", validator.completeness_violation_report(['name', 'email']))
    print("10. Invalid Passwords:", validator.validate_advanced_regex(
        'password', r'^(?=.*[a-z])(?=.*[A-Z])(?=.*\d).{8,}$'
    ))


1. Null Values: {'name': 1, 'age': 1}
2. Data Type Mismatches: {'age': 3, 'name': 1}
3. Unique Emails Check: (True, [])
4. Invalid Emails: ['bademail@']
5. Illogical Ages: [135.0]
6. Imputed Age Column:
     name    age          email         password
0  Alice   25.0  a@example.com           abc123
1    Bob   80.0  b@example.com  Secure1Password
2   None  135.0      bademail@         12345678
7. Duplicate Rows:
 Empty DataFrame
Columns: [name, age, email, password]
Index: []
8. Out-of-Range Ages: [135.0]
9. Completeness Violations:
    name    age      email  password
2  None  135.0  bademail@  12345678
10. Invalid Passwords: ['abc123', '12345678']
