**Task 1**: Checking Null Values for Completeness

**Description**: Verify if there are any null values in a dataset, which indicate incomplete data.

In [1]:
import pandas as pd

def check_null_values(csv_file):
    """
    Checks for null values in a dataset and prints the count of null values for each column.

    Args:
        csv_file (str): Path to the CSV file.

    Returns:
        pandas.DataFrame: A DataFrame containing the count of null values for each column,
                          or None if an error occurs.
    """
    try:
        # Read the CSV file into a Pandas DataFrame
        df = pd.read_csv(csv_file)
    except FileNotFoundError:
        print(f"Error: File not found at {csv_file}")
        return None
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return None

    # Get the count of null values for each column
    null_value_counts = df.isnull().sum()

    # If there are no null values, return a message
    if null_value_counts.sum() == 0:
        print("No null values found in the dataset.")
        return None

    # Create a DataFrame to display the null value counts
    null_value_df = pd.DataFrame({
        'Column Name': null_value_counts.index,
        'Null Count': null_value_counts.values
    })

    return null_value_df



def main():
    """
    Main function to run the null value check and print the results.
    """
    # Provide the path to your CSV file
    csv_file = 'data_with_nulls.csv'  # Replace with your actual file path

    # Create a dummy CSV file with null values for demonstration
    try:
        with open(csv_file, 'w') as f:
            f.write("col1,col2,col3\n1,4,7\n2,,8\n3,5,\n,6,9")
    except FileExistsError:
        pass


    # Check for null values
    null_value_df = check_null_values(csv_file)

    # Print the results
    if null_value_df is not None:
        print("Null Value Counts:")
        print(null_value_df.to_string(index=False))



if __name__ == "__main__":
    main()


Null Value Counts:
Column Name  Null Count
       col1           1
       col2           1
       col3           1


**Task 2**: Checking Data Type Validity

**Description**: Ensure that columns contain data of expected types, e.g., ages are integers.

In [2]:
import pandas as pd

def check_data_types(csv_file, expected_types):
    """
    Checks if columns in a dataset contain data of the expected types.

    Args:
        csv_file (str): Path to the CSV file.
        expected_types (dict): A dictionary where keys are column names and values are
            the expected data types (e.g., {'age': int, 'name': str, 'salary': float}).

    Returns:
        dict: A dictionary where keys are column names, and values are either:
            - True: if all values in the column match the expected data type,
            - False: if any value in the column does not match the expected data type,
            - A string error message: If a column is not found.
    """
    try:
        # Read the CSV file into a Pandas DataFrame
        df = pd.read_csv(csv_file)
    except FileNotFoundError:
        print(f"Error: File not found at {csv_file}")
        return None
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return None

    results = {}
    for column, expected_type in expected_types.items():
        if column not in df.columns:
            results[column] = f"Column '{column}' not found."
            continue  # Move to the next column

        # Get the actual data type of the column
        actual_type = df[column].dtype

        # Check for type match.  For numeric, allow both int and float
        if expected_type in (int, float) and pd.api.types.is_numeric_dtype(actual_type):
            results[column] = True
        elif actual_type == expected_type:
            results[column] = True
        else:
            results[column] = False

    return results



def main():
    """
    Main function to run the data type check and print the results.
    """
    # Provide the path to your CSV file
    csv_file = 'data_types.csv'  # Replace with your actual file path

    # Define the expected data types for the columns
    expected_types = {
        'name': str,
        'age': int,
        'salary': float,
        'city': str,
        'is_active': bool
    }

    # Create a dummy CSV file for demonstration
    try:
        with open(csv_file, 'w') as f:
            f.write("name,age,salary,city,is_active\nAlice,30,50000.00,New York,True\nBob,25,60000.50,Los Angeles,False\nCharlie,35,75000,Chicago,True\nDavid,40,100000,Houston,False\nEve,22,45000,Miami,True\nFrank,28,55000,Dallas,True\n")
    except FileExistsError:
        pass

    # Check data types
    results = check_data_types(csv_file, expected_types)

    # Print the results
    if results is not None:
        print("Data Type Check Results:")
        for column, result in results.items():
            if isinstance(result, str):
                print(f"{column}: {result}")  # Print the error message
            else:
                print(f"{column}: {'Valid' if result else 'Invalid'}")



if __name__ == "__main__":
    main()


Data Type Check Results:
name: Invalid
age: Valid
salary: Valid
city: Invalid
is_active: Valid


**Task 3**: Verify Uniqueness of Identifiers

**Description**: Check if a dataset has unique identifiers (e.g., emails).

In [3]:
import pandas as pd

def check_unique_identifiers(csv_file, id_column):
    """
    Checks if a specified column in a dataset has unique identifiers.

    Args:
        csv_file (str): Path to the CSV file.
        id_column (str): Name of the column to check for uniqueness (e.g., 'email', 'customer_id').

    Returns:
        bool: True if all values in the specified column are unique, False otherwise.
        pandas.Series: A Series containing the duplicate values, if any.  Returns an empty Series if no duplicates.
                        Returns None if there are errors
    """
    try:
        # Read the CSV file into a Pandas DataFrame
        df = pd.read_csv(csv_file)
    except FileNotFoundError:
        print(f"Error: File not found at {csv_file}")
        return None, None
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return None, None

    # Check if the specified column exists
    if id_column not in df.columns:
        print(f"Error: Column '{id_column}' not found in the file.")
        return None, None

    # Check for duplicates
    duplicates = df[id_column].duplicated(keep=False) #keep=False marks all duplicates as True
    has_duplicates = duplicates.any()
    duplicate_values = df[id_column][duplicates] # Extract the duplicate values

    return not has_duplicates, duplicate_values



def main():
    """
    Main function to run the unique identifier check and print the results.
    """
    # Provide the path to your CSV file
    csv_file = 'customer_data.csv'  # Replace with your actual file path

    # Specify the column to check for uniqueness
    id_column = 'email'  # You can change this to 'customer_id' or any other identifier column

    # Create a dummy CSV file for demonstration
    try:
        with open(csv_file, 'w') as f:
            f.write("customer_id,name,email\n1,Alice,alice@example.com\n2,Bob,bob@example.com\n3,Charlie,charlie@example.com\n4,David,david@example.com\n5,Eve,alice@example.com\n6,Frank,bob@example.com") #Alice and Bob's email are duplicated
    except FileExistsError:
        pass

    # Check for unique identifiers
    is_unique, duplicate_values = check_unique_identifiers(csv_file, id_column)

    # Print the results
    if is_unique is not None:
        if is_unique:
            print(f"All values in column '{id_column}' are unique.")
        else:
            print(f"Column '{id_column}' contains duplicate values.")
            print("Duplicate values:")
            print(duplicate_values.to_string(index=False))
    else:
        print("An error occurred while checking for unique identifiers.")



if __name__ == "__main__":
    main()


Column 'email' contains duplicate values.
Duplicate values:
alice@example.com
  bob@example.com
alice@example.com
  bob@example.com


Task 4: Validate Email Format Using Regex

Description: Validate if email addresses in a dataset have the correct format.

In [4]:
import pandas as pd
import re

def validate_email_format(csv_file, email_column):
    """
    Validates email addresses in a dataset using a regular expression pattern.

    Args:
        csv_file (str): Path to the CSV file.
        email_column (str): Name of the column containing email addresses.

    Returns:
        pandas.DataFrame: A DataFrame with the original email addresses and a boolean 'is_valid' column,
                          indicating whether each email is valid (True) or invalid (False).
                          Returns None if there are errors.
    """
    try:
        # Read the CSV file into a Pandas DataFrame
        df = pd.read_csv(csv_file)
    except FileNotFoundError:
        print(f"Error: File not found at {csv_file}")
        return None
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return None

    # Check if the email column exists
    if email_column not in df.columns:
        print(f"Error: Email column '{email_column}' not found in the file.")
        return None

    # Basic regex pattern for email validation.
    # It checks for:
    # - One or more characters (excluding spaces and @) before the @ symbol
    # - An @ symbol
    # - One or more characters (excluding spaces) after the @ symbol
    # - A dot (.)
    # - Two or more characters (excluding spaces) after the dot
    email_pattern = r"^[^\s@]+@[^\s@]+\.[^\s@]{2,}$"

    # Apply the regex pattern to validate email addresses
    is_valid = df[email_column].str.match(email_pattern)

    # Create a new DataFrame with the original emails and the validation results
    results_df = pd.DataFrame({
        email_column: df[email_column],
        'is_valid': is_valid
    })

    return results_df



def main():
    """
    Main function to run the email format validation and print the results.
    """
    # Provide the path to your CSV file
    csv_file = 'customer_emails.csv'  # Replace with your actual file path

    # Specify the column containing email addresses
    email_column = 'email'

    # Create a dummy CSV file for demonstration
    try:
        with open(csv_file, 'w') as f:
            f.write("customer_id,name,email\n1,Alice,alice@example.com\n2,Bob,invalid-email\n3,Charlie,charlie@test.org\n4,David,missing@dotcom\n5,Eve,eve@sub.domain.com\n6,Frank,frank.last@domain.co.in\n")
    except FileExistsError:
        pass

    # Validate email formats
    results_df = validate_email_format(csv_file, email_column)

    # Print the results
    if results_df is not None:
        print("Email Format Validation Results:")
        print(results_df.to_string(index=False))



if __name__ == "__main__":
    main()


Email Format Validation Results:
                  email  is_valid
      alice@example.com      True
          invalid-email     False
       charlie@test.org      True
         missing@dotcom     False
     eve@sub.domain.com      True
frank.last@domain.co.in      True


Task 5: Check for Logical Age Validity

Description: Ensure ages are within a reasonable human range (e.g., 0-120).

In [5]:
import pandas as pd

def validate_age_range(csv_file, age_column, min_age=0, max_age=120):
    """
    Checks if ages in a dataset are within a specified range.

    Args:
        csv_file (str): Path to the CSV file.
        age_column (str): Name of the column containing ages.
        min_age (int, optional): The minimum valid age (inclusive). Defaults to 0.
        max_age (int, optional): The maximum valid age (inclusive). Defaults to 120.

    Returns:
        pandas.DataFrame: A DataFrame with the original ages and a boolean 'is_valid' column,
                          indicating whether each age is within the valid range (True) or not (False).
                          Returns None if there are errors.
    """
    try:
        # Read the CSV file into a Pandas DataFrame
        df = pd.read_csv(csv_file)
    except FileNotFoundError:
        print(f"Error: File not found at {csv_file}")
        return None
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return None

    # Check if the age column exists
    if age_column not in df.columns:
        print(f"Error: Age column '{age_column}' not found in the file.")
        return None

    # Validate ages against the specified range
    is_valid = (df[age_column] >= min_age) & (df[age_column] <= max_age)

    # Create a new DataFrame with the original ages and the validation results
    results_df = pd.DataFrame({
        age_column: df[age_column],
        'is_valid': is_valid
    })

    return results_df



def main():
    """
    Main function to run the age validity check and print the results.
    """
    # Provide the path to your CSV file
    csv_file = 'customer_ages.csv'  # Replace with your actual file path

    # Specify the column containing ages
    age_column = 'age'

    # Define the valid age range
    min_age = 0
    max_age = 120

    # Create a dummy CSV file for demonstration
    try:
        with open(csv_file, 'w') as f:
            f.write("customer_id,name,age\n1,Alice,30\n2,Bob,150\n3,Charlie,25\n4,David,-5\n5,Eve,80\n6,Frank,60\n")
    except FileExistsError:
        pass

    # Validate ages
    results_df = validate_age_range(csv_file, age_column, min_age, max_age)

    # Print the results
    if results_df is not None:
        print("Age Validation Results:")
        print(results_df.to_string(index=False))



if __name__ == "__main__":
    main()


Age Validation Results:
 age  is_valid
  30      True
 150     False
  25      True
  -5     False
  80      True
  60      True


Task 6: Identify and Handle Missing Data

Description: Identify missing values in a dataset and impute them using a simple strategy (e.g., mean).

In [6]:
import pandas as pd
import numpy as np

def handle_missing_data(csv_file, strategy='mean', columns_to_impute=None):
    """
    Identifies missing values in a dataset and imputes them using a specified strategy.

    Args:
        csv_file (str): Path to the CSV file.
        strategy (str, optional): The imputation strategy.  Valid values are 'mean', 'median', or 'constant'.
            Defaults to 'mean'.
        columns_to_impute (list, optional): A list of column names to impute. If None, impute all numeric columns.
        constant_value (int or float, optional): The constant value to use for imputation with strategy='constant'.
            Required if strategy='constant'.

    Returns:
        pandas.DataFrame: A new DataFrame with missing values imputed, or None if an error occurs.
    """
    try:
        # Read the CSV file into a Pandas DataFrame
        df = pd.read_csv(csv_file)
    except FileNotFoundError:
        print(f"Error: File not found at {csv_file}")
        return None
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return None

    # Handle the case where columns_to_impute is None
    if columns_to_impute is None:
        # Impute only numeric columns
        numeric_df = df.select_dtypes(include=np.number)
        columns_to_impute = numeric_df.columns.tolist()

    # Check if the strategy is valid
    if strategy not in ['mean', 'median', 'constant']:
        print(f"Error: Invalid imputation strategy '{strategy}'.  Must be 'mean', 'median', or 'constant'.")
        return None

    # Check if columns_to_impute are in the DataFrame
    for col in columns_to_impute:
        if col not in df.columns:
            print(f"Error: Column '{col}' not found in the file.")
            return None

    # Impute the missing values
    df_imputed = df.copy()  # Create a copy to avoid modifying the original DataFrame
    for col in columns_to_impute:
        if strategy == 'mean':
            df_imputed[col] = df_imputed[col].fillna(df_imputed[col].mean())
        elif strategy == 'median':
            df_imputed[col] = df_imputed[col].fillna(df_imputed[col].median())
        elif strategy == 'constant':
            if 'constant_value' not in locals():
                print("Error: constant_value must be provided when strategy is 'constant'.")
                return None
            df_imputed[col] = df_imputed[col].fillna(constant_value)

    return df_imputed



def main():
    """
    Main function to run the missing data imputation and print the results.
    """
    # Provide the path to your CSV file
    csv_file = 'data_with_missing.csv'  # Replace with your actual file path

    # Create a dummy CSV file with missing values for demonstration
    try:
        with open(csv_file, 'w') as f:
            f.write("col1,col2,col3,col4\n1,4,7,a\n2,,8,b\n3,5,,c\n,6,9,d\n5,,10,e")
    except FileExistsError:
        pass

    # Impute missing values using the mean for col1, col2 and col3
    imputed_df = handle_missing_data(csv_file, strategy='mean', columns_to_impute=['col1', 'col2', 'col3'])

    # Print the results
    if imputed_df is not None:
        print("DataFrame with Imputed Values (Mean Imputation for col1, col2, col3):")
        print(imputed_df)

    # Impute missing values of col3 using median
    imputed_df = handle_missing_data(csv_file, strategy='median', columns_to_impute=['col3'])
    if imputed_df is not None:
        print("\nDataFrame with Imputed Values (Median Imputation for col3):")
        print(imputed_df)

    # Impute missing values of col1 and col2 using a constant value
    imputed_df = handle_missing_data(csv_file, strategy='constant', columns_to_impute=['col1', 'col2'], constant_value=0)
    if imputed_df is not None:
        print("\nDataFrame with Imputed Values (Constant Imputation (0) for col1 and col2):")
        print(imputed_df)



if __name__ == "__main__":
    main()


DataFrame with Imputed Values (Mean Imputation for col1, col2, col3):
   col1  col2  col3 col4
0  1.00   4.0   7.0    a
1  2.00   5.0   8.0    b
2  3.00   5.0   8.5    c
3  2.75   6.0   9.0    d
4  5.00   5.0  10.0    e

DataFrame with Imputed Values (Median Imputation for col3):
   col1  col2  col3 col4
0   1.0   4.0   7.0    a
1   2.0   NaN   8.0    b
2   3.0   5.0   8.5    c
3   NaN   6.0   9.0    d
4   5.0   NaN  10.0    e


TypeError: handle_missing_data() got an unexpected keyword argument 'constant_value'

Task 7: Detect Duplicates

Description: Detect duplicate rows in the dataset.

In [7]:
import pandas as pd

def detect_duplicate_rows(csv_file):
    """
    Detects duplicate rows in a dataset.

    Args:
        csv_file (str): Path to the CSV file.

    Returns:
        pandas.DataFrame: A DataFrame containing all duplicate rows, or an empty DataFrame if no duplicates are found.
                          Returns None if an error occurs.
    """
    try:
        # Read the CSV file into a Pandas DataFrame
        df = pd.read_csv(csv_file)
    except FileNotFoundError:
        print(f"Error: File not found at {csv_file}")
        return None
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return None

    # Detect duplicate rows
    duplicate_rows = df[df.duplicated(keep='first')]  # Get all duplicate rows, keeping the first occurrence

    return duplicate_rows



def main():
    """
    Main function to run the duplicate row detection and print the results.
    """
    # Provide the path to your CSV file
    csv_file = 'data_with_duplicates.csv'  # Replace with your actual file path

    # Create a dummy CSV file with duplicate rows for demonstration
    try:
        with open(csv_file, 'w') as f:
            f.write("col1,col2,col3\n1,4,7\n2,5,8\n1,4,7\n3,6,9\n2,5,8\n4,7,10\n")
    except FileExistsError:
        pass

    # Detect duplicate rows
    duplicate_rows_df = detect_duplicate_rows(csv_file)

    # Print the results
    if duplicate_rows_df is not None:
        if duplicate_rows_df.empty:
            print("No duplicate rows found in the dataset.")
        else:
            print("Duplicate rows found:")
            print(duplicate_rows_df.to_string(index=False))



if __name__ == "__main__":
    main()


Duplicate rows found:
 col1  col2  col3
    1     4     7
    2     5     8


Task 8: Validate Correctness of Numerical Values

Description: Ensure numerical columns are within a specified range.

In [8]:
import pandas as pd

def validate_numerical_range(csv_file, column_ranges):
    """
    Validates that numerical values in specified columns of a dataset are within defined ranges.

    Args:
        csv_file (str): Path to the CSV file.
        column_ranges (dict): A dictionary where keys are column names and values are tuples
            representing the minimum and maximum allowed values (inclusive) for that column.
            For example: {'age': (0, 120), 'salary': (0, 1000000)}.

    Returns:
        dict: A dictionary where keys are column names, and values are DataFrames.
            Each DataFrame contains the rows where the values in that column are outside the
            specified range.  If a column is not found or has no values outside the range,
            the column will not be included in the returned dictionary.
            Returns None if there are errors.
    """
    try:
        # Read the CSV file into a Pandas DataFrame
        df = pd.read_csv(csv_file)
    except FileNotFoundError:
        print(f"Error: File not found at {csv_file}")
        return None
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return None

    # Check for invalid input in column_ranges
    for column, value_range in column_ranges.items():
        if not isinstance(value_range, tuple) or len(value_range) != 2:
            print(f"Error: Invalid range format for column '{column}'.  Must be a tuple of (min, max).")
            return None
        if not (isinstance(value_range[0], (int, float)) and isinstance(value_range[1], (int, float))):
            print(f"Error: Invalid range values for column '{column}'.  Min and max must be numbers.")
            return None

    results = {}
    for column, (min_val, max_val) in column_ranges.items():
        if column not in df.columns:
            print(f"Error: Column '{column}' not found in the file.")
            continue  # Skip to the next column
        if not pd.api.types.is_numeric_dtype(df[column]):
            print(f"Warning: Column '{column}' is not numeric and will be skipped.")
            continue

        # Find values outside the range
        out_of_range_rows = df[(df[column] < min_val) | (df[column] > max_val)]
        if not out_of_range_rows.empty:
            results[column] = out_of_range_rows

    return results



def main():
    """
    Main function to run the numerical value range check and print the results.
    """
    # Provide the path to your CSV file
    csv_file = 'data_with_numerical_outliers.csv'  # Replace with your actual file path

    # Define the valid ranges for numerical columns
    column_ranges = {
        'age': (0, 120),
        'salary': (0, 100000),
        'height': (100, 250),  # Example in cm
        'grade': (0, 100),
        'price': (0, None) #Example with only a minimum
    }

    # Create a dummy CSV file for demonstration
    try:
        with open(csv_file, 'w') as f:
            f.write("name,age,salary,height,grade,price\nAlice,30,50000,150,80,200\nBob,150,60000,180,90,1000\nCharlie,25,75000,160,70,50\nDavid,-5,100000,170,105,0\nEve,80,200000,165,60,-10\nFrank,60,55000,260,85,\n")
    except FileExistsError:
        pass

    # Validate numerical ranges
    results = validate_numerical_range(csv_file, column_ranges)

    # Print the results
    if results is not None:
        if not results:
            print("No numerical values found outside the specified ranges.")
        else:
            print("Numerical values found outside the specified ranges:")
            for column, out_of_range_df in results.items():
                print(f"\nColumn: {column}")
                print(out_of_range_df.to_string(index=False))



if __name__ == "__main__":
    main()


Error: Invalid range values for column 'price'.  Min and max must be numbers.


Task 9: Custom Completeness Rule Violation Report

Description: Create a report showing which rows violate specific completeness rules, such as mandatory fields being empty.

In [9]:
import pandas as pd

def check_completeness_rules(csv_file, completeness_rules):
    """
    Checks which rows in a dataset violate specified completeness rules.

    Args:
        csv_file (str): Path to the CSV file.
        completeness_rules (dict): A dictionary where keys are column names and values are
            booleans indicating whether the column is mandatory (True) or optional (False).
            For example: {'customer_id': True, 'name': True, 'email': True, 'phone': False}.

    Returns:
        pandas.DataFrame: A DataFrame containing the rows that violate the completeness rules,
                          or an empty DataFrame if no violations are found.
                          Returns None if there are errors.
    """
    try:
        # Read the CSV file into a Pandas DataFrame
        df = pd.read_csv(csv_file)
    except FileNotFoundError:
        print(f"Error: File not found at {csv_file}")
        return None
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return None

    # Check for invalid input in completeness_rules
    for column, is_mandatory in completeness_rules.items():
        if not isinstance(is_mandatory, bool):
            print(f"Error: Invalid value type for column '{column}'.  Must be a boolean.")
            return None

    # Check if all mandatory columns exist
    for column, is_mandatory in completeness_rules.items():
        if is_mandatory and column not in df.columns:
            print(f"Error: Mandatory column '{column}' not found in the file.")
            return None

    # Identify rows that violate completeness rules
    violations = pd.DataFrame()  # Start with an empty DataFrame
    for column, is_mandatory in completeness_rules.items():
        if is_mandatory:
            missing_values = df[column].isnull()
            if missing_values.any():  # Check if there are any True values (missing values)
                violations_df = df[missing_values] # get the rows where there are missing values
                violations = pd.concat([violations, violations_df], ignore_index=True)

    return violations



def main():
    """
    Main function to run the completeness rule violation check and print the results.
    """
    # Provide the path to your CSV file
    csv_file = 'customer_data_completeness.csv'  # Replace with your actual file path

    # Define the completeness rules
    completeness_rules = {
        'customer_id': True,  # Mandatory
        'name': True,        # Mandatory
        'email': True,       # Mandatory
        'phone': False,      # Optional
        'address': False,    # Optional
        'city': True
    }

    # Create a dummy CSV file for demonstration
    try:
        with open(csv_file, 'w') as f:
            f.write("customer_id,name,email,phone,address,city\n1,Alice,alice@example.com,555-1234,123 Main St,New York\n2,Bob,,555-5678,,Los Angeles\n3,Charlie,charlie@example.com,,789 Pine Ln,\n4,David,david@example.com,555-9012,222 Elm St,Houston\n5,Eve,eve@example.com,555-2345,333 Oak Ave,Miami")
    except FileExistsError:
        pass

    # Check for completeness rule violations
    violations_df = check_completeness_rules(csv_file, completeness_rules)

    # Print the results
    if violations_df is not None:
        if violations_df.empty:
            print("No completeness rule violations found.")
        else:
            print("Completeness rule violations found:")
            print(violations_df.to_string(index=False))



if __name__ == "__main__":
    main()


Completeness rule violations found:
 customer_id    name               email    phone     address        city
           2     Bob                 NaN 555-5678         NaN Los Angeles
           3 Charlie charlie@example.com      NaN 789 Pine Ln         NaN


Task 10: Advanced Regex for Data Validity Check

Description: Check for validity with advanced regex patterns, such as validating complex fields with multi-level rules.

In [10]:
import pandas as pd
import re

def validate_data_with_regex(csv_file, validation_rules):
    """
    Validates data in a dataset using advanced regular expression patterns.

    Args:
        csv_file (str): Path to the CSV file.
        validation_rules (dict): A dictionary where keys are column names and values are
            dictionaries containing regex patterns and descriptions for validation.
            For example:
            {
                'order_id': {'pattern': r'^[A-Z]{3}-\d{3}$', 'description': 'e.g., ABC-123'},
                'email': {'pattern': r"^[^\s@]+@[^\s@]+\.[^\s@]{2,}$", 'description': 'Standard email format'},
                'phone': {
                    'pattern': r"^(?:\+?\d{1,3}[- ]?)?(?:\(\d{3}\)[- ]?)?\d{3}[- ]?\d{4}$",
                    'description': 'North American phone format'
                },
                'date': {
                    'pattern': r"^\d{4}-\d{2}-\d{2}$",
                    'description': 'YYYY-MM-DD'
                }
            }.

    Returns:
        dict: A dictionary where keys are column names from the validation_rules.
              Values are dictionaries with keys 'valid' and 'invalid'.
              'valid' contains the valid values from that column, and 'invalid' contains the invalid values.
              Returns None if there are errors.
    """
    try:
        # Read the CSV file into a Pandas DataFrame
        df = pd.read_csv(csv_file)
    except FileNotFoundError:
        print(f"Error: File not found at {csv_file}")
        return None
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return None

    results = {}
    for column, rule in validation_rules.items():
        if column not in df.columns:
            print(f"Error: Column '{column}' not found in the file.")
            return None

        pattern = rule.get('pattern')
        description = rule.get('description', 'No description provided')  # Default description

        if not pattern:
            print(f"Error: No regex pattern provided for column '{column}'.")
            return None

        try:
            # Apply the regex pattern to validate the column
            valid_values = df[column][df[column].str.match(pattern, na=False)]  # na=False: important
            invalid_values = df[column][~df[column].str.match(pattern, na=False)]
            results[column] = {'valid': valid_values.tolist(), 'invalid': invalid_values.tolist()}
        except re.error as e:
            print(f"Error: Invalid regular expression for column '{column}': {e}")
            return None

    return results



def main():
    """
    Main function to run the advanced regex data validation and print the results.
    """
    # Provide the path to your CSV file
    csv_file = 'data_for_validation.csv'  # Replace with your actual file path

    # Define the validation rules with advanced regex patterns
    validation_rules = {
        'order_id': {
            'pattern': r'^[A-Z]{3}-\d{3}$',
            'description': 'Order ID format: Three uppercase letters followed by a hyphen and three digits (e.g., ABC-123)'
        },
        'email': {
            'pattern': r"^[^\s@]+@[^\s@]+\.[^\s@]{2,}$",
            'description': 'Standard email format (e.g., user@example.com)'
        },
        'phone': {
            'pattern': r"^(?:\+?\d{1,3}[- ]?)?(?:\(\d{3}\)[- ]?)?\d{3}[- ]?\d{4}(?:(?: ext\.)?\s*\d+)?$",
            'description': 'North American phone format (e.g., 123-456-7890, (123) 456-7890, +1-123-456-7890 ext. 123)'
        },
        'date': {
            'pattern': r"^\d{4}-\d{2}-\d{2}$",
            'description': 'Date in YYYY-MM-DD format (e.g., 2023-10-27)'
        },
        'product_code': {
            'pattern': r"^[A-Z]{2}-\d{4}-(?:[A-Z]{1,2}|\d{1,3})$",
            'description': 'Product code: Two uppercase letters, hyphen, four digits, hyphen, and either one or two letters OR one to three digits (e.g., AB-1234-X, AB-1234-123)'
        }
    }

    # Create a dummy CSV file for demonstration
    try:
        with open(csv_file, 'w') as f:
            f.write("order_id,email,phone,date,product_code\nABC-123,user@example.com,123-456-7890,2023-10-26,AB-1234-XY\nXYZ-456,invalid-email,(123) 456-7890,2023/10/26,AB-1234-12\nDEF-789,another@test.org,+1-123-456-7890 ext. 123,26.10.2023,AB-1234-123\nGHI-001,missing@dotcom,123.456.7890,2023-10-28,A-1234-X\nJKL-a23,user@sub.domain.com,1234567890,2023-10-29,AB-123-12\nMNO-987,first.last@domain.co.in,invalid-phone,2023-10-30,AB-1234-XYZ\n")
    except FileExistsError:
        pass

    # Validate data using advanced regex patterns
    results = validate_data_with_regex(csv_file, validation_rules)

    # Print the results
    if results is not None:
        for column, values in results.items():
            print(f"\nColumn: {column} ({validation_rules[column]['description']})")
            print(f"  Valid values: {values['valid']}")
            print(f"  Invalid values: {values['invalid']}")



if __name__ == "__main__":
    main()



Column: order_id (Order ID format: Three uppercase letters followed by a hyphen and three digits (e.g., ABC-123))
  Valid values: ['ABC-123', 'XYZ-456', 'DEF-789', 'GHI-001', 'MNO-987']
  Invalid values: ['JKL-a23']

Column: email (Standard email format (e.g., user@example.com))
  Valid values: ['user@example.com', 'another@test.org', 'user@sub.domain.com', 'first.last@domain.co.in']
  Invalid values: ['invalid-email', 'missing@dotcom']

Column: phone (North American phone format (e.g., 123-456-7890, (123) 456-7890, +1-123-456-7890 ext. 123))
  Valid values: ['123-456-7890', '(123) 456-7890', '1234567890']
  Invalid values: ['+1-123-456-7890 ext. 123', '123.456.7890', 'invalid-phone']

Column: date (Date in YYYY-MM-DD format (e.g., 2023-10-27))
  Valid values: ['2023-10-26', '2023-10-28', '2023-10-29', '2023-10-30']
  Invalid values: ['2023/10/26', '26.10.2023']

Column: product_code (Product code: Two uppercase letters, hyphen, four digits, hyphen, and either one or two letters OR on