Create a function that takes a Pandas DataFrame as input. This function must identify
columns with incorrect data types (e.g., numbers stored as strings) and apply type
conversion. Furthermore, use conditional statements and looping to validate data
entries against a set of business rules (e.g., age must be positive) and flag the invalid
records.


In [1]:
import pandas as pd
import numpy as np

def process_and_validate_dataframe(df):
    """
    Convert numeric-looking columns and flag rows that break business rules.

    The input is never modified; all work happens on a copy.

    Step 1 (type conversion): the columns 'Age', 'Salary' and
    'YearsOfService' are passed through ``pd.to_numeric(errors='coerce')``
    when they exist, so un-convertible entries become NaN.

    Step 2 (validation): every row is checked with plain loops and
    conditionals against these rules:
      * Age        - must be present, positive, and at least 18.
      * Salary     - must be present and not negative.
      * Department - must be one of HR, Engineering, Sales, Marketing.
      * EmployeeID - must not be missing or an empty string.
    A rule is applied only if its column exists in the DataFrame; a
    notice is printed for each rule that is skipped, mirroring how the
    conversion step treats absent columns.

    Step 3 (flagging): two columns are added, 'validation_errors' (a list
    of messages per row) and 'is_valid' (True when that list is empty).

    Args:
        df (pd.DataFrame): The input DataFrame.

    Returns:
        pd.DataFrame: A new DataFrame with processed types and
                      new 'validation_errors' and 'is_valid' columns.
    """
    # --- 1. Type Conversion ---
    # Work on a copy so the caller's DataFrame is left untouched.
    df_processed = df.copy()

    # Columns we expect to be numeric.
    # In a real-world scenario, this list might be an argument to the function.
    potential_numeric_cols = ['Age', 'Salary', 'YearsOfService']

    print("--- Starting Type Conversion ---")
    for col in potential_numeric_cols:
        if col in df_processed.columns:
            print(f"Attempting to convert column '{col}' to numeric...")
            # errors='coerce' turns un-convertible values
            # (e.g., "forty", "N/A") into NaN instead of raising.
            df_processed[col] = pd.to_numeric(df_processed[col], errors='coerce')
        else:
            print(f"Skipping conversion: Column '{col}' not found.")
    print("--- Type Conversion Complete ---")

    # --- 2. Data Validation (using looping and conditionals) ---

    # Loop-invariant rule data, built once rather than once per row.
    valid_departments = ('HR', 'Engineering', 'Sales', 'Marketing')

    print("\n--- Starting Row-by-Row Validation ---")

    # Determine up front which rules can run. Indexing a row by a column
    # that does not exist raises KeyError, so rules for absent columns are
    # skipped (with a notice) instead of crashing the whole validation.
    rule_columns = ['Age', 'Salary', 'Department', 'EmployeeID']
    column_present = {}
    for col in rule_columns:
        column_present[col] = col in df_processed.columns
        if not column_present[col]:
            print(f"Skipping validation: Column '{col}' not found.")

    # One entry per row; each entry is that row's list of error messages.
    validation_errors_list = []

    # The task specifically requests looping; .iterrows() is one way to do it.
    for index, row in df_processed.iterrows():
        current_row_errors = []

        # --- Business Rule 1: Validate 'Age' ---
        if column_present['Age']:
            age = row['Age']
            if pd.isna(age):
                current_row_errors.append("Age is missing or non-numeric")
            elif age <= 0:
                current_row_errors.append(f"Age must be positive (Got: {age})")
            elif age < 18:
                current_row_errors.append(f"Employee must be 18 or older (Got: {age})")

        # --- Business Rule 2: Validate 'Salary' ---
        if column_present['Salary']:
            salary = row['Salary']
            if pd.isna(salary):
                current_row_errors.append("Salary is missing or non-numeric")
            elif salary < 0:
                current_row_errors.append(f"Salary cannot be negative (Got: {salary})")

        # --- Business Rule 3: Validate 'Department' ---
        if column_present['Department']:
            department = row['Department']
            if department not in valid_departments:
                current_row_errors.append(f"Invalid Department: '{department}'")

        # --- Business Rule 4: Validate 'EmployeeID' ---
        if column_present['EmployeeID']:
            employee_id = row['EmployeeID']
            if pd.isna(employee_id) or employee_id == '':
                current_row_errors.append("EmployeeID cannot be missing")

        validation_errors_list.append(current_row_errors)

    print("--- Validation Complete ---")

    # --- 3. Flagging Invalid Records ---
    # Attach the per-row error lists as a new column.
    df_processed['validation_errors'] = validation_errors_list

    # A row is valid when its error list is empty. The explicit cast keeps
    # the flag a real boolean column even when the DataFrame has no rows.
    df_processed['is_valid'] = [not errors for errors in validation_errors_list]
    df_processed['is_valid'] = df_processed['is_valid'].astype(bool)

    return df_processed

# --- Main execution block to demonstrate the function ---
if __name__ == "__main__":
    # Demonstration: build a small "dirty" dataset, run it through
    # process_and_validate_dataframe, and print the before/after views.

    # 1. Create Sample "Dirty" Data
    # The sample mixes numbers stored as strings, non-numeric strings,
    # negative values, and a department outside the allowed set.
    data = {
        'EmployeeID': ['A101', 'A102', 'A103', 'A104', 'A105', 'A106', 'A107', 'A108'],
        'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace', 'Heidi'],
        'Age': ['28', '34', '-5', 'forty', '42', '16', '30', '25'],
        'Department': ['HR', 'Engineering', 'Sales', 'IT', 'Marketing', 'HR', 'Sales', 'Engineering'],
        'Salary': ['60000', '120000', '75000', '90000', '-1000', '52000', 'not a salary', '110000'],
    }

    df_raw = pd.DataFrame(data)

    print("====================================")
    print("--- 1. Original DataFrame ---")
    print(df_raw)
    print("\n--- Original DataTypes ---")
    print(df_raw.dtypes)
    print("====================================")

    # 2. Process and Validate the DataFrame
    df_clean = process_and_validate_dataframe(df_raw)

    print("\n====================================")
    print("--- 2. Processed DataFrame ---")
    # Widen the display so every column and the full error lists are shown.
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_colwidth', None)
    print(df_clean)
    print("\n--- Processed DataTypes ---")
    print(df_clean.dtypes)
    print("====================================")

    # 3. Show only the invalid records
    print("\n====================================")
    print("--- 3. Invalid Records Only ---")
    # Negate the boolean flag directly rather than comparing it to False.
    invalid_records = df_clean[~df_clean['is_valid']]
    print(invalid_records[['Name', 'Age', 'Salary', 'Department', 'validation_errors']])
    print("====================================")

--- 1. Original DataFrame ---
  EmployeeID     Name    Age   Department        Salary
0       A101    Alice     28           HR         60000
1       A102      Bob     34  Engineering        120000
2       A103  Charlie     -5        Sales         75000
3       A104    David  forty           IT         90000
4       A105      Eve     42    Marketing         -1000
5       A106    Frank     16           HR         52000
6       A107    Grace     30        Sales  not a salary
7       A108    Heidi     25  Engineering        110000

--- Original DataTypes ---
EmployeeID    object
Name          object
Age           object
Department    object
Salary        object
dtype: object
--- Starting Type Conversion ---
Attempting to convert column 'Age' to numeric...
Attempting to convert column 'Salary' to numeric...
Skipping conversion: Column 'YearsOfService' not found.
--- Type Conversion Complete ---

--- Starting Row-by-Row Validation ---
--- Validation Complete ---

--- 2. Processed DataFrame 