### Task 1: Detecting Missing Values during Data Ingestion
**Description**: You have a CSV file with missing values in some columns. Write a Python script to detect and report missing values during the ingestion process.

**Steps**:
1. Load data
2. Check for missing values
3. Report missing values

In [None]:

import pandas as pd
import io
def detect_missing_values(csv_data):
    """
    Count missing values per column in CSV text and report the affected columns.

    Args:
        csv_data (str): The CSV content as a string, header row included.
    Returns:
        pd.DataFrame: One row per column that has at least one missing value,
            with the count stored in a single 'Missing Count' column.
            Returns None (after printing a message) when the input is empty,
            when reading/processing the data raises, or when no column has
            any missing value.
    """
    if not csv_data:
        print("Error: Empty input data.")
        return None

    try:
        frame = pd.read_csv(io.StringIO(csv_data))

        # isnull().sum() yields one count per source column; to_frame labels it.
        counts = frame.isnull().sum().to_frame('Missing Count')
        report = counts.loc[counts['Missing Count'] > 0]

        if not report.empty:
            return report

        # Nothing to report: signalled to the caller with None.
        print("No missing values detected.")
        return None
    except Exception as err:
        print(f"Error processing CSV data: {err}")
        return None
def main():
    """
    Run the missing-value detection on a small inline CSV sample and print
    the resulting report when one is returned.
    """
    sample_csv = """
Name,Age,City,Salary
John,25,New York,50000
Jane,30,London,
Bob,,Paris,60000
Alice,28,,55000
"""
    report = detect_missing_values(sample_csv)
    if report is None:
        # detect_missing_values has already printed why there is no report.
        return
    print("Missing Values Report:")
    print(report)


if __name__ == "__main__":
    main()




### Task 2: Validate Data Types during Extraction
**Description**: You have a JSON file in which each field is expected to have a specific data type. Write a script that validates whether the actual data types match the expected schema.

**Steps**:
1. Define expected schema
2. Validate data types

In [None]:

import json
def validate_data_types(json_data, schema):
    """
    Validates the data types of a JSON object against a provided schema.

    Args:
        json_data (str): The JSON data as a string. The top-level JSON value
            must be an object (it is parsed into a dict).
        schema (dict): A dictionary representing the expected schema.
            Keys are field names and values are the expected data types as
            strings: 'str', 'int', 'float', 'bool', 'list' or 'dict'.
    Returns:
        dict: A dictionary containing the validation results.  Each key is a
            field name from the schema.  Values are "valid" or "invalid".
            A field is "invalid" when it is missing from the data, when its
            schema type name is unknown, or when its value is not of the
            expected type.  A JSON boolean is never accepted as 'int'.
            Returns an empty dict on error: the input cannot be parsed as
            JSON, or the top-level JSON value is not an object.
    """
    try:
        data = json.loads(json_data)
    except (json.JSONDecodeError, TypeError) as e:
        # TypeError covers non-string input such as None.
        print(f"Error: Invalid JSON: {e}")
        return {}

    # Field lookups only make sense on a JSON object; a top-level string,
    # number or array would otherwise crash or be misreported field by field.
    if not isinstance(data, dict):
        print(f"Error: Expected a JSON object at the top level, got {type(data).__name__}.")
        return {}

    type_map = {
        'str': str,
        'int': int,
        'float': float,
        'bool': bool,
        'list': list,
        'dict': dict,
    }

    results = {}
    for field, expected_type_str in schema.items():
        if field not in data:
            results[field] = "invalid"  # Field is missing
            continue

        # Non-string schema entries (possibly unhashable) count as unknown types.
        expected_type = (
            type_map.get(expected_type_str)
            if isinstance(expected_type_str, str)
            else None
        )
        if expected_type is None:
            print(f"Warning: Unknown type '{expected_type_str}' in schema.  Skipping validation for field '{field}'.")
            results[field] = "invalid"
            continue

        value = data[field]
        # bool is a subclass of int in Python, so isinstance(True, int) is
        # True; reject booleans explicitly unless 'bool' was requested.
        if isinstance(value, bool) and expected_type is not bool:
            results[field] = "invalid"
        elif isinstance(value, expected_type):
            results[field] = "valid"
        else:
            results[field] = "invalid"
    return results
def main():
    """
    Validate an inline sample JSON document against an inline schema and
    print one "field: result" line per schema field.
    """
    sample_json = """
    {
        "name": "John Doe",
        "age": 30,
        "salary": 75000.50,
        "is_active": true,
        "hobbies": ["reading", "traveling"],
        "address": {
            "street": "123 Main St",
            "city": "Anytown"
        }
    }
    """
    expected_schema = {
        "name": "str",
        "age": "int",
        "salary": "float",
        "is_active": "bool",
        "hobbies": "list",
        "address": "dict",
    }
    outcome = validate_data_types(sample_json, expected_schema)

    print("Data Type Validation Results:")
    for field_name in outcome:
        print(f"{field_name}: {outcome[field_name]}")


if __name__ == "__main__":
    main()


### Task 3: Remove Duplicate Records in Data
**Description**: You have a dataset with duplicate entries. Write a Python script to find and remove duplicate records using Pandas.

**Steps**:
1. Find duplicate records
2. Remove duplicates
3. Report results

In [None]:

import pandas as pd
import io
def remove_duplicate_records(csv_data):
    """
    Split CSV data into its repeated rows and its de-duplicated rows using Pandas.

    Args:
        csv_data (str): The CSV content as a string, header row included.
    Returns:
        tuple: (duplicate_df, unique_df)
            - duplicate_df: rows flagged by DataFrame.duplicated(), i.e. the
              repeat occurrences of a row (the first occurrence is not included).
            - unique_df: the data after DataFrame.drop_duplicates().
            Returns (None, None) if the input data is empty or an error occurs.
        A status line is printed saying whether duplicates were found.
    """
    if not csv_data:
        print("Error: Empty input data.")
        return None, None

    try:
        frame = pd.read_csv(io.StringIO(csv_data))

        repeated_mask = frame.duplicated()
        repeated_rows = frame[repeated_mask]
        deduplicated = frame.drop_duplicates()

        status = (
            "No duplicate records found."
            if repeated_rows.empty
            else "Duplicate records found:"
        )
        print(status)
        return repeated_rows, deduplicated
    except Exception as err:
        print(f"Error processing CSV data: {err}")
        return None, None
def main():
    """
    Run duplicate removal on a small inline CSV sample, then print the
    duplicate rows (if any) followed by the unique rows.
    """
    sample_csv = """
Name,Age,City,Salary
John,25,New York,50000
Jane,30,London,60000
John,25,New York,50000
Bob,35,Paris,70000
Jane,30,London,60000
Alice,28,Berlin,55000
"""
    dupes, uniques = remove_duplicate_records(sample_csv)

    # Skip the duplicates section on failure (None) or when nothing repeated.
    nothing_to_show = dupes is None or dupes.empty
    if not nothing_to_show:
        print("Duplicate Records:")
        print(dupes)

    if uniques is not None:
        print("\nUnique Records:")
        print(uniques)


if __name__ == "__main__":
    main()
