### Task 1: Detecting Missing Values during Data Ingestion
**Description**: You have a CSV file with missing values in some columns. Write a Python script to detect and report missing values during the ingestion process.

**Steps**:
1. Load data
2. Check for missing values
3. Report missing values

In [1]:
# Write your code from here
import pandas as pd

def detect_missing_values(csv_file_path):
    # Step 1: Load data
    df = pd.read_csv(csv_file_path)
    
    # Step 2: Check for missing values
    missing_counts = df.isnull().sum()
    total_rows = len(df)
    
    # Step 3: Report missing values
    print(f"Total rows in dataset: {total_rows}\n")
    print("Missing values per column:")
    for col, missing in missing_counts.items():
        percent = (missing / total_rows) * 100
        print(f" - {col}: {missing} missing ({percent:.2f}%)")
        
    # Optionally return the missing counts dictionary if needed
    return missing_counts

# Example usage:
if __name__ == "__main__":
    csv_path = "your_dataset.csv"  # Replace with your actual file path
    detect_missing_values(csv_path)


FileNotFoundError: [Errno 2] No such file or directory: 'your_dataset.csv'

### Task 2: Validate Data Types during Extraction
**Description**: You have a JSON file that should have specific data types for each field. Write a script to validate if the data types match the expected schema.

**Steps**:
1. Define expected schema
2. Validate data types

In [None]:
# Write your code from here
import json

def validate_data_types(json_file_path, expected_schema):
    # Load JSON data
    with open(json_file_path, 'r') as f:
        data = json.load(f)
    
    errors = []
    
    # Validate each record in the data (assuming it's a list of dicts)
    for i, record in enumerate(data):
        for field, expected_type in expected_schema.items():
            if field not in record:
                errors.append(f"Record {i}: Missing field '{field}'")
            else:
                actual_value = record[field]
                # Check type - handle None separately
                if actual_value is None:
                    errors.append(f"Record {i}: Field '{field}' is None (expected {expected_type.__name__})")
                elif not isinstance(actual_value, expected_type):
                    errors.append(
                        f"Record {i}: Field '{field}' has type {type(actual_value).__name__} but expected {expected_type.__name__}"
                    )
    
    if errors:
        print("Data type validation errors found:")
        for error in errors:
            print(f" - {error}")
    else:
        print("All records conform to the expected data types.")

# Example usage
if __name__ == "__main__":
    # Define expected schema: field name -> expected Python type
    expected_schema = {
        "id": int,
        "name": str,
        "age": int,
        "email": str,
        "is_active": bool
    }

    json_path = "your_data.json"  # Replace with your JSON file path
    validate_data_types(json_path, expected_schema)


### Task 3: Remove Duplicate Records in Data
**Description**: You have a dataset with duplicate entries. Write a Python script to find and remove duplicate records using Pandas.

**Steps**:
1. Find duplicate records
2. Remove duplicates
3. Report results

In [None]:
# Write your code from here
import pandas as pd

def remove_duplicates(csv_file_path, subset_columns=None):
    """
    Removes duplicate records from a CSV file.

    Args:
        csv_file_path (str): Path to the CSV file.
        subset_columns (list or None): List of columns to consider for identifying duplicates.
                                       If None, considers all columns.

    Returns:
        pd.DataFrame: DataFrame without duplicates.
    """
    # Step 1: Load data
    df = pd.read_csv(csv_file_path)
    
    # Step 2: Find duplicate records
    if subset_columns:
        duplicates = df.duplicated(subset=subset_columns, keep=False)
    else:
        duplicates = df.duplicated(keep=False)
    
    duplicate_count = duplicates.sum()
    print(f"Total duplicate records found: {duplicate_count}")
    
    if duplicate_count > 0:
        print("\nDuplicate records:")
        print(df[duplicates])
    else:
        print("No duplicate records found.")
    
    # Step 3: Remove duplicates (keeping the first occurrence)
    if subset_columns:
        df_cleaned = df.drop_duplicates(subset=subset_columns, keep='first')
    else:
        df_cleaned = df.drop_duplicates(keep='first')
    
    removed_count = len(df) - len(df_cleaned)
    print(f"\nRemoved {removed_count} duplicate records. Remaining records: {len(df_cleaned)}")
    
    return df_cleaned

# Example usage:
if __name__ == "__main__":
    csv_path = "your_dataset.csv"  # Replace with your CSV file path
    # Specify columns to identify duplicates or None to consider all columns
    subset_cols = None
    
    cleaned_df = remove_duplicates(csv_path, subset_columns=subset_cols)
    
    # Optionally, save the cleaned dataset
    cleaned_df.to_csv("cleaned_dataset.csv", index=False)
    print("\nCleaned dataset saved to 'cleaned_dataset.csv'")
