### Task 1: Detecting Missing Values during Data Ingestion
**Description**: You have a CSV file with missing values in some columns. Write a Python script to detect and report missing values during the ingestion process.

**Steps**:
1. Load data
2. Check for missing values
3. Report missing values

In [9]:
# detect_missing_ingestion.py

import pandas as pd

# -----------------------------------------------------------------------------
# Step 0: Create a sample CSV file with missing values
# -----------------------------------------------------------------------------
sample_csv = """ID,Name,Age,Salary
1,John,28,50000
2,Jane,,60000
3,Bob,22,
4,Alice,30,58000
5,,25,52000
"""

with open('sample.csv', 'w') as f:
    f.write(sample_csv)

# -----------------------------------------------------------------------------
# Step 1 & 2: Load data and check for missing values
# -----------------------------------------------------------------------------
def load_and_check(file_path: str) -> pd.DataFrame:
    # Load the CSV into a DataFrame
    df = pd.read_csv(file_path)
    
    # Count missing values per column
    missing_counts = df.isnull().sum()
    total_missing = missing_counts.sum()
    
    # Step 3: Report (and error) if any missing values found
    if total_missing > 0:
        print("❗ Missing values detected during data ingestion:")
        for col, cnt in missing_counts.iteritems():
            if cnt > 0:
                print(f"  • Column '{col}': {cnt} missing value(s)")
        raise ValueError(f"Data ingestion halted: {total_missing} total missing value(s) found.")
    
    print("✅ No missing values found. Data ingestion successful.")
    return df

# -----------------------------------------------------------------------------
# Main entrypoint
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    try:
        df = load_and_check('sample.csv')
        # If no exception, proceed with further processing...
        # e.g. train model, save to database, etc.
    except Exception as e:
        print(f"Error: {e}")
        # Optionally, sys.exit(1) or other error handling here.


❗ Missing values detected during data ingestion:
Error: 'Series' object has no attribute 'iteritems'


### Task 2: Validate Data Types during Extraction
**Description**: You have a JSON file that should have specific data types for each field. Write a script to validate if the data types match the expected schema.

**Steps**:
1. Define expected schema
2. Validate data types

In [10]:
# validate_data_types.py

import json

# -----------------------------------------------------------------------------
# Step 0: Create a sample JSON file with intentional type mismatches
# -----------------------------------------------------------------------------
sample_data = [
    {"id": 1, "name": "Alice",   "age": 30,    "salary": 70000.0, "is_active": True},
    {"id": 2, "name": "Bob",     "age": "27",  "salary": 55000.5, "is_active": False},  # age as str
    {"id": 3, "name": 12345,     "age": 22,    "salary": "62000", "is_active": "yes"},  # name & salary & is_active wrong
    {"id": 4, "name": "Charlie", "age": 28,    "salary": 60000.0, "is_active": True}
]

with open('sample.json', 'w') as f:
    json.dump(sample_data, f, indent=2)

# -----------------------------------------------------------------------------
# Step 1: Define the expected schema
# -----------------------------------------------------------------------------
# Map field names to expected Python types
EXPECTED_SCHEMA = {
    "id": int,
    "name": str,
    "age": int,
    "salary": float,
    "is_active": bool
}

# -----------------------------------------------------------------------------
# Step 2: Load and validate
# -----------------------------------------------------------------------------
def load_and_validate(file_path: str, schema: dict):
    with open(file_path, 'r') as f:
        data = json.load(f)
    
    errors = []
    for idx, record in enumerate(data, start=1):
        for field, expected_type in schema.items():
            if field not in record:
                errors.append(f"Record {idx}: Missing field '{field}'.")
                continue
            
            value = record[field]
            # Special handling: allow ints where floats expected
            if expected_type is float and isinstance(value, int):
                # auto-coerce is fine, but we still note it
                continue
            if not isinstance(value, expected_type):
                errors.append(
                    f"Record {idx}, field '{field}': "
                    f"expected {expected_type.__name__}, got {type(value).__name__}"
                )
    if errors:
        print("❗ Data type validation errors detected:")
        for err in errors:
            print("  -", err)
        raise ValueError(f"{len(errors)} data type error(s) found. Extraction halted.")
    
    print("✅ All records match the expected schema!")
    return data

# -----------------------------------------------------------------------------
# Main
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    try:
        records = load_and_validate('sample.json', EXPECTED_SCHEMA)
        # proceed with downstream processing...
    except Exception as e:
        print(f"Error: {e}")
        # optionally exit with non-zero code
        # import sys; sys.exit(1)


❗ Data type validation errors detected:
  - Record 2, field 'age': expected int, got str
  - Record 3, field 'name': expected str, got int
  - Record 3, field 'salary': expected float, got str
  - Record 3, field 'is_active': expected bool, got str
Error: 4 data type error(s) found. Extraction halted.


### Task 3: Remove Duplicate Records in Data
**Description**: You have a dataset with duplicate entries. Write a Python script to find and remove duplicate records using Pandas.

**Steps**:
1. Find duplicate records
2. Remove duplicates
3. Report results

In [11]:
# remove_duplicates.py

import pandas as pd

# -----------------------------------------------------------------------------
# Step 0: Create a sample CSV file with duplicate records
# -----------------------------------------------------------------------------
sample_csv = """\
ID,Name,Age,Salary
1,John,28,50000
2,Jane,32,60000
3,Bob,22,45000
2,Jane,32,60000
4,Alice,30,58000
3,Bob,22,45000
5,Charlie,25,52000
"""

with open('sample_duplicates.csv', 'w') as f:
    f.write(sample_csv)

# -----------------------------------------------------------------------------
# Step 1: Load data and find duplicates
# -----------------------------------------------------------------------------
def load_and_find_duplicates(file_path: str) -> pd.DataFrame:
    df = pd.read_csv(file_path)
    total_rows = len(df)
    
    # Identify duplicated rows (all columns)
    dup_mask = df.duplicated(keep=False)
    duplicates = df[dup_mask]
    num_duplicates = len(duplicates)
    
    print(f"Total rows loaded: {total_rows}")
    print(f"Number of duplicate rows (including repeats): {num_duplicates}")
    if num_duplicates > 0:
        print("\nDuplicate records:")
        print(duplicates.to_string(index=False))
    else:
        print("No duplicates found.")
    
    return df

# -----------------------------------------------------------------------------
# Step 2: Remove duplicates and report results
# -----------------------------------------------------------------------------
def remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    # By default, keep='first' keeps the first occurrence and drops later ones
    df_clean = df.drop_duplicates(keep='first').reset_index(drop=True)
    removed = len(df) - len(df_clean)
    
    print(f"\nDuplicates removed: {removed}")
    print(f"Rows after deduplication: {len(df_clean)}")
    return df_clean

# -----------------------------------------------------------------------------
# Main
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    # Load and report duplicates
    df = load_and_find_duplicates('sample_duplicates.csv')
    
    # Remove duplicates
    df_clean = remove_duplicates(df)
    
    # Optionally, save the cleaned DataFrame
    df_clean.to_csv('sample_duplicates_clean.csv', index=False)
    print("\nCleaned data saved to 'sample_duplicates_clean.csv'.")


Total rows loaded: 7
Number of duplicate rows (including repeats): 4

Duplicate records:
 ID Name  Age  Salary
  2 Jane   32   60000
  3  Bob   22   45000
  2 Jane   32   60000
  3  Bob   22   45000

Duplicates removed: 2
Rows after deduplication: 5

Cleaned data saved to 'sample_duplicates_clean.csv'.
