### Task 1: Measure Data Accuracy using a Trusted Source

**Description**: You have two datasets of product prices: `company_prices.csv` and
`trusted_prices.csv`. Check whether the prices in `company_prices.csv` match the prices in
`trusted_prices.csv`. Assume both files have a "product_id" and a "price" column.

In [1]:
import pandas as pd

def measure_price_accuracy(company_prices_path, trusted_prices_path):
    """
    Measures the accuracy of product prices in one CSV against a trusted CSV.

    Both files are loaded with pandas and inner-joined on 'product_id', so
    products present in only one file are ignored. The 'price' values of each
    joined row are compared for exact equality; a missing price (NaN) therefore
    never counts as a match. If a 'product_id' is repeated in either file, the
    join yields one comparison per matching pair of rows.

    Args:
        company_prices_path (str): The file path to the company's product prices CSV.
        trusted_prices_path (str): The file path to the trusted product prices CSV.

    Returns:
        dict: A dictionary containing:
              - 'total_products_compared' (int): Number of joined rows compared.
              - 'matching_prices' (int): Number of rows where prices match.
              - 'mismatched_prices' (int): Number of rows where prices do not match.
              - 'mismatched_details': A DataFrame with the columns product_id,
                                      price_company and price_trusted for all
                                      mismatched rows.
              - 'accuracy_percentage' (float): The percentage of prices that match.
              Returns None if a file cannot be read or a required column is missing.
    """
    try:
        company_df = pd.read_csv(company_prices_path)
        trusted_df = pd.read_csv(trusted_prices_path)
    except FileNotFoundError as e:
        print(f"Error: One of the files was not found: {e}")
        return None
    except pd.errors.EmptyDataError as e:
        print(f"Error: One of the CSV files is empty: {e}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred while reading files: {e}")
        return None

    # Ensure 'product_id' and 'price' columns exist in both DataFrames
    required_columns = ['product_id', 'price']
    for path, frame in ((company_prices_path, company_df), (trusted_prices_path, trusted_df)):
        if not all(col in frame.columns for col in required_columns):
            print(f"Error: '{path}' must contain 'product_id' and 'price' columns.")
            return None

    # Inner join on 'product_id' so only products present in both datasets are
    # compared. Restricting to the required columns keeps unrelated extra
    # columns from colliding with the '_company' / '_trusted' suffixes.
    merged_df = pd.merge(
        company_df[required_columns],
        trusted_df[required_columns],
        on='product_id',
        suffixes=('_company', '_trusted'),
    )

    if merged_df.empty:
        print("No common products found between the two datasets for comparison.")
        return {
            'total_products_compared': 0,
            'matching_prices': 0,
            'mismatched_prices': 0,
            'mismatched_details': pd.DataFrame(columns=['product_id', 'price_company', 'price_trusted']),
            'accuracy_percentage': 100.0 if company_df.empty and trusted_df.empty else 0.0
        }

    # Identify matching and mismatched prices
    merged_df['price_matches'] = (merged_df['price_company'] == merged_df['price_trusted'])

    # Convert to native Python numbers so the summary can be serialised
    # (e.g. with json.dumps), which rejects numpy integer scalars.
    total_products_compared = len(merged_df)
    matching_prices_count = int(merged_df['price_matches'].sum())
    mismatched_prices_count = total_products_compared - matching_prices_count

    # merged_df is non-empty here, so the division is always defined.
    accuracy_percentage = float(matching_prices_count / total_products_compared * 100)

    # Get details of mismatched prices
    mismatched_details_df = merged_df[~merged_df['price_matches']][['product_id', 'price_company', 'price_trusted']]

    return {
        'total_products_compared': total_products_compared,
        'matching_prices': matching_prices_count,
        'mismatched_prices': mismatched_prices_count,
        'mismatched_details': mismatched_details_df,
        'accuracy_percentage': accuracy_percentage
    }

# --- Example Usage (requires creating dummy CSV files) ---
if __name__ == "__main__":
    import os

    # Demonstration driver: writes throwaway CSV fixtures, runs
    # measure_price_accuracy against them, prints the outcome and finally
    # deletes the fixtures again. Replace the paths with real files to use it.
    company_file = 'company_prices.csv'
    trusted_file = 'trusted_prices.csv'

    # A005 exists only in the company data and A006 only in the trusted data,
    # so four products end up being compared.
    company_data = dict(
        product_id=['A001', 'A002', 'A003', 'A004', 'A005'],
        price=[10.50, 20.00, 30.25, 40.00, 50.00],
    )
    trusted_data = dict(
        product_id=['A001', 'A002', 'A003', 'A004', 'A006'],
        price=[10.50, 21.00, 30.25, 40.50, 60.00],
    )
    df_company = pd.DataFrame(company_data)
    df_trusted = pd.DataFrame(trusted_data)

    fixtures = ((df_company, company_file), (df_trusted, trusted_file))
    for frame, target in fixtures:
        frame.to_csv(target, index=False)
    for frame, target in fixtures:
        print(f"Created '{target}':\n{frame}\n")

    # Main scenario: partially overlapping products with two price mismatches.
    accuracy_results = measure_price_accuracy(company_file, trusted_file)

    if accuracy_results:
        print("\n--- Price Accuracy Results ---")
        summary_lines = (
            ("Total Products Compared", 'total_products_compared'),
            ("Matching Prices", 'matching_prices'),
            ("Mismatched Prices", 'mismatched_prices'),
        )
        for label, key in summary_lines:
            print(f"{label}: {accuracy_results[key]}")
        print(f"Accuracy Percentage: {accuracy_results['accuracy_percentage']:.2f}%")

        if accuracy_results['mismatched_details'].empty:
            print("\nNo mismatched prices found.")
        else:
            print("\nMismatched Price Details:")
            print(accuracy_results['mismatched_details'])

    # Scenario: the two files share no product_id at all.
    print("\n--- Test Case: No common products ---")
    df_company_no_common = pd.DataFrame({'product_id': ['X001'], 'price': [100]})
    df_trusted_no_common = pd.DataFrame({'product_id': ['Y001'], 'price': [200]})
    df_company_no_common.to_csv('company_no_common.csv', index=False)
    df_trusted_no_common.to_csv('trusted_no_common.csv', index=False)
    accuracy_no_common = measure_price_accuracy('company_no_common.csv', 'trusted_no_common.csv')
    if accuracy_no_common:
        print(f"Total Products Compared: {accuracy_no_common['total_products_compared']}")
        print(f"Accuracy Percentage: {accuracy_no_common['accuracy_percentage']:.2f}%")

    # Scenario: the company file has a 'cost' column instead of 'price'.
    # The call is expected to print an error and return None.
    # NOTE: the announcement below is printed after the call, so it appears
    # underneath the error message in the output.
    print("\n--- Test Case: Missing column ---")
    df_bad_company = pd.DataFrame({'product_id': ['Z001'], 'cost': [50]})
    df_bad_company.to_csv('bad_company.csv', index=False)
    accuracy_bad_col = measure_price_accuracy('bad_company.csv', trusted_file)
    print("Attempting to process 'bad_company.csv':")

    # Remove every fixture created above.
    for leftover in (
        company_file,
        trusted_file,
        'company_no_common.csv',
        'trusted_no_common.csv',
        'bad_company.csv',
    ):
        os.remove(leftover)

Created 'company_prices.csv':
  product_id  price
0       A001  10.50
1       A002  20.00
2       A003  30.25
3       A004  40.00
4       A005  50.00

Created 'trusted_prices.csv':
  product_id  price
0       A001  10.50
1       A002  21.00
2       A003  30.25
3       A004  40.50
4       A006  60.00


--- Price Accuracy Results ---
Total Products Compared: 4
Matching Prices: 2
Mismatched Prices: 2
Accuracy Percentage: 50.00%

Mismatched Price Details:
  product_id  price_company  price_trusted
1       A002           20.0           21.0
3       A004           40.0           40.5

--- Test Case: No common products ---
No common products found between the two datasets for comparison.
Total Products Compared: 0
Accuracy Percentage: 0.00%

--- Test Case: Missing column ---
Error: 'bad_company.csv' must contain 'product_id' and 'price' columns.
Attempting to process 'bad_company.csv':


### Task 2: Detect Incorrect Values

**Description**: In `company_prices.csv`, detect any negative price values, which are incorrect values for prices.

In [2]:
import pandas as pd

def detect_negative_prices(company_prices_path):
    """
    Detects and returns rows with negative price values in a company prices CSV.

    Assumes the CSV file has a 'price' column. Prices are interpreted
    numerically for the check; entries that cannot be parsed as numbers
    (including blanks) are never reported as negative. The returned rows
    keep their original, unconverted values.

    Args:
        company_prices_path (str): The file path to the company's price data CSV.

    Returns:
        pandas.DataFrame: A DataFrame containing rows where the 'price' is negative.
                          Returns an empty DataFrame if no negative prices are found,
                          or if the file cannot be read, or if the 'price' column is missing.
    """
    try:
        company_df = pd.read_csv(company_prices_path)
    except FileNotFoundError:
        print(f"Error: The file '{company_prices_path}' not found. Please check the path.")
        return pd.DataFrame()
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return pd.DataFrame()

    # Check if 'price' column exists
    if 'price' not in company_df.columns:
        print(f"Error: The file '{company_prices_path}' must contain a 'price' column.")
        return pd.DataFrame()

    # A single non-numeric entry makes pandas load the whole column as text,
    # and comparing text with 0 raises TypeError. Coerce to numbers first;
    # unparseable entries become NaN, and NaN < 0 is False.
    numeric_prices = pd.to_numeric(company_df['price'], errors='coerce')

    # Detect rows where 'price' is less than 0
    incorrect_prices_df = company_df[numeric_prices < 0]

    if incorrect_prices_df.empty:
        print("No negative price values detected.")
    else:
        print("Negative price values detected:")

    return incorrect_prices_df

# --- Example Usage (requires creating a dummy CSV file) ---
if __name__ == "__main__":
    import os

    # Demonstration driver for detect_negative_prices: each case writes a
    # small fixture CSV (where one is needed), runs the detector and prints
    # whatever DataFrame comes back.

    # Case 1: two of the five prices are negative.
    company_data_with_negatives = dict(
        product_id=['P001', 'P002', 'P003', 'P004', 'P005'],
        price=[10.50, -5.00, 30.25, -0.01, 50.00],
    )
    df_company_negatives = pd.DataFrame(company_data_with_negatives)
    df_company_negatives.to_csv('company_prices.csv', index=False)

    print("--- Detecting Incorrect (Negative) Price Values ---")

    negative_prices_found_df = detect_negative_prices('company_prices.csv')
    print(negative_prices_found_df)

    # Case 2: every price is positive.
    print("\n--- Test Case 2: No negative prices ---")
    company_data_no_negatives = dict(
        product_id=['Q001', 'Q002', 'Q003'],
        price=[100.0, 20.0, 5.0],
    )
    df_company_no_negatives = pd.DataFrame(company_data_no_negatives)
    df_company_no_negatives.to_csv('company_prices_no_negatives.csv', index=False)

    no_negative_prices_df = detect_negative_prices('company_prices_no_negatives.csv')
    print(no_negative_prices_df)

    # Case 3: the file carries a 'cost' column instead of 'price'.
    print("\n--- Test Case 3: Missing 'price' column ---")
    company_data_missing_col = dict(
        product_id=['R001', 'R002'],
        cost=[10.0, 20.0],
    )
    df_company_missing_col = pd.DataFrame(company_data_missing_col)
    df_company_missing_col.to_csv('company_prices_missing_col.csv', index=False)

    missing_col_prices_df = detect_negative_prices('company_prices_missing_col.csv')
    print(missing_col_prices_df)

    # Case 4: the path does not exist, so no fixture is written.
    print("\n--- Test Case 4: Non-existent file ---")
    non_existent_prices_df = detect_negative_prices('non_existent_company_prices.csv')
    print(non_existent_prices_df)

    # Remove the fixtures written by cases 1-3.
    for leftover in (
        'company_prices.csv',
        'company_prices_no_negatives.csv',
        'company_prices_missing_col.csv',
    ):
        os.remove(leftover)

--- Detecting Incorrect (Negative) Price Values ---
Negative price values detected:
  product_id  price
1       P002  -5.00
3       P004  -0.01

--- Test Case 2: No negative prices ---
No negative price values detected.
Empty DataFrame
Columns: [product_id, price]
Index: []

--- Test Case 3: Missing 'price' column ---
Error: The file 'company_prices_missing_col.csv' must contain a 'price' column.
Empty DataFrame
Columns: []
Index: []

--- Test Case 4: Non-existent file ---
Error: The file 'non_existent_company_prices.csv' not found. Please check the path.
Empty DataFrame
Columns: []
Index: []


### Task 3: Check Missing Data Rates

**Description**: Calculate the percentage of missing values in `customer_data.csv`.

In [3]:
import pandas as pd
import numpy as np # Although Pandas is primary, np.nan is used for demonstration

def calculate_missing_data_rate(file_path):
    """
    Calculates the percentage of missing values in a given CSV file.

    The rate is the number of missing (NaN) cells divided by the total
    number of cells (rows * columns) of the loaded DataFrame.

    Args:
        file_path (str): The path to the CSV file (e.g., 'customer_data.csv').

    Returns:
        float: The overall percentage of missing values in the dataset (0.0 to 100.0).
               Returns 0.0 (with a warning) for a completely empty file or a
               file that loads with no rows or no columns.
               Returns None if the file cannot be read.
    """
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' not found. Please check the path.")
        return None
    except pd.errors.EmptyDataError:
        print(f"Warning: The file '{file_path}' is empty. Missing data rate is 0.0%.")
        return 0.0
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return None

    # df.empty is True whenever either axis has length 0, so past this guard
    # df.size is guaranteed to be non-zero and the division below is safe.
    if df.empty:
        print(f"Warning: The DataFrame loaded from '{file_path}' is empty. Missing data rate is 0.0%.")
        return 0.0

    # Calculate the total number of cells in the DataFrame
    total_cells = df.size

    # Count the number of missing values (NaN) across the entire DataFrame
    missing_cells = df.isnull().sum().sum()

    # Calculate the percentage of missing values
    missing_percentage = (missing_cells / total_cells) * 100

    return float(missing_percentage)

# --- Example Usage (requires creating a dummy CSV file) ---
if __name__ == "__main__":
    import os

    # Demonstration driver for calculate_missing_data_rate: each case prepares
    # a fixture CSV (where one is needed), computes the rate and prints it
    # whenever the function returned a number rather than None.

    # Case 1: email has two NaNs, phone one and age one.
    # Expected: 4 NaNs out of 25 total cells (5 rows * 5 columns) = 16.00%
    customer_data = dict(
        customer_id=[1, 2, 3, 4, 5],
        name=['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
        email=['alice@example.com', np.nan, 'charlie@example.com', 'david@example.com', np.nan],
        phone=['111-222-3333', '444-555-6666', np.nan, '777-888-9999', '000-111-2222'],
        age=[25, 30, np.nan, 40, 22],
    )
    df_customer = pd.DataFrame(customer_data)
    df_customer.to_csv('customer_data.csv', index=False)

    print("--- Checking Missing Data Rates ---")

    missing_rate_1 = calculate_missing_data_rate('customer_data.csv')
    if missing_rate_1 is not None:
        print(f"\nMissing data rate in 'customer_data.csv': {missing_rate_1:.2f}%")

    # Case 2: nothing is missing. Expected: 0.00%
    print("\n--- Test Case 2: File with no missing values ---")
    customer_data_no_nan = dict(
        customer_id=[10, 11],
        name=['Frank', 'Grace'],
        email=['frank@example.com', 'grace@example.com'],
    )
    df_no_nan = pd.DataFrame(customer_data_no_nan)
    df_no_nan.to_csv('customer_data_no_nan.csv', index=False)
    missing_rate_2 = calculate_missing_data_rate('customer_data_no_nan.csv')
    if missing_rate_2 is not None:
        print(f"Missing data rate in 'customer_data_no_nan.csv': {missing_rate_2:.2f}%")

    # Case 3: every cell is missing. Expected: 100.00%
    print("\n--- Test Case 3: File with all missing values ---")
    customer_data_all_nan = dict(
        col1=[np.nan, np.nan],
        col2=[np.nan, np.nan],
    )
    df_all_nan = pd.DataFrame(customer_data_all_nan)
    df_all_nan.to_csv('customer_data_all_nan.csv', index=False)
    missing_rate_3 = calculate_missing_data_rate('customer_data_all_nan.csv')
    if missing_rate_3 is not None:
        print(f"Missing data rate in 'customer_data_all_nan.csv': {missing_rate_3:.2f}%")

    # Case 4: the path does not exist, so only the function's error is shown.
    print("\n--- Test Case 4: Non-existent file ---")
    missing_rate_4 = calculate_missing_data_rate('non_existent_customer_data.csv')
    if missing_rate_4 is not None:
        print(f"Missing data rate in 'non_existent_customer_data.csv': {missing_rate_4:.2f}%")

    # Case 5: a zero-byte file.
    print("\n--- Test Case 5: Empty CSV file ---")
    open('empty_customer_data.csv', 'w').close()
    missing_rate_5 = calculate_missing_data_rate('empty_customer_data.csv')
    if missing_rate_5 is not None:
        print(f"Missing data rate in 'empty_customer_data.csv': {missing_rate_5:.2f}%")

    # Remove the fixtures written above.
    for leftover in (
        'customer_data.csv',
        'customer_data_no_nan.csv',
        'customer_data_all_nan.csv',
        'empty_customer_data.csv',
    ):
        os.remove(leftover)

--- Checking Missing Data Rates ---

Missing data rate in 'customer_data.csv': 16.00%

--- Test Case 2: File with no missing values ---
Missing data rate in 'customer_data_no_nan.csv': 0.00%

--- Test Case 3: File with all missing values ---
Missing data rate in 'customer_data_all_nan.csv': 100.00%

--- Test Case 4: Non-existent file ---
Error: The file 'non_existent_customer_data.csv' not found. Please check the path.

--- Test Case 5: Empty CSV file ---
Warning: The file 'empty_customer_data.csv' is empty. Missing data rate is 0.0%.
Missing data rate in 'empty_customer_data.csv': 0.00%


### Task 4: Handling Partially Available Records

**Description**: In `customer_data.csv`, identify records with a missing "email" or "phone number" and decide whether to drop or fill them.

In [4]:
import pandas as pd
import numpy as np # Used for np.nan in dummy data

def handle_partially_available_records(file_path, strategy='drop', fill_value=None):
    """
    Identifies records with missing 'email' or 'phone' in a CSV file
    and applies a specified handling strategy (drop or fill).

    The loaded data and the affected records are printed before the strategy
    is applied. When no record is missing either field, the loaded DataFrame
    is returned as-is and the strategy is not evaluated.

    Args:
        file_path (str): The path to the CSV file (e.g., 'customer_data.csv').
        strategy (str): The strategy to apply. Can be 'drop' or 'fill'.
                        Defaults to 'drop'.
        fill_value (str, int, float, or dict): The value(s) to use for filling missing data.
                                              - A single value fills the missing
                                                'email' and 'phone' entries.
                                              - A dictionary maps column names to
                                                fill values (e.g., {'email': 'no_email', 'phone': 'no_phone'})
                                                and is passed to DataFrame.fillna unchanged.
                                              - Required if strategy is 'fill';
                                                ignored otherwise.

    Returns:
        pandas.DataFrame: The DataFrame after applying the handling strategy.
                          Returns an empty DataFrame if the file cannot be read,
                          if required columns are missing, if the strategy is
                          invalid, or if 'fill' is requested without a fill_value.
    """
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' not found. Please check the path.")
        return pd.DataFrame()
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return pd.DataFrame()

    required_cols = ['email', 'phone']
    if not all(col in df.columns for col in required_cols):
        print(f"Error: The DataFrame must contain '{required_cols[0]}' and '{required_cols[1]}' columns.")
        return pd.DataFrame()

    print(f"\nOriginal DataFrame ({file_path}):")
    print(df)
    print("\nRecords with missing 'email' or 'phone':")

    # Identify records with missing 'email' or 'phone'
    missing_email_or_phone = df[df['email'].isnull() | df['phone'].isnull()]
    if missing_email_or_phone.empty:
        print("No records with missing 'email' or 'phone' found.")
        return df # Return original if nothing to handle

    print(missing_email_or_phone)

    # dropna / fillna below return new DataFrames, so df itself is never
    # modified and no defensive copy is needed.
    if strategy == 'drop':
        print("\nStrategy: Dropping records with missing 'email' or 'phone'.")
        # Drop rows where 'email' OR 'phone' is NaN
        return df.dropna(subset=required_cols, how='any')

    if strategy == 'fill':
        if fill_value is None:
            # Passing None through to fillna raises ValueError on pandas 2.x;
            # report the misuse the same way as the other error paths instead.
            print("Error: Strategy 'fill' requires a fill_value.")
            return pd.DataFrame()

        print(f"\nStrategy: Filling missing 'email' or 'phone' with: {fill_value}")
        if isinstance(fill_value, dict):
            # Fill specified columns with dictionary values
            return df.fillna(fill_value)
        # Fill only the required columns with the single fill_value
        return df.fillna(dict.fromkeys(required_cols, fill_value))

    print(f"Error: Invalid strategy '{strategy}'. Use 'drop' or 'fill'.")
    return pd.DataFrame()

# --- Example Usage (requires creating a dummy CSV file) ---
if __name__ == "__main__":
    import os

    # Demonstration driver for handle_partially_available_records: one shared
    # fixture is run through the drop strategy and both fill variants,
    # followed by two error scenarios.
    customer_data = dict(
        customer_id=[1, 2, 3, 4, 5, 6],
        name=['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank'],
        email=['alice@example.com', np.nan, 'charlie@example.com', 'david@example.com', np.nan, 'frank@example.com'],
        phone=['111-222-3333', '444-555-6666', np.nan, '777-888-9999', '000-111-2222', np.nan],
        age=[25, 30, 35, 40, 22, 50],
    )
    df_customer = pd.DataFrame(customer_data)
    df_customer.to_csv('customer_data.csv', index=False)

    print("--- Task 4: Handling Partially Available Records ---")

    # Scenario 1: drop incomplete rows.
    # Bob and Eve lack an email, Charlie and Frank lack a phone, so only
    # Alice and David should remain.
    print("\n--- Scenario 1: Dropping records with missing email or phone ---")
    processed_df_drop = handle_partially_available_records('customer_data.csv', strategy='drop')
    print("\nDataFrame after dropping:")
    print(processed_df_drop)

    # Scenario 2: fill every gap in email/phone with one default string.
    print("\n--- Scenario 2: Filling missing records with 'N/A' ---")
    processed_df_fill_str = handle_partially_available_records(
        'customer_data.csv', strategy='fill', fill_value='N/A'
    )
    print("\nDataFrame after filling with 'N/A':")
    print(processed_df_fill_str)

    # Scenario 3: fill email and phone with their own replacement values.
    print("\n--- Scenario 3: Filling missing records with specific values ---")
    fill_dict = dict(email='missing@domain.com', phone='999-999-9999')
    processed_df_fill_dict = handle_partially_available_records(
        'customer_data.csv', strategy='fill', fill_value=fill_dict
    )
    print("\nDataFrame after filling with specific values:")
    print(processed_df_fill_dict)

    # Scenario 4: the path does not exist.
    print("\n--- Scenario 4: Non-existent file ---")
    non_existent_df = handle_partially_available_records('non_existent_customer.csv', strategy='drop')
    print("Result for non-existent file:", non_existent_df)

    # Scenario 5: the file has neither an 'email' nor a 'phone' column.
    print("\n--- Scenario 5: Missing required columns ---")
    pd.DataFrame(dict(id=[1], name=['Test'])).to_csv('bad_customer_data.csv', index=False)
    missing_cols_df = handle_partially_available_records('bad_customer_data.csv', strategy='drop')
    print("Result for missing columns:", missing_cols_df)

    # Remove the fixtures written above.
    for leftover in ('customer_data.csv', 'bad_customer_data.csv'):
        os.remove(leftover)

--- Task 4: Handling Partially Available Records ---

--- Scenario 1: Dropping records with missing email or phone ---

Original DataFrame (customer_data.csv):
   customer_id     name                email         phone  age
0            1    Alice    alice@example.com  111-222-3333   25
1            2      Bob                  NaN  444-555-6666   30
2            3  Charlie  charlie@example.com           NaN   35
3            4    David    david@example.com  777-888-9999   40
4            5      Eve                  NaN  000-111-2222   22
5            6    Frank    frank@example.com           NaN   50

Records with missing 'email' or 'phone':
   customer_id     name                email         phone  age
1            2      Bob                  NaN  444-555-6666   30
2            3  Charlie  charlie@example.com           NaN   35
4            5      Eve                  NaN  000-111-2222   22
5            6    Frank    frank@example.com           NaN   50

Strategy: Dropping records wi