### Task 1: Measure Data Accuracy using a Trusted Source

**Description**: You have two datasets of product prices: `company_prices.csv` and
`trusted_prices.csv` . Check if the prices in `company_prices.csv` match the prices in
`trusted_prices.csv` . Assume both files have a "product_id" and "price" column.

In [1]:
import pandas as pd

def check_price_accuracy(company_file, trusted_file, product_id_col="product_id", price_col="price"):
    """
    Checks the accuracy of product prices in one file against a trusted source.

    Args:
        company_file (str): Path to the CSV file containing the company's product prices.
        trusted_file (str): Path to the CSV file containing the trusted product prices.
        product_id_col (str, optional): Name of the product ID column. Defaults to "product_id".
        price_col (str, optional): Name of the price column. Defaults to "price".

    Returns:
        pandas.DataFrame: A DataFrame with the following columns:
            - product_id: The product ID.
            - company_price: The price from the company file.
            - trusted_price: The price from the trusted file.
            - price_match: Boolean indicating if the prices match (True) or not (False).
            - absolute_difference: Absolute difference between the two prices
        float: The price accuracy as a percentage.
    """
    try:
        # Read the CSV files into Pandas DataFrames
        company_prices_df = pd.read_csv(company_file)
        trusted_prices_df = pd.read_csv(trusted_file)
    except FileNotFoundError as e:
        print(f"Error: {e}")
        return None, None  # Important: Return None, None on error

    # Check if the required columns exist in both DataFrames
    if product_id_col not in company_prices_df.columns or price_col not in company_prices_df.columns:
        print(f"Error: Missing required columns in company file.  Expected '{product_id_col}' and '{price_col}'.")
        return None, None
    if product_id_col not in trusted_prices_df.columns or price_col not in trusted_prices_df.columns:
        print(f"Error: Missing required columns in trusted file. Expected '{product_id_col}' and '{price_col}'.")
        return None, None

    # Merge the DataFrames on the product ID
    merged_df = pd.merge(company_prices_df, trusted_prices_df, on=product_id_col, suffixes=('_company', '_trusted'))

    # Calculate price match and absolute difference
    merged_df['price_match'] = merged_df[f'{price_col}_company'] == merged_df[f'{price_col}_trusted']
    merged_df['absolute_difference'] = (merged_df[f'{price_col}_company'] - merged_df[f'{price_col}_trusted']).abs()


    # Calculate price accuracy
    price_accuracy = merged_df['price_match'].mean() * 100 if not merged_df.empty else 0.0

    return merged_df, price_accuracy



def main():
    """
    Main function to run the price accuracy check and print the results.
    """
    # Provide the paths to your CSV files
    company_file = 'company_prices.csv'  # Replace with your actual file path
    trusted_file = 'trusted_prices.csv'  # Replace with your actual file path

    # Create dummy CSV files for demonstration if they don't exist
    try:
        with open(company_file, 'w') as f:
            f.write("product_id,price\n1,10.00\n2,20.00\n3,30.00\n4,40.00\n5,50.00")
        with open(trusted_file, 'w') as f:
            f.write("product_id,price\n1,10.00\n2,20.50\n3,30.00\n4,41.00\n5,50.00")
    except FileExistsError:
        pass #Handle the case where the files already exist.

    # Check price accuracy
    accuracy_df, price_accuracy = check_price_accuracy(company_file, trusted_file)

    # Print the results
    if accuracy_df is not None and price_accuracy is not None: #Check for None returns from the function
        print("Price Accuracy Check Results:")
        print(accuracy_df.to_string(index=False))  # Use to_string for better console output
        print(f"\nPrice Accuracy: {price_accuracy:.2f}%")
    else:
        print("Error occurred during price accuracy check.  Please check the file paths and column names.")

if __name__ == "__main__":
    main()


Price Accuracy Check Results:
 product_id  price_company  price_trusted  price_match  absolute_difference
          1           10.0           10.0         True                  0.0
          2           20.0           20.5        False                  0.5
          3           30.0           30.0         True                  0.0
          4           40.0           41.0        False                  1.0
          5           50.0           50.0         True                  0.0

Price Accuracy: 60.00%


### Task 2: Detect Incorrect Values

**Description**: In `company_prices.csv` , detect any negative price values which are incorrect values for prices.

In [2]:
import pandas as pd

def check_price_accuracy(company_file, trusted_file, product_id_col="product_id", price_col="price"):
    """
    Checks the accuracy of product prices in one file against a trusted source.

    Args:
        company_file (str): Path to the CSV file containing the company's product prices.
        trusted_file (str): Path to the CSV file containing the trusted product prices.
        product_id_col (str, optional): Name of the product ID column. Defaults to "product_id".
        price_col (str, optional): Name of the price column. Defaults to "price".

    Returns:
        pandas.DataFrame: A DataFrame with the following columns:
            - product_id: The product ID.
            - company_price: The price from the company file.
            - trusted_price: The price from the trusted file.
            - price_match: Boolean indicating if the prices match (True) or not (False).
            - absolute_difference: Absolute difference between the two prices
        float: The price accuracy as a percentage.
    """
    try:
        # Read the CSV files into Pandas DataFrames
        company_prices_df = pd.read_csv(company_file)
        trusted_prices_df = pd.read_csv(trusted_file)
    except FileNotFoundError as e:
        print(f"Error: {e}")
        return None, None  # Important: Return None, None on error

    # Check if the required columns exist in both DataFrames
    if product_id_col not in company_prices_df.columns or price_col not in company_prices_df.columns:
        print(f"Error: Missing required columns in company file.  Expected '{product_id_col}' and '{price_col}'.")
        return None, None
    if product_id_col not in trusted_prices_df.columns or product_id_col not in trusted_prices_df.columns:
        print(f"Error: Missing required columns in trusted file. Expected '{product_id_col}' and '{price_col}'.")
        return None, None

    # Merge the DataFrames on the product ID
    merged_df = pd.merge(company_prices_df, trusted_prices_df, on=product_id_col, suffixes=('_company', '_trusted'))

    # Calculate price match and absolute difference
    merged_df['price_match'] = merged_df[f'{price_col}_company'] == merged_df[f'{price_col}_trusted']
    merged_df['absolute_difference'] = (merged_df[f'{price_col}_company'] - merged_df[f'{price_col}_trusted']).abs()


    # Calculate price accuracy
    price_accuracy = merged_df['price_match'].mean() * 100 if not merged_df.empty else 0.0

    return merged_df, price_accuracy


def detect_negative_prices(company_file, price_col="price"):
    """
    Detects negative price values in a CSV file.

    Args:
        company_file (str): Path to the CSV file containing the company's product prices.
        price_col (str, optional): Name of the price column. Defaults to "price".

    Returns:
        pandas.DataFrame: A DataFrame containing rows with negative prices, or an empty DataFrame if no negative prices are found.
    """
    try:
        # Read the CSV file into a Pandas DataFrame
        company_prices_df = pd.read_csv(company_file)
    except FileNotFoundError as e:
        print(f"Error: {e}")
        return pd.DataFrame()  # Return an empty DataFrame on error

    # Check if the price column exists
    if price_col not in company_prices_df.columns:
        print(f"Error: Missing price column in company file. Expected '{price_col}'.")
        return pd.DataFrame()  # Return an empty DataFrame on error

    # Filter for rows where the price is negative
    negative_price_df = company_prices_df[company_prices_df[price_col] < 0]
    return negative_price_df



def main():
    """
    Main function to run the price accuracy check, detect negative prices, and print the results.
    """
    # Provide the paths to your CSV files
    company_file = 'company_prices.csv'  # Replace with your actual file path
    trusted_file = 'trusted_prices.csv'  # Replace with your actual file path

    # Create dummy CSV files for demonstration if they don't exist
    try:
        with open(company_file, 'w') as f:
            f.write("product_id,price\n1,10.00\n2,20.00\n3,-30.00\n4,40.00\n5,-50.00")  # Added negative prices
        with open(trusted_file, 'w') as f:
            f.write("product_id,price\n1,10.00\n2,20.50\n3,30.00\n4,41.00\n5,50.00")
    except FileExistsError:
        pass #Handle the case where the files already exist.

    # Check price accuracy
    accuracy_df, price_accuracy = check_price_accuracy(company_file, trusted_file)

    # Print the results
    if accuracy_df is not None and price_accuracy is not None: #Check for None returns from the function
        print("Price Accuracy Check Results:")
        print(accuracy_df.to_string(index=False))
        print(f"\nPrice Accuracy: {price_accuracy:.2f}%")
    else:
        print("Error occurred during price accuracy check.  Please check the file paths and column names.")

    # Detect negative prices
    negative_prices_df = detect_negative_prices(company_file)

    # Print negative price detection results
    if not negative_prices_df.empty:
        print("\nNegative Prices Detected:")
        print(negative_prices_df.to_string(index=False))
    else:
        print("\nNo negative prices detected.")



if __name__ == "__main__":
    main()


Price Accuracy Check Results:
 product_id  price_company  price_trusted  price_match  absolute_difference
          1           10.0           10.0         True                  0.0
          2           20.0           20.5        False                  0.5
          3          -30.0           30.0        False                 60.0
          4           40.0           41.0        False                  1.0
          5          -50.0           50.0        False                100.0

Price Accuracy: 20.00%

Negative Prices Detected:
 product_id  price
          3  -30.0
          5  -50.0


### Task 3: Check Missing Data Rates

**Description**: Calculate the percentage of missing values in `customer_data.csv` .

In [3]:
import pandas as pd

def calculate_missing_data_rates(csv_file):
    """
    Calculates the percentage of missing values for each column in a CSV file.

    Args:
        csv_file (str): Path to the CSV file.

    Returns:
        pandas.Series: A Series containing the percentage of missing values for each column.
                       Returns None if the file does not exist or other error occurs.
    """
    try:
        # Read the CSV file into a Pandas DataFrame
        df = pd.read_csv(csv_file)
    except FileNotFoundError:
        print(f"Error: File not found at {csv_file}")
        return None
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return None

    # Calculate the number of missing values for each column
    missing_values = df.isnull().sum()

    # Calculate the percentage of missing values for each column
    total_rows = len(df)
    missing_percentage = (missing_values / total_rows) * 100 if total_rows > 0 else pd.Series(dtype='float64')

    return missing_percentage



def main():
    """
    Main function to run the missing data rate calculation and print the results.
    """
    # Provide the path to your CSV file
    csv_file = 'customer_data.csv'  # Replace with your actual file path

    # Create a dummy CSV file for demonstration if it doesn't exist
    try:
        with open(csv_file, 'w') as f:
            f.write("customer_id,name,age,city,email\n1,Alice,25,New York,alice@example.com\n2,Bob,30,London,\n3,Charlie,,Paris,charlie@example.net\n4,David,28,Tokyo,david@test.org\n5,Eve,22,,New York,eve@example.com")
    except FileExistsError:
        pass # Handle the case where the file already exists

    # Calculate missing data rates
    missing_percentage = calculate_missing_data_rates(csv_file)

    # Print the results
    if missing_percentage is not None:
        print("Missing Data Rates:")
        print(missing_percentage.to_string())  # Use to_string() for better formatting
    else:
        print("Error occurred while calculating missing data rates.")



if __name__ == "__main__":
    main()


Error reading CSV file: Error tokenizing data. C error: Expected 5 fields in line 6, saw 6

Error occurred while calculating missing data rates.


### Task 4: Handling Partially Available Records

**Description**: In `customer_data.csv` , identify records with missing "email" or "phone number" and decide whether to drop or fill them.

In [4]:
import pandas as pd

def calculate_missing_data_rates(csv_file):
    """
    Calculates the percentage of missing values for each column in a CSV file.

    Args:
        csv_file (str): Path to the CSV file.

    Returns:
        pandas.Series: A Series containing the percentage of missing values for each column.
                       Returns None if the file does not exist or other error occurs.
    """
    try:
        # Read the CSV file into a Pandas DataFrame
        df = pd.read_csv(csv_file)
    except FileNotFoundError:
        print(f"Error: File not found at {csv_file}")
        return None
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return None

    # Calculate the number of missing values for each column
    missing_values = df.isnull().sum()

    # Calculate the percentage of missing values for each column
    total_rows = len(df)
    missing_percentage = (missing_values / total_rows) * 100 if total_rows > 0 else pd.Series(dtype='float64')

    return missing_percentage



def handle_missing_email_phone(csv_file, drop_or_fill='drop', fill_value_email='', fill_value_phone=''):
    """
    Identifies records with missing "email" or "phone number" and either drops or fills them.

    Args:
        csv_file (str): Path to the CSV file.
        drop_or_fill (str, optional): 'drop' to drop records, 'fill' to fill missing values.
            Defaults to 'drop'.
        fill_value_email (str, optional): Value to fill missing "email" values with if drop_or_fill is 'fill'.
            Defaults to ''.
        fill_value_phone (str, optional): Value to fill missing "phone number" values with if drop_or_fill is 'fill'.
            Defaults to ''.

    Returns:
        pandas.DataFrame: The modified DataFrame after handling missing values.
                        Returns None if the file does not exist or other error occurs.
    """
    try:
        # Read the CSV file into a Pandas DataFrame
        df = pd.read_csv(csv_file)
    except FileNotFoundError:
        print(f"Error: File not found at {csv_file}")
        return None
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return None

    # Check if the columns exist
    if 'email' not in df.columns and 'phone_number' not in df.columns:
        print(f"Error: Neither 'email' nor 'phone_number' columns found in the file.")
        return None
    elif 'email' not in df.columns:
        print(f"Error: 'email' column not found in the file.")
        if drop_or_fill == 'fill':
            print(f"Note: Only 'phone_number' column will be processed")
    elif 'phone_number' not in df.columns:
        print(f"Error: 'phone_number' column not found in the file.")
        if drop_or_fill == 'fill':
            print(f"Note: Only 'email' column will be processed")

    # Handle missing values based on user choice
    if drop_or_fill == 'drop':
        print("Dropping records with missing 'email' or 'phone_number'...")
        df.dropna(subset=['email', 'phone_number'], how='any', inplace=True)
    elif drop_or_fill == 'fill':
        print(f"Filling missing 'email' with '{fill_value_email}' and missing 'phone_number' with '{fill_value_phone}'...")
        if 'email' in df.columns:
            df['email'].fillna(fill_value_email, inplace=True)
        if 'phone_number' in df.columns:
            df['phone_number'].fillna(fill_value_phone, inplace=True)
    else:
        print("Error: Invalid value for 'drop_or_fill'.  Must be 'drop' or 'fill'.")
        return df  # returning the dataframe without any changes.

    return df



def main():
    """
    Main function to run the missing data rate calculation, handle missing email/phone,
    and print the results.
    """
    # Provide the path to your CSV file
    csv_file = 'customer_data.csv'  # Replace with your actual file path

    # Create a dummy CSV file
    try:
        with open(csv_file, 'w') as f:
            f.write("customer_id,name,age,city,email,phone_number\n1,Alice,25,New York,alice@example.com,123-456-7890\n2,Bob,30,London,,987-654-3210\n3,Charlie,,Paris,charlie@example.net,\n4,David,28,Tokyo,david@test.org,\n5,Eve,22,,New York,eve@example.com,555-123-4567")
    except FileExistsError:
        pass

    # Calculate missing data rates
    missing_percentage = calculate_missing_data_rates(csv_file)

    # Print the results
    if missing_percentage is not None:
        print("Missing Data Rates:")
        print(missing_percentage.to_string())
    else:
        print("Error occurred while calculating missing data rates.")

    # Handle missing email/phone numbers (Example: drop records)
    print("\nHandling missing email/phone numbers - Dropping records:")
    df_dropped = handle_missing_email_phone(csv_file, drop_or_fill='drop')
    if df_dropped is not None:
        print("DataFrame after dropping records:")
        print(df_dropped)

    # Handle missing email/phone numbers (Example: fill records)
    print("\nHandling missing email/phone numbers - Filling records:")
    df_filled = handle_missing_email_phone(csv_file, drop_or_fill='fill', fill_value_email='unknown@example.com', fill_value_phone='N/A')
    if df_filled is not None:
        print("DataFrame after filling records:")
        print(df_filled)



if __name__ == "__main__":
    main()


Error reading CSV file: Error tokenizing data. C error: Expected 6 fields in line 6, saw 7

Error occurred while calculating missing data rates.

Handling missing email/phone numbers - Dropping records:
Error reading CSV file: Error tokenizing data. C error: Expected 6 fields in line 6, saw 7


Handling missing email/phone numbers - Filling records:
Error reading CSV file: Error tokenizing data. C error: Expected 6 fields in line 6, saw 7

