# Measuring Data Accuracy

**Activity Overview**: Assess the accuracy of a dataset by comparing its values against a trusted source and flagging any incorrect values or mismatches.

## Title: Product Pricing

**Task**: Compare a dataset of product prices with the latest official price list.

**Steps**:
1. Obtain the latest product price list from the official company website.
2. Compare the dataset's product prices against the verified list.
3. Identify any discrepancies and mark them for correction.

In [3]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import logging

# Route log records (INFO and above) to a timestamped log file
logging.basicConfig(
    filename='price_discrepancy_log.txt',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

def get_official_prices_from_web(url, product_id_selector, price_selector, timeout=10):
    """
    Scrapes product prices from an official company website.

    Product IDs and prices are selected independently and paired by position,
    so the two selectors must match the same number of elements in the same
    order.

    Args:
        url (str): The URL of the website.
        product_id_selector (str): CSS selector for the product IDs on the website.
        price_selector (str): CSS selector for the prices on the website.
        timeout (float, optional): Seconds to wait for the server before giving
            up. Defaults to 10.

    Returns:
        dict: A dictionary mapping product IDs (str) to prices (float), or None
        on error (request failure, count mismatch, or unexpected parsing error).
        Prices that cannot be converted to a number are skipped. The dictionary
        is empty when the selectors match nothing.
    """
    try:
        # Without a timeout, requests waits indefinitely on an unresponsive server
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for bad status codes

        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract product IDs and prices using the provided selectors
        product_ids = [item.text.strip() for item in soup.select(product_id_selector)]
        prices = [item.text.strip() for item in soup.select(price_selector)]

        # Basic data validation: positional pairing is only meaningful if lengths match
        if len(product_ids) != len(prices):
            logging.error(f"Error: Length mismatch between product IDs ({len(product_ids)}) and prices ({len(prices)}) on {url}")
            return None

        if not product_ids:
            # Most likely wrong selectors; make that visible instead of logging silent success
            logging.warning(f"Warning: Selectors '{product_id_selector}' and '{price_selector}' matched no elements on {url}")

        # Convert prices to numeric, handle potential errors (e.g., non-numeric prices)
        official_prices = {}
        for product_id, price_str in zip(product_ids, prices):
            try:
                price = float(price_str.replace('$', '').replace(',', ''))  # Remove '$' and ','
            except ValueError:
                logging.warning(f"Warning: Could not convert price '{price_str}' for product ID '{product_id}' to numeric. Skipping.")
                continue  # Skip this product

            if product_id in official_prices:
                # The last occurrence wins; log it so the overwrite is not silent
                logging.warning(f"Warning: Duplicate product ID '{product_id}' on {url}. Using the last listed price.")
            official_prices[product_id] = price

        logging.info(f"Successfully retrieved {len(official_prices)} product prices from {url}")
        return official_prices

    except requests.exceptions.RequestException as e:
        logging.error(f"Error fetching data from {url}: {e}")
        return None
    except Exception as e:
        # Top-level boundary for parsing problems: log and report failure to the caller
        logging.error(f"Error processing data from {url}: {e}")
        return None



def compare_prices(dataset_file, official_prices, product_id_col="product_id", price_col="price", tolerance=0.01):
    """
    Compares product prices in a dataset with the official price list and identifies discrepancies.

    Product IDs are compared as whitespace-stripped strings on both sides, so
    an official ID of "1" matches a dataset row whose ID is written as 1.

    Args:
        dataset_file (str): Path to the CSV file containing the dataset.
        official_prices (dict): A dictionary of product IDs and their official prices.
        product_id_col (str, optional): Name of the product ID column in the dataset. Defaults to "product_id".
        price_col (str, optional): Name of the price column in the dataset. Defaults to "price".
        tolerance (float, optional): Absolute price difference below which two
            prices are considered equal. Defaults to 0.01.

    Returns:
        pandas.DataFrame: One row per discrepancy with the columns
        ``product_id_col``, 'dataset_price', 'official_price' and 'difference'
        (dataset minus official). Empty when nothing differs. None on error
        (no official prices, unreadable file, or missing columns).
    """
    if official_prices is None:
        logging.error("Error: No official prices provided for comparison.")
        return None

    try:
        # Read IDs as text: scraped official IDs are strings, and letting pandas
        # infer integers would make every lookup miss (and drop leading zeros).
        df = pd.read_csv(dataset_file, dtype={product_id_col: str})
        logging.info(f"Dataset '{dataset_file}' loaded successfully.")
    except FileNotFoundError:
        logging.error(f"Error: Dataset file '{dataset_file}' not found.")
        return None
    except Exception as e:
        logging.error(f"Error reading dataset file '{dataset_file}': {e}")
        return None

    # Check if the required columns exist
    if product_id_col not in df.columns or price_col not in df.columns:
        logging.error(f"Error: Missing required columns in dataset file. Expected '{product_id_col}' and '{price_col}'.")
        return None

    ids = df[product_id_col].str.strip()
    # Non-numeric prices become NaN instead of raising during the subtraction below
    prices = pd.to_numeric(df[price_col], errors="coerce")

    has_id = ids.notna()
    if ids[has_id].duplicated().any():
        logging.warning(f"Warning: Duplicate product IDs in dataset '{dataset_file}'. Using the last row for each ID.")

    # Later rows overwrite earlier ones, so the last row for an ID wins
    dataset_prices = dict(zip(ids[has_id].tolist(), prices[has_id].tolist()))

    # Identify discrepancies
    discrepancies = []
    for product_id, official_price in official_prices.items():
        key = str(product_id).strip()
        if key not in dataset_prices:
            logging.warning(f"Warning: Product ID '{product_id}' not found in dataset.")
            continue

        dataset_price = dataset_prices[key]
        difference = dataset_price - official_price
        # Written as "not <" so a NaN (missing/invalid dataset price) is flagged too
        if not abs(difference) < tolerance:
            discrepancies.append({
                product_id_col: key,
                'dataset_price': dataset_price,
                'official_price': official_price,
                'difference': difference
            })
            logging.warning(f"Discrepancy found for product ID '{product_id}': Dataset price = {dataset_price:.2f}, Official price = {official_price:.2f}")

    if not discrepancies:
        logging.info("No price discrepancies found.")

    # Fixed column list keeps the schema identical for empty and non-empty results
    return pd.DataFrame(
        discrepancies,
        columns=[product_id_col, 'dataset_price', 'official_price', 'difference'],
    )



def main():
    """
    Main function to orchestrate the price comparison process.

    Fetches the official prices, makes sure a dataset CSV exists (creating a
    small dummy one only when the file is absent), compares the two, and prints
    the outcome. Details of any failure are written to the log.
    """
    # 1. Obtain the latest product price list from the official company website.
    #    Replace with the actual URL and CSS selectors for the website you are targeting.
    official_prices_url = "https://www.example.com/official_prices"  # Placeholder URL
    product_id_selector = ".product-id"  # Placeholder selector
    price_selector = ".price"  # Placeholder selector

    official_prices = get_official_prices_from_web(official_prices_url, product_id_selector, price_selector)

    # None signals an error; an empty dict means the selectors matched nothing.
    # Either way there is nothing to compare against, and continuing would
    # misleadingly report "no discrepancies".
    if not official_prices:
        print("Failed to retrieve official prices. Please check the URL and selectors.")
        return  # Exit if we can't get the official prices

    # 2. Compare the dataset's product prices against the verified list.
    #    Replace with the actual path to your dataset CSV file.
    dataset_file = "product_prices.csv"  # Placeholder file

    # Create a dummy dataset only if none exists. Mode 'x' fails with
    # FileExistsError instead of overwriting a real dataset.
    try:
        with open(dataset_file, 'x') as f:
            f.write("product_id,price\n1,10.00\n2,20.50\n3,30.00\n4,45.00\n5,50.00")
    except FileExistsError:
        pass  # Keep the existing dataset untouched

    discrepancy_df = compare_prices(dataset_file, official_prices)

    # 3. Identify any discrepancies and mark them for correction.
    if discrepancy_df is None:
        print("An error occurred during price comparison.  Check the logs for details.")
        return

    if discrepancy_df.empty:
        print("No price discrepancies found.")
        return

    print("Price discrepancies found:")
    print(discrepancy_df.to_string(index=False))  # Use to_string() for better console output
    #  You would add code here to update your data source (e.g., a database or CSV file)
    #  to correct the prices.  This is application-specific.


# Run the comparison only when this file is executed as a script, not when imported
if __name__ == "__main__":
    main()


ModuleNotFoundError: No module named 'bs4'