### Task 1: Measure Data Accuracy using a Trusted Source

**Description**: You have two datasets of product prices: `company_prices.csv` and
`trusted_prices.csv`. Check whether the prices in `company_prices.csv` match the prices in
`trusted_prices.csv`. Assume both files have a "product_id" and a "price" column.

In [1]:
# Write your code from here
# Write your code from here
import pandas as pd

def measure_price_accuracy(company_file, trusted_file, tolerance=1e-9):
    """
    Compares product prices between a company's dataset and a trusted source.

    Only products whose product_id appears in both files are compared (inner
    join); products present in just one file are left out of the result.

    Args:
        company_file (str): Path to the CSV file containing company prices.
        trusted_file (str): Path to the CSV file containing trusted prices.
        tolerance (float): Largest absolute difference at which two numeric
                           prices are still treated as equal. Guards against
                           floating-point representation noise.

    Returns:
        pandas.DataFrame: A DataFrame with the columns product_id, price_company,
                          price_trusted and prices_match (boolean). The frame is
                          empty, but keeps these columns, when the two files
                          share no product IDs. Returns None if a file is not
                          found or a required column is missing.
    """
    try:
        company_prices_df = pd.read_csv(company_file)
        trusted_prices_df = pd.read_csv(trusted_file)
    except FileNotFoundError as e:
        print(f"Error: One or both files not found: {e}")
        return None

    # Validate the schema up front so a malformed file produces a clear message
    # instead of a KeyError raised from inside the merge.
    required_columns = ['product_id', 'price']
    for label, frame in (('company', company_prices_df), ('trusted', trusted_prices_df)):
        missing = [col for col in required_columns if col not in frame.columns]
        if missing:
            print(f"Error: {label} file is missing required column(s): {missing}")
            return None

    # Merge the two DataFrames based on product_id
    merged_df = pd.merge(company_prices_df, trusted_prices_df, on='product_id', suffixes=('_company', '_trusted'))

    if merged_df.empty:
        # Keep going: the comparison below yields an empty boolean column, so
        # callers always receive the same set of columns.
        print("No matching product IDs found between the two files.")

    # Exact equality is kept so anything that matched before still matches
    # (including identical non-numeric values); the tolerance check additionally
    # accepts numeric prices that differ only by floating-point noise.
    exact_match = merged_df['price_company'] == merged_df['price_trusted']
    price_gap = (
        pd.to_numeric(merged_df['price_company'], errors='coerce')
        - pd.to_numeric(merged_df['price_trusted'], errors='coerce')
    )
    merged_df['prices_match'] = exact_match | (price_gap.abs() <= tolerance)

    # Select and return the relevant columns
    return merged_df[['product_id', 'price_company', 'price_trusted', 'prices_match']]

# Sample data for the demonstration: product 103 is priced differently in the
# two sources, while 104 and 106 each appear in only one of them.
company_data = {
    'product_id': [101, 102, 103, 104, 105],
    'price': [25.00, 50.50, 12.75, 75.00, 30.25],
}
trusted_data = {
    'product_id': [101, 102, 103, 106, 105],
    'price': [25.00, 50.50, 15.00, 80.00, 30.25],
}

# Persist each dataset as a CSV file so the function can be exercised end to end.
company_df = pd.DataFrame(company_data)
company_df.to_csv('company_prices.csv', index=False)

trusted_df = pd.DataFrame(trusted_data)
trusted_df.to_csv('trusted_prices.csv', index=False)

# Compare the two files that were just written.
accuracy_report = measure_price_accuracy('company_prices.csv', 'trusted_prices.csv')

if accuracy_report is not None:
    print(accuracy_report)

    # The mean of the boolean column is the fraction of compared products that
    # match; scale it to a percentage for display.
    match_fraction = accuracy_report['prices_match'].mean()
    overall_accuracy = match_fraction * 100
    print(f"\nOverall Price Accuracy: {overall_accuracy:.2f}%")

   product_id  price_company  price_trusted  prices_match
0         101          25.00          25.00          True
1         102          50.50          50.50          True
2         103          12.75          15.00         False
3         105          30.25          30.25          True

Overall Price Accuracy: 75.00%


### Task 2: Detect Incorrect Values

**Description**: In `company_prices.csv`, detect any negative price values, which are invalid for prices.

In [2]:
# Write your code from here
# Write your code from here
import pandas as pd

def detect_incorrect_prices(company_file):
    """
    Detects negative price values in a company's product prices dataset.

    Args:
        company_file (str): Path to the CSV file containing company prices.

    Returns:
        pandas.DataFrame: The rows whose price is negative. The frame is empty
                          (but keeps the file's columns) when no negative
                          prices are found. Returns None if the file is not
                          found or has no 'price' column.
    """
    try:
        company_prices_df = pd.read_csv(company_file)
    except FileNotFoundError as e:
        print(f"Error: File not found: {e}")
        return None

    # Check if the 'price' column exists
    if 'price' not in company_prices_df.columns:
        print("Error: 'price' column not found in the CSV file.")
        return None

    # Coerce to numbers for the comparison only: a stray text cell would turn
    # the whole column into strings and make `< 0` raise a TypeError. Values
    # that cannot be parsed become NaN and are therefore never flagged as
    # negative; the returned rows still carry the original cell values.
    numeric_prices = pd.to_numeric(company_prices_df['price'], errors='coerce')
    incorrect_prices_df = company_prices_df[numeric_prices < 0]

    if incorrect_prices_df.empty:
        print("No incorrect (negative) price values found.")
    else:
        print("Detected incorrect (negative) price values:")

    return incorrect_prices_df

# Demonstration data: products 202 and 204 carry negative prices on purpose.
company_data_with_errors = {
    'product_id': [201, 202, 203, 204, 205],
    'price': [10.00, -5.50, 20.25, -1.00, 30.00],
}
company_df_with_errors = pd.DataFrame(company_data_with_errors)
# Write the sample to disk so the detector reads it the same way it would a real file.
company_df_with_errors.to_csv('company_prices.csv', index=False)

# Run the detector on the file just written.
incorrect_prices_report = detect_incorrect_prices('company_prices.csv')

# Only print the report when the call succeeded and actually flagged rows.
if incorrect_prices_report is not None:
    if not incorrect_prices_report.empty:
        print(incorrect_prices_report)

Detected incorrect (negative) price values:
   product_id  price
1         202   -5.5
3         204   -1.0


### Task 3: Check Missing Data Rates

**Description**: Calculate the percentage of missing values in each column of `customer_data.csv`.

In [3]:
# Write your code from here
# Write your code from here
import pandas as pd

def calculate_missing_data_rates(file_path):
    """
    Reports, column by column, what percentage of a CSV file's rows lack a value.

    Args:
        file_path (str): Path to the CSV file.

    Returns:
        pandas.Series: One entry per column, holding the percentage of rows in
                       which that column is missing. Returns None if the file
                       is not found or if the DataFrame is empty.
    """
    try:
        frame = pd.read_csv(file_path)
    except FileNotFoundError as e:
        print(f"Error: File not found: {e}")
        return None

    # Bail out before dividing: an empty frame has no rows to take a rate over.
    if frame.empty:
        print("Error: The DataFrame is empty.")
        return None

    # Missing cells per column, divided by the row count, scaled to a percentage.
    row_count = len(frame)
    return frame.isna().sum().div(row_count).mul(100)

# Demonstration data: 'name' and 'city' each have one gap, 'age' has two.
customer_data = {
    'customer_id': [1, 2, 3, 4, 5],
    'name': ['Alice', 'Bob', None, 'Charlie', 'David'],
    'age': [25, None, 30, 35, None],
    'city': ['New York', 'London', 'Paris', None, 'Tokyo'],
}
customer_df = pd.DataFrame(customer_data)
# Write the sample to disk so the rates are computed from a real CSV file.
customer_df.to_csv('customer_data.csv', index=False)

# Compute the per-column missing-value percentages for that file.
missing_rates = calculate_missing_data_rates('customer_data.csv')

# None signals a missing file or an empty dataset; nothing to show in that case.
if missing_rates is not None:
    print("Percentage of Missing Values per Column:")
    print(missing_rates)

Percentage of Missing Values per Column:
customer_id     0.0
name           20.0
age            40.0
city           20.0
dtype: float64


### Task 4: Handling Partially Available Records

**Description**: In `customer_data.csv`, identify records with a missing "email" or "phone number" and decide whether to drop or fill them.

In [None]:
# Write your code from here
# Write your code from here
import pandas as pd

def handle_partially_available_records(file_path, strategy='drop', fill_values=None):
    """
    Identifies records with a missing "email" or "phone" value in a CSV file
    and either drops those records or fills the gaps with placeholders.

    Args:
        file_path (str): Path to the CSV file containing customer data.
        strategy (str): 'drop' (default) removes every record that lacks an
                        email or a phone; 'fill' keeps all records and replaces
                        the missing values with placeholders.
        fill_values (dict): Optional column-to-placeholder mapping used when
                            strategy is 'fill'. It is merged over the defaults
                            {'email': 'no_email_provided',
                             'phone': 'no_phone_provided'}.

    Returns:
        pandas.DataFrame: The DataFrame after handling the partially available records.
                          Returns None if the file is not found, if the DataFrame
                          is empty, or if the 'email' or 'phone' column is missing.

    Raises:
        ValueError: If strategy is neither 'drop' nor 'fill'.
    """
    # Reject a bad strategy before doing any I/O so the mistake surfaces early.
    if strategy not in ('drop', 'fill'):
        raise ValueError(f"strategy must be 'drop' or 'fill', got {strategy!r}")

    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError as e:
        print(f"Error: File not found: {e}")
        return None
    if df.empty:
        print("Error: The DataFrame is empty.")
        return None

    # Validate the schema so a file without contact columns yields a clear
    # message instead of a KeyError.
    contact_columns = ['email', 'phone']
    missing_columns = [col for col in contact_columns if col not in df.columns]
    if missing_columns:
        print(f"Error: Required column(s) not found in the CSV file: {missing_columns}")
        return None

    # Identify records with missing "email" or "phone"
    missing_contact_info = df[df['email'].isnull() | df['phone'].isnull()]

    print("Records with missing 'email' or 'phone number':")
    print(missing_contact_info)

    if strategy == 'fill':
        # Fill only via an explicit column mapping so unrelated columns with
        # gaps are left untouched unless the caller names them.
        placeholders = {'email': 'no_email_provided', 'phone': 'no_phone_provided'}
        if fill_values:
            placeholders.update(fill_values)
        df_filled = df.fillna(value=placeholders)
        print("\nDataFrame after filling missing 'email' and 'phone' with placeholders:")
        print(df_filled)
        return df_filled

    # Default: drop records with missing "email" or "phone"
    df_dropped = df.dropna(subset=contact_columns)
    print("\nDataFrame after dropping records with missing 'email' or 'phone':")
    print(df_dropped)

    return df_dropped

# Demonstration data: customers 2 and 4 have no email, customers 3 and 5 have
# no phone, and customers 1 and 6 are complete.
customer_data_partial = {
    'customer_id': [1, 2, 3, 4, 5, 6],
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank'],
    'email': ['alice@example.com', None, 'charlie@example.com', None, 'eve@example.com', 'frank@example.com'],
    'phone': ['123-456-7890', '987-654-3210', None, '555-123-4567', None, '111-222-3333'],
}
customer_df_partial = pd.DataFrame(customer_data_partial)
# Write the sample to disk; this replaces any existing customer_data.csv.
customer_df_partial.to_csv('customer_data.csv', index=False)

# Run the handler on the file just written; it prints its own report.
handled_df = handle_partially_available_records('customer_data.csv')

Records with missing 'email' or 'phone number':
   customer_id     name                email         phone
1            2      Bob                  NaN  987-654-3210
2            3  Charlie  charlie@example.com           NaN
3            4    David                  NaN  555-123-4567
4            5      Eve      eve@example.com           NaN

DataFrame after dropping records with missing 'email' or 'phone':
   customer_id   name              email         phone
0            1  Alice  alice@example.com  123-456-7890
5            6  Frank  frank@example.com  111-222-3333
