# Measuring Completeness

**Activity Overview**: Evaluate data completeness by checking missing data rates and handling partially available records.

## Title: Customer Profiles

**Task**: Calculate the missing data rate for customer profiles.

**Steps**:
1. List all required fields for a complete customer profile (e.g., name, address, email,
phone number).
2. Analyze the dataset to count how many profiles have missing fields.
3. Calculate the percentage of missing data fields across all profiles.

In [1]:
import pandas as pd

def calculate_missing_data_rate(csv_file, required_fields):
    """
    Calculates the missing data rate for specified required fields in a CSV file.

    A value counts as missing when pandas reads it as null (NaN), which is what
    ``pd.read_csv`` produces for empty cells by default.

    Args:
        csv_file (str): Path to the CSV file containing customer profile data.
        required_fields (list): A list of strings representing the required fields
            (e.g., ['name', 'address', 'email', 'phone_number']).

    Returns:
        tuple: ``(missing_data_df, overall_missing_percentage)`` where

            * missing_data_df (pandas.DataFrame) is indexed by required field and
              has the columns 'Missing Count' and 'Missing Percentage';
            * overall_missing_percentage (float) is the share of missing cells
              across all required fields and all profiles, in percent.

        ``(None, None)`` is returned (after printing an error message) when the
        file cannot be read or any required field is absent from it.
    """
    try:
        # Read the CSV file into a Pandas DataFrame
        df = pd.read_csv(csv_file)
    except FileNotFoundError:
        print(f"Error: File not found at {csv_file}")
        return None, None
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return None, None

    # Materialize once so tuples/iterables work for both the check and the selection
    fields = list(required_fields)

    # Report every absent required field, not just the first one encountered
    absent_fields = [field for field in fields if field not in df.columns]
    if absent_fields:
        for field in absent_fields:
            print(f"Error: Required field '{field}' not found in the file.")
        return None, None

    # Calculate the number of missing values for each required field
    missing_values_count = df[fields].isnull().sum()

    # Calculate the percentage of missing values for each required field.
    # With zero profiles the division would be 0/0 (NaN); report 0.0 instead,
    # consistent with the overall percentage below.
    total_profiles = len(df)
    if total_profiles:
        missing_percentage = (missing_values_count / total_profiles) * 100
    else:
        missing_percentage = missing_values_count * 0.0

    # Create a DataFrame to store the results
    missing_data_df = pd.DataFrame({
        'Missing Count': missing_values_count,
        'Missing Percentage': missing_percentage
    })

    # Calculate the overall percentage of missing data (always a plain float)
    total_cells = total_profiles * len(fields)
    if total_cells:
        overall_missing_percentage = float(missing_values_count.sum() / total_cells * 100)
    else:
        overall_missing_percentage = 0.0

    return missing_data_df, overall_missing_percentage



def main():
    """
    Main function to run the missing data rate calculation and print the results.

    Creates a small demonstration CSV at the configured path only when no file
    exists there yet, then prints the per-field and overall missing data rates.
    """
    # Provide the path to your CSV file
    csv_file = 'customer_profiles.csv'  # Replace with your actual file path

    # Define the list of required fields
    required_fields = ['name', 'address', 'email', 'phone_number']

    # Demonstration data: header plus five profiles, some with empty fields
    demo_rows = [
        "customer_id,name,address,email,phone_number",
        "1,Alice,123 Main St,alice@example.com,555-1234",
        "2,Bob,,bob@example.com,555-5678",
        "3,Charlie,456 Oak Ave,,555-9012",
        "4,David,789 Pine Ln,david@example.com,",
        "5,Eve,,eve@example.com,555-2345",
    ]

    # Create a dummy CSV file for demonstration.
    # Mode 'x' creates the file only if it does not already exist, so a real
    # dataset placed at this path is never overwritten by the demo data.
    try:
        with open(csv_file, 'x', encoding='utf-8') as f:
            f.write("\n".join(demo_rows))
    except FileExistsError:
        # A file is already present; analyze it as-is.
        pass

    # Calculate missing data rate
    missing_data_df, overall_missing_percentage = calculate_missing_data_rate(csv_file, required_fields)

    # Print the results
    if missing_data_df is not None and overall_missing_percentage is not None:
        print("Missing Data Rate for Required Fields:")
        print(missing_data_df.to_string())
        print(f"\nOverall Missing Data Percentage: {overall_missing_percentage:.2f}%")
    else:
        print("Error occurred while calculating missing data rate.")



# Run the demonstration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Missing Data Rate for Required Fields:
              Missing Count  Missing Percentage
name                      0                 0.0
address                   2                40.0
email                     1                20.0
phone_number              1                20.0

Overall Missing Data Percentage: 20.00%
