## Read Data from CSV and Calculate DQI

**Description**: Read data from a CSV file, treat missing values as errors, and calculate the Data Quality Index (DQI) as the percentage of cells that are not missing.

In [4]:
import pandas as pd
import numpy as np

def calculate_dqi_from_csv(file_path):
    """
    Reads a CSV file and calculates a simple Data Quality Index (DQI).

    Every cell of the parsed DataFrame counts as one record (``df.size``),
    and every missing (null/NaN) cell counts as one error. The DQI is the
    percentage of cells that are not missing. A short summary of the
    counts and the DQI is printed to stdout.

    Args:
        file_path (str): The path to the CSV file.

    Returns:
        float or None: The Data Quality Index (DQI) as a percentage (0-100).
                       100.0 is returned (with a warning) when the file
                       holds no data at all, and None when the file does
                       not exist.
    """
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return None
    except pd.errors.EmptyDataError:
        # pandas raises this for a file with no header and no rows; treat it
        # exactly like the header-only case handled below instead of crashing.
        print("Warning: The CSV file is empty.")
        return 100.0

    total_records = df.size  # Total number of cells (rows * columns)

    if total_records == 0:
        print("Warning: The CSV file is empty.")
        return 100.0  # Consider DQI as 100% for an empty file

    # Convert the NumPy scalar to a plain int so the arithmetic below and the
    # returned value use built-in Python numbers.
    missing_values_count = int(df.isnull().sum().sum())

    valid_records_count = total_records - missing_values_count
    dqi = float(valid_records_count / total_records * 100)

    print(f"Total Records: {total_records}")
    print(f"Number of Missing Values (Errors): {missing_values_count}")
    print(f"Data Quality Index (DQI): {dqi:.2f}%")

    return dqi

# Example usage: replace 'your_data.csv' with the actual path to your CSV file.
file_path = 'your_data.csv'
dqi_value = calculate_dqi_from_csv(file_path)

# calculate_dqi_from_csv returns None when the file is not found, so only
# report a value when one was actually computed.
if dqi_value is not None:
    print(f"Calculated DQI: {dqi_value:.2f}%")

Error: File not found at 'your_data.csv'


### Visualize Basic DQI with Bar Plot

**Description**: Create a bar plot showing the DQI, the number of errors (missing values), and the total number of cells in a dataset.

In [3]:
import pandas as pd
import matplotlib.pyplot as plt

def visualize_dqi(file_path):
    """
    Reads data from a CSV file, calculates DQI and error metrics,
    and visualizes them using a bar plot.

    Every cell of the parsed DataFrame counts as one record and every
    missing (null/NaN) cell counts as one error; the DQI is the percentage
    of cells that are not missing. The plot has three bars: the DQI, the
    error count and the total cell count.

    Nothing is plotted (a message is printed instead) when the file does
    not exist or contains no data.

    Args:
        file_path (str): The path to the CSV file.
    """
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return
    except pd.errors.EmptyDataError:
        # pandas raises this for a file with no header and no rows; handle it
        # the same way as the header-only case below instead of crashing.
        print("Warning: The CSV file is empty. Cannot visualize DQI.")
        return

    total_records = df.size  # Total number of cells (rows * columns)

    if total_records == 0:
        print("Warning: The CSV file is empty. Cannot visualize DQI.")
        return

    missing_values_count = int(df.isnull().sum().sum())
    valid_records_count = total_records - missing_values_count
    dqi = float(valid_records_count / total_records * 100)

    # Data for the bar plot
    metrics = ['DQI (%)', 'Errors', 'Total Records']
    values = [dqi, missing_values_count, total_records]
    # total_records > 0 here, so the tallest bar is always positive.
    tallest = max(values)

    # Create the bar plot
    plt.figure(figsize=(8, 6))
    plt.bar(metrics, values, color=['green', 'red', 'blue'])
    plt.ylabel('Value')
    plt.title('Data Quality Metrics')
    plt.ylim(0, tallest * 1.1)  # Leave headroom above the tallest bar for its label

    # Add value labels to the bars: the DQI gets two decimals, the counts are integers.
    for i, value in enumerate(values):
        label = f'{value:.2f}' if i == 0 else f'{int(value)}'
        plt.text(i, value + tallest * 0.02, label, ha='center', va='bottom')

    plt.show()

# Example usage: visualize_dqi prints an error and draws nothing if the file is not found.
file_path = 'your_data.csv'  # Replace with your CSV file path
visualize_dqi(file_path)

Error: File not found at 'your_data.csv'
