In [6]:
import pandas as pd

In [2]:
# Relative path to the heart-attack dataset CSV.
file_path = "heart_attack_prediction_dataset.csv"
# Read the whole CSV into a DataFrame bound to `df`.
df = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
def calculate_accuracy(df, column):
    """Return the accuracy score, in percent, for ``column`` of ``df``.

    Placeholder implementation: neither argument is inspected and a constant
    100 is returned for every column.
    """
    # TODO: replace with a real comparison against ground-truth/expected values.
    placeholder_score = 100
    return placeholder_score

def calculate_validity(df, column, validator=None):
    """Return the percentage of rows in ``column`` that pass a validity check.

    Args:
        df: DataFrame holding the data to assess.
        column: Label of the column to check.
        validator: Optional callable taking a single cell value and returning
            a truthy result when the value is valid. When omitted, the
            module-level ``is_valid`` function is used, which keeps existing
            two-argument calls working unchanged.

    Returns:
        The share of rows judged valid, as a float in ``[0, 100]``. For a
        frame with no rows the share is undefined and NaN is returned
        instead of dividing by zero.
    """
    if validator is None:
        # Looked up at call time so the default follows the current
        # module-level definition of is_valid.
        validator = is_valid

    total_rows = len(df)
    if total_rows == 0:
        return float("nan")

    # Coerce to bool so validators returning truthy non-bool values
    # (e.g. 1 or a match object) are each counted exactly once.
    valid_flags = df[column].apply(validator).astype(bool)
    return valid_flags.sum() / total_rows * 100

def is_valid(value):
    """Report whether a single cell ``value`` is considered valid.

    Placeholder implementation: the value is not inspected and every input is
    accepted.
    """
    # TODO: replace with a real rule (e.g. a range or format check).
    accepted = True
    return accepted

def calculate_correctness(df, column):
    """Return the correctness score, in percent, for ``column`` of ``df``.

    Placeholder implementation: neither argument is inspected and a constant
    100 is returned for every column.
    """
    # TODO: replace with a real comparison against a known set of correct values.
    placeholder_score = 100
    return placeholder_score

def calculate_consistency(df, column):
    """Return the percentage of rows in ``column`` that are not repeats.

    A row counts as a repeat when ``Series.duplicated()`` flags it, i.e. its
    value already appeared in an earlier row. The result is therefore the
    share of first-occurrence rows: 100 for a column whose values are all
    distinct, and close to 0 for a column with only a few distinct values.
    """
    repeat_flags = df[column].duplicated()
    total_rows = len(df)
    # Rows that are NOT flagged as duplicates of an earlier row.
    first_occurrences = (~repeat_flags).sum()
    return first_occurrences / total_rows * 100

In [4]:
import warnings
warnings.filterwarnings("ignore")

# Column layout of the quality report. Declared up front so the report has the
# same columns even when the dataset has no features to iterate over.
quality_columns = [
    'Feature',
    'Number of Null Values',
    'Completeness (%)',
    'Accuracy (%)',
    'Validity (%)',
    'Correctness (%)',
    'Consistency (%)',
]

# Collect one record per feature and build the report in a single step.
# Growing a DataFrame with pd.concat inside the loop re-copies every earlier
# row on each iteration and starts by concatenating onto an empty frame,
# which recent pandas versions flag with a FutureWarning.
quality_rows = []
for column in df.columns:
    num_null_values = df[column].isnull().sum()
    # Share of non-null cells in this column, in percent.
    completeness = 100 - (num_null_values / len(df)) * 100

    # Remaining quality dimensions are delegated to the helper functions.
    accuracy = calculate_accuracy(df, column)
    validity = calculate_validity(df, column)
    correctness = calculate_correctness(df, column)
    consistency = calculate_consistency(df, column)

    quality_rows.append({
        'Feature': column,
        'Number of Null Values': num_null_values,
        'Completeness (%)': completeness,
        'Accuracy (%)': accuracy,
        'Validity (%)': validity,
        'Correctness (%)': correctness,
        'Consistency (%)': consistency,
    })

# One row per feature, in the same order as df.columns.
quality_df = pd.DataFrame(quality_rows, columns=quality_columns)

# Display the resulting quality assessments
quality_df

Unnamed: 0,Feature,Number of Null Values,Completeness (%),Accuracy (%),Validity (%),Correctness (%),Consistency (%)
0,Patient ID,0,100.0,100,100.0,100,100.0
1,Age,0,100.0,100,100.0,100,0.833048
2,Sex,0,100.0,100,100.0,100,0.022823
3,Cholesterol,0,100.0,100,100.0,100,3.206664
4,Blood Pressure,0,100.0,100,100.0,100,44.676481
5,Heart Rate,0,100.0,100,100.0,100,0.810225
6,Diabetes,0,100.0,100,100.0,100,0.022823
7,Family History,0,100.0,100,100.0,100,0.022823
8,Smoking,0,100.0,100,100.0,100,0.022823
9,Obesity,0,100.0,100,100.0,100,0.022823


# Improve quality

Improving data quality involves a combination of data cleansing, validation, and enhancement techniques. Here are some solutions to enhance the quality of your data:

## Handling Missing Values

- Identify columns with missing values and decide on an appropriate strategy (imputation, removal, etc.).
- For numerical features, consider imputing missing values with the mean, median, or a regression-based approach.
- For categorical features, impute missing values with the mode or use a separate category.
## Outlier Detection and Treatment

- Identify and analyze outliers using statistical methods like the interquartile range (IQR) or Z-scores.
- Decide whether to remove outliers or apply appropriate transformations to mitigate their impact.
## Data Type and Format Validation

- Ensure that data types and formats are consistent with expectations.
- Validate numerical columns to ensure they contain valid numeric values.
- Validate categorical columns to ensure they contain expected categories.
## Cross-Column Consistency Checks

- Check for consistency between related columns. For example, if a dataset has both an 'Age' and a 'Birthdate' column, the two should agree.
- Identify and rectify discrepancies between columns that should contain similar information.
## Standardize and Normalize Data

- Standardize units and formats for numerical data (e.g., convert all weights to kilograms).
- Normalize data when necessary, especially for machine learning models that are sensitive to the scale of features.
## Addressing Duplicate Records

- Identify and remove duplicate records based on unique identifiers.
- Implement checks to prevent the creation of duplicate records in the future.
## Implement Data Validation Rules

- Define and enforce data validation rules specific to each feature.
- For example, set range constraints on numerical features, validate date formats, and enforce unique constraints where applicable.
## Documentation and Metadata

- Maintain comprehensive documentation for the dataset, including data dictionaries and metadata.
- Clearly define the meaning and expected values for each feature.
## Data Profiling and Exploration

- Use data profiling techniques to explore and understand the distribution of values in each column.
- Identify patterns, anomalies, and potential issues that need attention.
## Automated Data Quality Checks

- Implement automated data quality checks as part of your data processing pipeline.
- Regularly monitor and alert on data quality issues to facilitate timely intervention.
## User Training and Awareness

- Train users who interact with the data on best practices for data entry and maintenance.
- Foster a culture of data quality within the organization.
## Continuous Monitoring and Improvement

- Set up regular data quality monitoring processes to detect issues early.
- Establish a feedback loop for continuous improvement based on data quality insights.

Remember that data quality is an ongoing process, and regular monitoring and improvement efforts are crucial to maintaining high-quality data. Tailor these solutions to the specific characteristics and requirements of your dataset and use case.