In [77]:
import pandas as pd
import numpy as np


# Import Data

In [78]:
breach_df = pd.read_csv('data/breach.csv')

df = breach_df

# Initial Exploration

In [79]:
print (df.head())
print (df.columns)
print (df.shape)
print (df.info())
print (df.describe())
print (df.isnull().sum())

                          Name of Covered Entity State  Covered Entity Type  \
0                         Langdon Prairie Health    ND  Healthcare Provider   
1              Highland Rivers Behavioral Health    GA  Healthcare Provider   
2                 Charleston Area Medical Center    WV  Healthcare Provider   
3                      Insurance ACE/Humana Inc.    KY          Health Plan   
4  Infosys Public Services, Inc. (â€œInfosysâ€)    MD   Business Associate   

   Individuals Affected Breach Submission Date  \
0                1152.0              4/18/2025   
1                2253.0              4/15/2025   
2               67413.0              2/14/2025   
3                8553.0               2/6/2025   
4                2985.0              1/31/2025   

                   Type of Breach Location of Breached Information  
0  Unauthorized Access/Disclosure                            Email  
1             Hacking/IT Incident                   Network Server  
2             Hac

# Cleaning the data

In [80]:
def clean_breach_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans the breach data by standardizing column names.
        
    Args:
        df (pd.DataFrame): The breach data.
        
    Returns:
        pd.DataFrame: Cleaned breach data.
    """
    # Standardize column names
    df.columns = df.columns.str.lower().str.replace(' ', '_').str.replace('-', '_')


    return df

In [81]:
print(clean_breach_data(df))


                                 name_of_covered_entity state  \
0                                Langdon Prairie Health    ND   
1                     Highland Rivers Behavioral Health    GA   
2                        Charleston Area Medical Center    WV   
3                             Insurance ACE/Humana Inc.    KY   
4         Infosys Public Services, Inc. (â€œInfosysâ€)    MD   
...                                                 ...   ...   
6940                                  Mark D. Lurie, MD    CA   
6941  Health Services for Children with Special Need...    DC   
6942    Alaska Department of Health and Social Services    AK   
6943          Mid America Kidney Stone Association, LLC    MO   
6944                         Brooke Army Medical Center    TX   

      covered_entity_type  individuals_affected breach_submission_date  \
0     Healthcare Provider                1152.0              4/18/2025   
1     Healthcare Provider                2253.0              4/15/2025 

In [82]:
#save the cleaned DataFrame to a new CSV file
df.to_csv('data/breach_cleaned.csv', index=False)
print("Cleaned data saved to 'data/breach_cleaned.csv'")

Cleaned data saved to 'data/breach_cleaned.csv'
