In [1]:
import pandas as pd
import re

# Date filter range
START_DATE = pd.to_datetime("2025-01-01")
END_DATE = pd.to_datetime("2026-01-31")

# -----------------------------
# Utility Functions
# -----------------------------

def standardize_headers(df):
    df.columns = (
        df.columns
        .str.strip()
        .str.lower()
        .str.replace(" ", "_")
    )
    return df

def filter_date_range(df, column):
    if column in df.columns:
        df[column] = pd.to_datetime(df[column], errors='coerce')

        # Ensure START_DATE and END_DATE are UTC (recommended)
        start = pd.Timestamp(START_DATE, tz="UTC")
        end = pd.Timestamp(END_DATE, tz="UTC")

        df = df[(df[column] >= start) & (df[column] <= end)]

    return df


def convert_datetime_to_date(df, columns):
    for col in columns:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors='coerce').dt.date
    return df

def extract_only_alphabets(value):
    if pd.isna(value):
        return value
    return re.sub(r'[^A-Za-z]', '', str(value))

def expand_marital_status(value):
    mapping = {
        'M': 'Married',
        'S': 'Single',
        'D': 'Divorced',
        'W': 'Widowed'
    }
    return mapping.get(value, value)

def dataset_summary(df, name):
    print(f"\n {name.upper()} SUMMARY")
    print(f"Rows: {df.shape[0]}")
    print(f"Columns: {df.shape[1]}")
    print(f"Column Names: {list(df.columns)}")


# -----------------------------
# Encounters
# -----------------------------
encounters = pd.read_csv("C:/Users/ssand/OneDrive/Documents/A - Synthea/HealthCare_Analysis dashboard/01_RAW_DATASET/Encounters.csv")
encounters = standardize_headers(encounters)
encounters = filter_date_range(encounters, "start")
encounters = convert_datetime_to_date(encounters, ["start", "stop"])
dataset_summary(encounters, "Encounters")
encounters.to_csv("C:/Users/ssand/OneDrive/Documents/A - Synthea/HealthCare_Analysis dashboard/02_CLEANED_DATASET/Cleaned_Encounters.csv", index=False)

# -----------------------------
# Patients
# -----------------------------
patients = pd.read_csv("C:/Users/ssand/OneDrive/Documents/A - Synthea/HealthCare_Analysis dashboard/01_RAW_DATASET/Patients.csv")
patients = standardize_headers(patients)
patients = convert_datetime_to_date(patients, ["Birthdate", "Deathdate"])
patients = filter_date_range(encounters, "Birthdate")
# Extract alphabets from name columns
for col in ["First", "Middle", "Last", "Maiden Name"]:
    if col in patients.columns:
        patients[col] = patients[col].apply(extract_only_alphabets)

# Expand marital status
if "Marital" in patients.columns:
    patients["Marital"] = patients["Marital"].apply(expand_marital_status)

dataset_summary(patients, "Patients")
patients.to_csv("C:/Users/ssand/OneDrive/Documents/A - Synthea/HealthCare_Analysis dashboard/02_CLEANED_DATASET/Cleaned_Patients.csv", index=False)

# -----------------------------
# Procedures
# -----------------------------
procedures = pd.read_csv("C:/Users/ssand/OneDrive/Documents/A - Synthea/HealthCare_Analysis dashboard/01_RAW_DATASET/Procedures.csv")
procedures = standardize_headers(procedures)
procedures = convert_datetime_to_date(procedures, ["start", "stop"])
dataset_summary(procedures, "Procedures")
procedures.to_csv("C:/Users/ssand/OneDrive/Documents/A - Synthea/HealthCare_Analysis dashboard/02_CLEANED_DATASET/Cleaned_Procedures.csv", index=False)

# -----------------------------
# Providers
# -----------------------------
providers = pd.read_csv("C:/Users/ssand/OneDrive/Documents/A - Synthea/HealthCare_Analysis dashboard/01_RAW_DATASET/Providers.csv")
providers = standardize_headers(providers)
if "Name" in providers.columns:
    providers["Name"] = providers["Name"].apply(extract_only_alphabets)

dataset_summary(providers, "Providers")
providers.to_csv("C:/Users/ssand/OneDrive/Documents/A - Synthea/HealthCare_Analysis dashboard/02_CLEANED_DATASET/Cleaned_Providers.csv", index=False)

# -----------------------------
# Observations
# -----------------------------
observations = pd.read_csv("C:/Users/ssand/OneDrive/Documents/A - Synthea/HealthCare_Analysis dashboard/01_RAW_DATASET/Observations.csv")
observations = standardize_headers(observations)
observations = convert_datetime_to_date(observations, ["Date"])

if "Code" in observations.columns:
    observations["Code"] = observations["Code"].str.replace("-", "", regex=False)

dataset_summary(observations, "Observations")
observations.to_csv("C:/Users/ssand/OneDrive/Documents/A - Synthea/HealthCare_Analysis dashboard/02_CLEANED_DATASET/Cleaned_Observations.csv", index=False)

# -----------------------------
# Organization
# -----------------------------
organization = pd.read_csv("C:/Users/ssand/OneDrive/Documents/A - Synthea/HealthCare_Analysis dashboard/01_RAW_DATASET/Organizations.csv")
organization = standardize_headers(organization)

if "Phone" in organization.columns:
    # Take primary number if multiple exist
    organization["Phone"] = organization["Phone"].astype(str).str.split(",").str[0]
    organization["Phone"] = organization["Phone"].str.replace("-", "", regex=False)

dataset_summary(organization, "Organization")
organization.to_csv("C:/Users/ssand/OneDrive/Documents/A - Synthea/HealthCare_Analysis dashboard/02_CLEANED_DATASET/Cleaned_Organizations.csv", index=False)

# -----------------------------
# Medication
# -----------------------------
medication = pd.read_csv("C:/Users/ssand/OneDrive/Documents/A - Synthea/HealthCare_Analysis dashboard/01_RAW_DATASET/Medications.csv")
medication = standardize_headers(medication)
medication = convert_datetime_to_date(medication, ["start", "stop"])
dataset_summary(medication, "Medication")
medication.to_csv("C:/Users/ssand/OneDrive/Documents/A - Synthea/HealthCare_Analysis dashboard/02_CLEANED_DATASET/Cleaned_Medications.csv", index=False)

# -----------------------------
# Conditions
# -----------------------------
conditions = pd.read_csv("C:/Users/ssand/OneDrive/Documents/A - Synthea/HealthCare_Analysis dashboard/01_RAW_DATASET/Conditions.csv")
conditions = standardize_headers(conditions)
conditions = convert_datetime_to_date(conditions, ["start", "stop"])
dataset_summary(conditions, "Conditions")
conditions.to_csv("C:/Users/ssand/OneDrive/Documents/A - Synthea/HealthCare_Analysis dashboard/02_CLEANED_DATASET/Cleaned_Conditions.csv", index=False)

# -----------------------------
# Careplans
# -----------------------------
careplans = pd.read_csv("C:/Users/ssand/OneDrive/Documents/A - Synthea/HealthCare_Analysis dashboard/01_RAW_DATASET/Careplans.csv")
careplans = standardize_headers(careplans)
careplans = convert_datetime_to_date(careplans, ["start", "stop"])
dataset_summary(careplans, "Careplans")
careplans.to_csv("C:/Users/ssand/OneDrive/Documents/A - Synthea/HealthCare_Analysis dashboard/02_CLEANED_DATASET/Cleaned_Careplans.csv", index=False)

print("âœ… All files cleaned successfully!")



 ENCOUNTERS SUMMARY
Rows: 11087
Columns: 15
Column Names: ['id', 'start', 'stop', 'patient', 'organization', 'provider', 'payer', 'encounterclass', 'code', 'description', 'base_encounter_cost', 'total_claim_cost', 'payer_coverage', 'reasoncode', 'reasondescription']

 PATIENTS SUMMARY
Rows: 11087
Columns: 15
Column Names: ['id', 'start', 'stop', 'patient', 'organization', 'provider', 'payer', 'encounterclass', 'code', 'description', 'base_encounter_cost', 'total_claim_cost', 'payer_coverage', 'reasoncode', 'reasondescription']

 PROCEDURES SUMMARY
Rows: 377323
Columns: 10
Column Names: ['start', 'stop', 'patient', 'encounter', 'system', 'code', 'description', 'base_cost', 'reasoncode', 'reasondescription']

 PROVIDERS SUMMARY
Rows: 968
Columns: 13
Column Names: ['id', 'organization', 'name', 'gender', 'speciality', 'address', 'city', 'state', 'zip', 'lat', 'lon', 'encounters', 'procedures']

 OBSERVATIONS SUMMARY
Rows: 1733858
Columns: 9
Column Names: ['date', 'patient', 'encounter', 

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/ssand/OneDrive/Documents/A - Synthea/HealthCare_Analysis dashboard/01_RAW_DATASET/Medication.csv'