In [1]:
import pandas as pd
import csv
import os


In [None]:
file_path = "standardcharges.csv"
preview_line_count = 2000  # maximum number of leading lines to show

# Preview the top of the raw file to inspect its layout before parsing it.
with open(file_path, 'r', encoding='utf-8') as f:
    # zip() stops as soon as either side is exhausted, so a file shorter than
    # the preview limit ends the loop at EOF instead of printing blank lines
    # for every remaining iteration (which repeated readline() calls did).
    for _, line in zip(range(preview_line_count), f):
        print(line.strip())


In [11]:
# Show the first five lines of the file, stripped of surrounding whitespace.
# NOTE(review): ".csv" looks like a truncated filename -- confirm the intended path.
with open(".csv", "r", encoding="utf-8") as f:
    head_lines = [f.readline().strip() for _ in range(5)]

for text in head_lines:
    print(text)

hospital_name,last_updated_on,version,hospital_location,hospital_address,financial_aid_policy,license_number|NC,"To the best of its knowledge and belief, the hospital has included all applicable standard charge information in accordance with the requirements of 45 CFR 180.50, and the information encoded is true, accurate, and complete as of the date indicated.",general_contract_provisions
The Charlotte Mecklenburg Hospital Authority,2024-12-01,2.0.0,Atrium Health Anson,"2301 US Hwy 74 West, Wadesboro, NC 28170",,H0082,TRUE,
description,code|1,code|1|type,code|2,code|2|type,code|3,code|3|type,code|4,code|4|type,billing_class,setting,drug_unit_of_measurement,drug_type_of_measurement,modifiers,standard_charge|gross,standard_charge|discounted_cash,payer_name,plan_name,standard_charge|negotiated_dollar,standard_charge|negotiated_percentage,standard_charge|negotiated_algorithm,estimated_amount,standard_charge|methodology,standard_charge|min,standard_charge|max,additional_generic_notes,additi

Loading data into smaller chunked files

In [None]:
import pandas as pd
import os

# === CONFIG ===
input_file = "standardcharges.csv"  # raw CSV that gets read in chunks
output_dir = "cleaned_chunks/"      # folder that receives the cleaned part files
header_row = 2                      # passed to read_csv(header=...): zero-based row with the column names
rows_per_file = 100000              # cleaned-row count at which a part file is written

# Make sure the destination folder exists before anything is written to it.
os.makedirs(output_dir, exist_ok=True)

# Raw columns that are removed from every chunk, grouped by theme.
columns_to_drop = (
    ["code|3", "code|3|type", "code|4", "code|4|type"]
    + ["billing_class", "setting"]
    + ["drug_unit_of_measurement", "drug_type_of_measurement"]
    + ["modifiers"]
    + ["standard_charge|gross", "standard_charge|discounted_cash"]
    + ["standard_charge|negotiated_algorithm", "estimated_amount"]
    + ["standard_charge|methodology"]
    + ["additional_generic_notes", "additional_payer_notes"]
)

# Final column name mapping (raw name -> output name)
columns_mapping = dict([
    ("description", "DESCRIPTION"),
    ("code|1", "CODE"),
    ("code|1|type", "CODE_TYPE"),
    ("payer_name", "PAYER_NAME"),
    ("plan_name", "PLAN_NAME"),
    ("standard_charge|negotiated_dollar", "STANDARD_CHARGE_DOLLAR"),
    ("standard_charge|min", "MINIMUM_CHARGE"),
    ("standard_charge|max", "MAXIMUM_CHARGE"),
])

# CHUNK PROCESSING
def process_chunk(chunk):
    """Reduce one raw chunk to CPT-coded rows carrying a negotiated dollar amount.

    Steps, in order:
      1. Rows typed "CDM" in ``code|1|type`` whose ``code|2|type`` is "CPT"
         take their code from ``code|2`` and are relabelled "CPT".
      2. Only rows whose ``code|1|type`` is then "CPT" are kept.
      3. ``code|2``, ``code|2|type`` and every column listed in the
         module-level ``columns_to_drop`` are removed (absent ones are ignored).
      4. The negotiated-dollar, negotiated-percentage and max columns are
         coerced to numbers; values that cannot be parsed become NaN.
      5. Rows with neither a dollar amount nor a percentage are discarded.
      6. A missing dollar amount is filled with ``percentage / 100 * max``
         when both of those values are present.
      7. The percentage column is dropped and the remaining columns are
         renamed according to the module-level ``columns_mapping``.

    Args:
        chunk: DataFrame with the raw columns named above. It is left
            unmodified; all work happens on a copy of the surviving rows.

    Returns:
        A new DataFrame with the cleaned, renamed rows (possibly empty).

    Raises:
        KeyError: if one of the code or charge columns used here is missing.
    """
    dollar_col = "standard_charge|negotiated_dollar"
    percent_col = "standard_charge|negotiated_percentage"
    max_col = "standard_charge|max"

    code_type = chunk["code|1|type"]
    promote = (code_type == "CDM") & (chunk["code|2|type"] == "CPT")
    keep = promote | (code_type == "CPT")

    # Copy only the surviving rows: the caller's frame is never written to,
    # and later assignments cannot hit a view of it.
    result = chunk.loc[keep].copy()
    promote = promote.loc[keep]
    result.loc[promote, "code|1"] = result.loc[promote, "code|2"]
    result.loc[promote, "code|1|type"] = "CPT"

    result = result.drop(
        columns=["code|2", "code|2|type", *columns_to_drop], errors="ignore"
    )

    for col in (dollar_col, percent_col, max_col):
        result[col] = pd.to_numeric(result[col], errors="coerce")

    # Remember which rows carry any rate information before filling gaps.
    has_rate = result[dollar_col].notna() | result[percent_col].notna()

    # The derived value is NaN unless both percentage and max are present,
    # so fillna() only touches rows where the amount can actually be computed.
    derived = result[percent_col] / 100 * result[max_col]
    result[dollar_col] = result[dollar_col].fillna(derived)

    result = result.loc[has_rate].drop(columns=[percent_col])

    # rename() ignores mapping keys that are not present as columns.
    return result.rename(columns=columns_mapping)


def _write_part(frames, part_index):
    """Concatenate ``frames`` and write them to one numbered CSV in ``output_dir``."""
    combined = pd.concat(frames)
    # os.path.join avoids the doubled separator ("cleaned_chunks//cleaned_part_0.csv")
    # that string formatting produced because output_dir already ends in "/".
    out_file = os.path.join(output_dir, f"cleaned_part_{part_index}.csv")
    combined.to_csv(out_file, index=False)
    print(f"Saved: {out_file} ({len(combined)} rows)")


# Stream the raw file chunk by chunk so it never has to fit in memory at once.
reader = pd.read_csv(input_file, header=header_row, chunksize=100000, low_memory=False)
buffer = []      # cleaned chunks waiting to be written
total_rows = 0   # number of rows currently held in buffer
file_index = 0   # suffix of the next part file

for chunk in reader:
    cleaned = process_chunk(chunk)
    if cleaned.empty:
        continue

    buffer.append(cleaned)
    total_rows += len(cleaned)

    # rows_per_file is a threshold, not an exact size: a part is written once
    # the buffered rows reach it, so a part may hold somewhat more rows.
    if total_rows >= rows_per_file:
        _write_part(buffer, file_index)
        buffer = []
        total_rows = 0
        file_index += 1

# Save any remaining rows (same format)
if buffer:
    _write_part(buffer, file_index)

print("All processing complete. Chunked files saved consistently in:", output_dir)


✅ Saved: charlotte_mecklenburg_hospital_cleaned_chunks//charlotte_mecklenburg_hospital_cleaned_part_0.csv (101895 rows)
✅ Saved: charlotte_mecklenburg_hospital_cleaned_chunks//charlotte_mecklenburg_hospital_cleaned_part_1.csv (103316 rows)
✅ Saved: charlotte_mecklenburg_hospital_cleaned_chunks//charlotte_mecklenburg_hospital_cleaned_part_2.csv (101199 rows)
✅ Saved: charlotte_mecklenburg_hospital_cleaned_chunks//charlotte_mecklenburg_hospital_cleaned_part_3.csv (102613 rows)
✅ Saved: charlotte_mecklenburg_hospital_cleaned_chunks//charlotte_mecklenburg_hospital_cleaned_part_4.csv (7995 rows)
✅ All processing complete. Chunked files saved consistently in: charlotte_mecklenburg_hospital_cleaned_chunks/


Loading data into a single file

In [None]:
import pandas as pd
import os


# Paths and read settings for the single-file run.
input_file = "standardcharges.csv"  # raw CSV that gets read in chunks
output_dir = "cleaned/"             # folder that receives the combined file
output_file = os.path.join(output_dir, "cleaned_all.csv")
header_row = 2                      # passed to read_csv(header=...): zero-based row with the column names
chunk_size = 100000                 # rows read from the CSV per chunk

# Make sure the destination folder exists before anything is written to it.
os.makedirs(output_dir, exist_ok=True)

# Raw columns that are removed from every chunk, grouped by theme.
columns_to_drop = (
    ["code|3", "code|3|type", "code|4", "code|4|type"]
    + ["billing_class", "setting"]
    + ["drug_unit_of_measurement", "drug_type_of_measurement"]
    + ["modifiers"]
    + ["standard_charge|gross", "standard_charge|discounted_cash"]
    + ["standard_charge|negotiated_algorithm", "estimated_amount"]
    + ["standard_charge|methodology"]
    + ["additional_generic_notes", "additional_payer_notes"]
)

# Raw column name -> output column name.
columns_mapping = dict([
    ("description", "DESCRIPTION"),
    ("code|1", "CODE"),
    ("code|1|type", "CODE_TYPE"),
    ("payer_name", "PAYER_NAME"),
    ("plan_name", "PLAN_NAME"),
    ("standard_charge|negotiated_dollar", "STANDARD_CHARGE_DOLLAR"),
    ("standard_charge|min", "MINIMUM_CHARGE"),
    ("standard_charge|max", "MAXIMUM_CHARGE"),
    ("standard_charge|negotiated_percentage", "STANDARD_CHARGE_PERCENTAGE"),
])


def process_chunk(chunk, chunk_number):
    """Reduce one raw chunk to CPT-coded rows and fill in missing dollar amounts.

    Steps, in order:
      1. Rows typed "CDM" in ``code|1|type`` whose ``code|2|type`` is "CPT"
         take their code from ``code|2`` and are relabelled "CPT".
      2. Only rows whose ``code|1|type`` is then "CPT" are kept.
      3. ``code|2``, ``code|2|type`` and every column listed in the
         module-level ``columns_to_drop`` are removed (absent ones are ignored).
      4. The negotiated-dollar and max columns are coerced to numbers
         (unparseable values become NaN); the negotiated percentage is coerced
         the same way and its missing values are then replaced by 0.
      5. A missing dollar amount is filled with ``percentage / 100 * max``
         whenever the max value is present.
      6. Columns are renamed according to the module-level ``columns_mapping``.

    Args:
        chunk: DataFrame with the raw columns named above. It is left
            unmodified; all work happens on a copy of the surviving rows.
        chunk_number: Number shown in the progress message.

    Returns:
        A new DataFrame with the cleaned, renamed rows (possibly empty).

    Raises:
        KeyError: if one of the code or charge columns used here is missing.
    """
    dollar_col = "standard_charge|negotiated_dollar"
    percent_col = "standard_charge|negotiated_percentage"
    max_col = "standard_charge|max"

    code_type = chunk["code|1|type"]
    promote = (code_type == "CDM") & (chunk["code|2|type"] == "CPT")
    keep = promote | (code_type == "CPT")

    # Copy only the surviving rows: the caller's frame is never written to,
    # and later assignments cannot hit a view of it.
    result = chunk.loc[keep].copy()
    promote = promote.loc[keep]
    result.loc[promote, "code|1"] = result.loc[promote, "code|2"]
    result.loc[promote, "code|1|type"] = "CPT"

    result = result.drop(
        columns=["code|2", "code|2|type", *columns_to_drop], errors="ignore"
    )

    result[dollar_col] = pd.to_numeric(result[dollar_col], errors="coerce")
    # NOTE(review): a missing percentage becomes 0, so a row with no dollar
    # amount and no percentage ends up with a dollar amount of 0 when max is
    # present -- confirm that this is intended rather than dropping such rows.
    result[percent_col] = pd.to_numeric(result[percent_col], errors="coerce").fillna(0)
    result[max_col] = pd.to_numeric(result[max_col], errors="coerce")

    # The derived value is NaN exactly where max is missing, so fillna() only
    # touches rows whose dollar amount is missing and whose max is present.
    derived = result[percent_col] / 100 * result[max_col]
    result[dollar_col] = result[dollar_col].fillna(derived)

    # rename() ignores mapping keys that are not present as columns.
    result = result.rename(columns=columns_mapping)

    print(f"Processed chunk #{chunk_number} with {len(result)} rows.")
    return result


# Stream the raw file in chunks and collect every non-empty cleaned chunk.
reader = pd.read_csv(input_file, header=header_row, chunksize=chunk_size, low_memory=False)
all_cleaned = []
chunk_number = 0  # stays 0 when the reader yields nothing

for chunk_number, chunk in enumerate(reader, start=1):
    cleaned = process_chunk(chunk, chunk_number)
    if cleaned.empty:
        continue
    all_cleaned.append(cleaned)

# Combining all chunks and write to a single file
if not all_cleaned:
    print("No valid data to save.")
else:
    final_df = pd.concat(all_cleaned, ignore_index=True)
    final_df.to_csv(output_file, index=False)
    print(f"All chunks processed. Final file saved as:\n{output_file}")
    print(f"Total rows saved: {len(final_df)}")


✅ Processed chunk #1 with 3081 rows.
✅ Processed chunk #2 with 3225 rows.
✅ Processed chunk #3 with 3147 rows.
✅ Processed chunk #4 with 3118 rows.
✅ Processed chunk #5 with 3042 rows.
✅ Processed chunk #6 with 3050 rows.
✅ Processed chunk #7 with 3893 rows.
✅ Processed chunk #8 with 4035 rows.
✅ Processed chunk #9 with 5349 rows.
✅ Processed chunk #10 with 4565 rows.
✅ Processed chunk #11 with 12455 rows.
✅ Processed chunk #12 with 3066 rows.
✅ Processed chunk #13 with 3157 rows.
✅ Processed chunk #14 with 3125 rows.
✅ Processed chunk #15 with 3097 rows.
✅ Processed chunk #16 with 3106 rows.
✅ Processed chunk #17 with 3099 rows.
✅ Processed chunk #18 with 3076 rows.
✅ Processed chunk #19 with 3897 rows.
✅ Processed chunk #20 with 3372 rows.
✅ Processed chunk #21 with 11801 rows.
✅ Processed chunk #22 with 3127 rows.
✅ Processed chunk #23 with 3141 rows.
✅ Processed chunk #24 with 3150 rows.
✅ Processed chunk #25 with 3046 rows.
✅ Processed chunk #26 with 3189 rows.
✅ Processed chunk #