This notebook labels each patient's delivery as vaginal or cesarean, based on the diagnosis codes recorded within ±10 days of the delivery date.

In [6]:
import pandas as pd
import numpy as np
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
import warnings
import time

In [5]:
# Vaginal-delivery diagnosis codes: the "dx_cd" column of the reference CSV, as a plain list.
vaginal_codes_path = r"Z:\chelsea\datalake\final_codes\vaginaldelivery_codes.csv"
vaginal_codes = pd.read_csv(vaginal_codes_path).loc[:, "dx_cd"].to_list()
print("Vaginal delivery codes loaded.")

Vaginal delivery codes loaded.


In [4]:
# Cesarean-delivery diagnosis codes: the "dx_cd" column of the reference CSV, as a plain list.
cesarean_codes_path = r"Z:\chelsea\datalake\final_codes\cesarean_codes.csv"
cesarean_codes = pd.read_csv(cesarean_codes_path).loc[:, "dx_cd"].to_list()
print("Cesarean delivery codes loaded.")

Cesarean delivery codes loaded.


In [None]:
# Cohort to label (input) and where the labelled rows are written (output).
input_file_path = r"Z:\chelsea\datalake\New_cohort\New_filters\opioid_in_pregnancy.csv"
output_file_path = r"Z:\chelsea\datalake\New_cohort\New_filters\cohort_delivery_type.csv"

# Number of CSV rows read per chunk; tune to the memory available on this machine.
chunk_size = 100_000

def separate_type_delivery(df, vaginal_codes, cesarean_codes):
    """Label every patient's rows as a "Vaginal" or "Cesarean" delivery.

    Rows are sorted by ``pat_id_p`` then ``from_dt``. For each patient the
    anchor date is the ``delivery_dt`` of their first row in that order.
    Diagnosis codes found in any of the ``diag1`` .. ``diag12`` columns, on
    rows whose ``from_dt`` lies within 10 days either side of the anchor
    (bounds inclusive), decide the label:

    * any vaginal code in the window             -> "Vaginal"
    * otherwise, any cesarean code in the window -> "Cesarean"
    * otherwise (including a missing anchor)     -> "Vaginal"

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``pat_id_p``, ``from_dt`` and ``delivery_dt``. Whichever
        of ``diag1`` .. ``diag12`` are present are searched. The frame passed
        in is left untouched.
    vaginal_codes, cesarean_codes : iterable
        Diagnosis codes that mark a vaginal / cesarean delivery.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by ``pat_id_p`` and ``from_dt``, with both date
        columns parsed to datetimes (unparseable values become NaT) and a
        ``type_of_delivery`` column added. Rows whose ``pat_id_p`` is missing
        keep ``None`` in that column.
    """
    # Build a new frame instead of converting the caller's columns in place.
    out = df.assign(
        from_dt=pd.to_datetime(df['from_dt'], errors='coerce'),
        delivery_dt=pd.to_datetime(df['delivery_dt'], errors='coerce'),
    ).sort_values(by=["pat_id_p", "from_dt"])

    # Only the diag columns that actually exist in this frame are searched.
    diag_cols = [c for c in (f'diag{i}' for i in range(1, 13)) if c in out.columns]

    # Row-level flags: does any diag column carry a vaginal / cesarean code?
    row_has_vag = out[diag_cols].isin(set(vaginal_codes)).any(axis=1)
    row_has_ces = out[diag_cols].isin(set(cesarean_codes)).any(axis=1)

    patient = out['pat_id_p']
    has_patient = patient.notna()

    # Anchor date per patient = delivery_dt of the patient's first sorted row
    # (taken as-is, so a NaT on that row means "no anchor" for the patient).
    first_rows = out.loc[has_patient].drop_duplicates(subset='pat_id_p', keep='first')
    anchor_by_patient = first_rows.set_index('pat_id_p')['delivery_dt']
    # to_datetime keeps the dtype datetime-like even for an empty/all-missing map.
    anchor = pd.to_datetime(patient.map(anchor_by_patient), errors='coerce')

    # Comparisons against NaT are False, so rows without a usable from_dt or
    # anchor never fall inside the window.
    in_window = out['from_dt'].between(
        anchor - pd.Timedelta(days=10),
        anchor + pd.Timedelta(days=10),
    )

    # Patients with at least one qualifying row inside their window.
    vag_patients = patient[row_has_vag & in_window]
    ces_patients = patient[row_has_ces & in_window]

    # Priority: Vaginal > Cesarean > (neither -> Vaginal)
    cesarean_only = patient.isin(ces_patients) & ~patient.isin(vag_patients)

    out['type_of_delivery'] = None
    out.loc[has_patient, 'type_of_delivery'] = "Vaginal"
    out.loc[has_patient & cesarean_only, 'type_of_delivery'] = "Cesarean"
    return out


def _format_hms(seconds):
    """Render a duration given in seconds as zero-padded HH:MM:SS."""
    hours, rest = divmod(seconds, 3600)
    minutes, secs = divmod(rest, 60)
    return f"{int(hours):02}:{int(minutes):02}:{int(secs):02}"


def process_data_sequentially(vaginal_codes, cesarean_codes):
    """Label the input CSV chunk by chunk and write the result to the output CSV.

    Reads ``input_file_path`` in chunks of ``chunk_size`` rows, passes each
    chunk to ``separate_type_delivery``, drops rows that received no label,
    and writes the chunk to ``output_file_path``. The first chunk overwrites
    the file and writes the header; later chunks are appended. Progress and a
    remaining-time estimate are printed after every chunk.

    NOTE(review): every chunk is classified on its own. A patient whose rows
    straddle a chunk boundary is labelled from only the rows present in each
    chunk and may therefore end up with different labels in different chunks.
    Confirm the input is grouped by patient, or classify per patient across
    the whole file.

    Parameters
    ----------
    vaginal_codes, cesarean_codes : iterable
        Diagnosis code lists forwarded to ``separate_type_delivery``.
    """
    # Count data rows (file lines minus the header) to estimate the number of
    # chunks. Binary mode avoids decode errors and the handle is closed promptly.
    with open(input_file_path, "rb") as raw:
        total_rows = max(sum(1 for _ in raw) - 1, 0)
    # At least 1 so the progress arithmetic never divides by zero.
    chunks = max(int(np.ceil(total_rows / chunk_size)), 1)
    start_time = time.time()

    with pd.read_csv(input_file_path, chunksize=chunk_size) as reader:
        for chunk_idx, chunk_df in enumerate(reader):
            done = chunk_idx + 1
            print(f"\nProcessing chunk {done}/{chunks}...")

            # Date parsing happens inside separate_type_delivery.
            processed_chunk = separate_type_delivery(chunk_df, vaginal_codes, cesarean_codes)

            # Rows with a missing patient id receive no label; drop them.
            processed_chunk = processed_chunk[processed_chunk['type_of_delivery'].notna()]

            is_first = chunk_idx == 0
            processed_chunk.to_csv(
                output_file_path,
                mode='w' if is_first else 'a',
                index=False,
                header=is_first,
            )

            elapsed = time.time() - start_time
            # Clamp: the line-count estimate can undershoot the real chunk count.
            rem_time = (elapsed / done) * max(chunks - done, 0)
            progress = min(done / chunks, 1.0) * 100

            print(f"Chunk {done}/{chunks} processed.")
            print(f"Estimated remaining: {_format_hms(rem_time)}")
            print(f"Progress: {progress:.2f}%")

    total_elapsed = time.time() - start_time
    print(f"\nTotal processing time: {_format_hms(total_elapsed)}")


# Kick off the chunked labelling run with the two code lists loaded earlier.
process_data_sequentially(vaginal_codes=vaginal_codes, cesarean_codes=cesarean_codes)

