In [2]:
## libraries
import pandas as pd
import numpy as np
import os
import time
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Input and output CSV locations. Both files sit in the same cohort directory
# on the network share, so the directory is spelled out once and reused.
cohort_dir = r"\\chcdfiles.uthouston.edu\extract\vivas_postpartum\chelsea\datalake\New_cohort\New_filters\Cohort"
input_file = cohort_dir + "\\" + "90day_day_filter.csv"
output_file = cohort_dir + "\\" + "42day_post_delivery_filter.csv"


In [4]:
CHUNK_SIZE = 400000   # rows read from the input CSV per iteration
FOLLOWUP_DAYS = 42    # minimum days between delivery_dt and last_pat_dt to keep a row

start_time = time.time()

# Stage results in a sibling temp file and swap it into place only after every
# chunk has been written, so an interrupted run never leaves a truncated
# output_file and never destroys the output of a previous successful run.
tmp_output_file = output_file + ".tmp"
if os.path.exists(tmp_output_file):
    os.remove(tmp_output_file)

# Running totals so rows that are dropped do not disappear silently.
rows_read = 0
rows_missing_dates = 0
rows_kept = 0

# ------------------------
# Process file in chunks
# ------------------------
reader = pd.read_csv(input_file, chunksize=CHUNK_SIZE, low_memory=False)
try:
    for chunk_idx, chunk_df in enumerate(reader):
        print(f"Processing chunk {chunk_idx + 1}")
        rows_read += len(chunk_df)

        # ---- Ensure date columns are datetime ----
        # errors="coerce" turns unparseable values into NaT; those rows are
        # counted and dropped just below.
        chunk_df["delivery_dt"] = pd.to_datetime(chunk_df["delivery_dt"], errors="coerce")
        chunk_df["last_pat_dt"] = pd.to_datetime(chunk_df["last_pat_dt"], errors="coerce")

        # ---- Drop rows with missing key dates ----
        has_both_dates = chunk_df["delivery_dt"].notna() & chunk_df["last_pat_dt"].notna()
        rows_missing_dates += int((~has_both_dates).sum())

        # .copy() gives an independent frame, so adding a column below is a
        # plain assignment rather than chained assignment on a filtered slice.
        chunk_df = chunk_df.loc[has_both_dates].copy()

        # ---- Calculate days after delivery ----
        chunk_df["days_after_delivery"] = (
            chunk_df["last_pat_dt"] - chunk_df["delivery_dt"]
        ).dt.days

        # ---- Apply follow-up filter ----
        chunk_df = chunk_df.loc[
            chunk_df["days_after_delivery"] >= FOLLOWUP_DAYS
        ].reset_index(drop=True)
        rows_kept += len(chunk_df)

        # ---- Write output ----
        # The first chunk creates the file and writes the header; later chunks
        # append data rows only.
        is_first_chunk = chunk_idx == 0
        chunk_df.to_csv(
            tmp_output_file,
            mode="w" if is_first_chunk else "a",
            index=False,
            header=is_first_chunk,
        )
finally:
    # Release the input file handle even if a chunk fails part-way through.
    reader.close()

if os.path.exists(tmp_output_file):
    # Replace any previous output in one step now that the run completed.
    os.replace(tmp_output_file, output_file)
elif os.path.exists(output_file):
    # No chunk was produced: remove the stale output so it cannot be mistaken
    # for the result of this run.
    os.remove(output_file)

elapsed = time.time() - start_time
print(f"\n{FOLLOWUP_DAYS}-day post-delivery cohort created.")
print(
    f"Rows read: {rows_read:,} | "
    f"dropped for missing/unparseable dates: {rows_missing_dates:,} | "
    f"kept: {rows_kept:,}"
)
print(f"Total processing time: {elapsed:.2f} seconds")
print(f"Output file: {output_file}")

Processing chunk 1
Processing chunk 2
Processing chunk 3
Processing chunk 4
Processing chunk 5
Processing chunk 6
Processing chunk 7
Processing chunk 8
Processing chunk 9
Processing chunk 10
Processing chunk 11
Processing chunk 12
Processing chunk 13
Processing chunk 14
Processing chunk 15
Processing chunk 16
Processing chunk 17
Processing chunk 18
Processing chunk 19
Processing chunk 20
Processing chunk 21
Processing chunk 22
Processing chunk 23
Processing chunk 24
Processing chunk 25
Processing chunk 26
Processing chunk 27
Processing chunk 28
Processing chunk 29
Processing chunk 30
Processing chunk 31
Processing chunk 32
Processing chunk 33
Processing chunk 34
Processing chunk 35
Processing chunk 36
Processing chunk 37
Processing chunk 38
Processing chunk 39
Processing chunk 40
Processing chunk 41
Processing chunk 42
Processing chunk 43
Processing chunk 44
Processing chunk 45
Processing chunk 46
Processing chunk 47
Processing chunk 48
Processing chunk 49
Processing chunk 50
Processin