In this file we check for 90 days of claim information prior to delivery.

The input is pregnancy_processed.csv and the final output is 90day_day_filter.csv.

In [2]:
#### Libraries
import pandas as pd
import numpy as np
from pandas._libs.tslibs.parsing import DateParseError
import time
import warnings
warnings.filterwarnings('ignore')
import os

In [3]:
# Function to process the chunk and filter based on 90-day pregnancy information
def _select_pregnancy_dx_date(group, max_lookback_days):
    """Return the raw ``from_dt`` of the earliest qualifying pregnancy claim, or None.

    A claim qualifies when its row is flagged in ``any_diag_in_dx_codes`` and
    its ``from_dt`` is no more than ``max_lookback_days`` days before the
    ``delivery_dt`` found on the first row of ``group``.
    """
    flagged_dates = group.loc[group['any_diag_in_dx_codes'], 'from_dt']
    if flagged_dates.empty:
        return None

    delivery_date = pd.to_datetime(group['delivery_dt'].iloc[0])

    # Parse each date individually and order by the parsed value: sorting the
    # raw column would order non-ISO strings (e.g. "1/5/2020" vs "12/1/2019")
    # lexicographically rather than chronologically.
    candidates = []
    for raw_date in flagged_dates:
        parsed_date = pd.to_datetime(raw_date)
        if not pd.isna(parsed_date):
            candidates.append((parsed_date, raw_date))
    candidates.sort(key=lambda pair: pair[0])

    for parsed_date, raw_date in candidates:
        # A missing delivery date yields NaN here, which never satisfies "<=".
        if (delivery_date - parsed_date).days <= max_lookback_days:
            return raw_date
    return None


def _empty_result_like(df):
    """Return a zero-row frame carrying every column a non-empty result would have."""
    empty = df.iloc[0:0].copy()
    empty['days_between'] = pd.Series(dtype='float64')
    empty['90_day_preg_info'] = pd.Series(dtype='int64')
    return empty


def create_pregnancydx_and_filter_90days(df, pregnancy_codes, max_lookback_days=210,
                                         min_days_before_delivery=90):
    """Split patients by whether they have enough pre-delivery pregnancy claim history.

    For every ``pat_id_p`` group the earliest claim whose ``diag1``..``diag12``
    contains one of ``pregnancy_codes`` and whose ``from_dt`` lies within
    ``max_lookback_days`` of delivery is stored in ``pregnancy_dx_dt``.
    ``days_between`` is ``delivery_dt - pregnancy_dx_dt`` in days, and
    ``90_day_preg_info`` is 1 when that gap is at least
    ``min_days_before_delivery`` days.

    Parameters
    ----------
    df : pandas.DataFrame
        Claims with columns ``pat_id_p``, ``from_dt``, ``delivery_dt`` and
        ``diag1``..``diag12``. NOTE: the columns ``any_diag_in_dx_codes`` and
        ``pregnancy_dx_dt`` are added to this frame in place.
    pregnancy_codes : iterable
        Diagnosis codes that identify a pregnancy claim.
    max_lookback_days : int, default 210
        Maximum number of days before delivery for a claim to be selected.
    min_days_before_delivery : int, default 90
        Minimum ``days_between`` for a row to be flagged as having enough history.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(included, excluded)``: all rows of patients with at least one row
        flagged ``90_day_preg_info == 1``, and all rows of the remaining
        patients. Either frame may be empty (zero rows, full set of columns).
    """
    diag_cols = [f'diag{i}' for i in range(1, 13)]  # Diagnoses columns

    # Check if any diagnosis matches the pregnancy codes
    df['any_diag_in_dx_codes'] = df[diag_cols].isin(pregnancy_codes).any(axis=1)

    # Initialize the pregnancy_dx_dt column
    df['pregnancy_dx_dt'] = pd.NaT

    # Collect per-patient frames and concatenate once at the end; growing a
    # DataFrame with pd.concat inside the loop re-copies all prior rows each time.
    included_groups = []
    excluded_groups = []

    for _, group_data in df.groupby('pat_id_p'):
        # Work on a copy so the assignments below never write into a view of df.
        group_data = group_data.copy()
        group_data['pregnancy_dx_dt'] = _select_pregnancy_dx_date(group_data, max_lookback_days)

        # Apply the 90-day filter
        group_data['days_between'] = (
            pd.to_datetime(group_data['delivery_dt'])
            - pd.to_datetime(group_data['pregnancy_dx_dt'])
        ).dt.days
        group_data['90_day_preg_info'] = (
            group_data['days_between'] >= min_days_before_delivery
        ).astype(int)

        # Separate patients with complete and incomplete info
        if group_data['90_day_preg_info'].sum() > 0:
            included_groups.append(group_data)
        else:
            excluded_groups.append(group_data)

    result_df = pd.concat(included_groups) if included_groups else _empty_result_like(df)
    excluded_patients = pd.concat(excluded_groups) if excluded_groups else _empty_result_like(df)
    return result_df, excluded_patients

# Function to rerun the processing for excluded patients and try to include them
def rerun_excluded_patients(file_path, pregnancy_codes, output_file):
    """Re-evaluate previously excluded patients and append any that now qualify.

    Reads the excluded-patients CSV at ``file_path``, runs it through
    ``create_pregnancydx_and_filter_90days`` and appends the rows of patients
    that pass the filter to ``output_file``. Prints a status message in every
    case and returns ``None``.

    Parameters
    ----------
    file_path : str
        CSV of excluded patients; if it does not exist nothing is done.
    pregnancy_codes : iterable
        Diagnosis codes forwarded to ``create_pregnancydx_and_filter_90days``.
    output_file : str
        CSV to append the newly included rows to.
    """
    if not os.path.exists(file_path):
        print("No excluded patients file found for reprocessing.")
        return

    print(f"Reprocessing excluded patients from {file_path}...")
    excluded_df = pd.read_csv(file_path)

    # Only the "included" half of the result is of interest here.
    reprocessed_df, _ = create_pregnancydx_and_filter_90days(excluded_df, pregnancy_codes)
    if reprocessed_df.empty:
        print("No excluded patients were eligible after reprocessing.")
        return

    # Write a header only when the output does not already hold one, so the
    # file stays a valid CSV even if no chunk produced output earlier.
    needs_header = not os.path.exists(output_file) or os.path.getsize(output_file) == 0
    reprocessed_df.to_csv(output_file, mode='a', index=False, header=needs_header)

    # Count distinct patients; len() would report claim rows instead.
    reprocessed_patients = reprocessed_df['pat_id_p'].nunique()
    print(f"Reprocessed {reprocessed_patients} excluded patients and added them to the output file.")



In [None]:

# Driver: stream the input CSV in chunks, split each chunk into patients with
# and without enough pre-delivery pregnancy history, write both groups to disk,
# then give the excluded patients one more pass.
input_file_path = "Z:/chelsea/datalake/New_cohort/pregnancy_processed.csv"
output_file_path = "Z:/chelsea/datalake/New_cohort/90day_day_filter.csv"  # Output for 90-day valid patients
excluded_patients_file = "Z:/chelsea/datalake/New_cohort/90_day_excluded_patients.csv"  # File for excluded patients

# Define chunk size
chunk_size = 300000
# Load the pregnancy codes
pregnancy_codes_df = pd.read_csv("Z:/chelsea/datalake/final_codes/pregnancy_codes.csv")
pregnancy_codes = pregnancy_codes_df['dx_cd'].tolist()

# Initialize tracking variables
start_time = time.time()
failed_chunks = []  # 1-based numbers of chunks that raised during processing

# Check if the output file already exists and determine where to resume
# NOTE(review): the output only holds rows of *included* patients, so
# len(output) // chunk_size can under-count the input chunks already handled;
# a resumed run may therefore re-process chunks. Confirm whether a separate
# checkpoint is needed.
if os.path.exists(output_file_path):
    processed_df = pd.read_csv(output_file_path, usecols=['pat_id_p'])
    last_processed_patients = processed_df['pat_id_p'].nunique()
    processed_chunks = len(processed_df) // chunk_size
    print(f"Resuming from last processed chunk {processed_chunks}. Already processed {last_processed_patients} patients.")
else:
    processed_chunks = 0
    print("No existing output file found. Starting from scratch.")

# When resuming, the existing output already carries a header and must be
# appended to, never truncated. When starting from scratch the first write
# (re)creates the file with a header.
output_has_header = processed_chunks > 0

# Calculate the total number of rows (for estimation) and total chunks
with open(input_file_path, 'rb') as input_file:
    total_rows = sum(1 for _ in input_file)
total_chunks = total_rows // chunk_size + 1  # Calculate total chunks

# Create a loop to read and process data in chunks
with pd.read_csv(input_file_path, chunksize=chunk_size, low_memory=False) as reader:
    for chunk_idx, chunk in enumerate(reader):
        if chunk_idx < processed_chunks:
            continue  # Skip already processed chunks

        print(f"\nProcessing chunk {chunk_idx + 1} out of {total_chunks}")

        try:
            # Process the chunk using the filtering logic
            processed_chunk, excluded_chunk = create_pregnancydx_and_filter_90days(chunk, pregnancy_codes)

            # Write the processed chunk to the output file
            if not output_has_header:
                # First write of a fresh run: create/overwrite the file with a header.
                processed_chunk.to_csv(output_file_path, mode='w', index=False)
                # A column-less empty frame writes no usable header; try again next chunk.
                output_has_header = len(processed_chunk.columns) > 0
            elif not processed_chunk.empty:
                processed_chunk.to_csv(output_file_path, mode='a', index=False, header=False)  # Append without header

            # Write excluded patients to the excluded file
            if not excluded_chunk.empty:
                excluded_chunk.to_csv(excluded_patients_file, mode='a', index=False, header=not os.path.exists(excluded_patients_file))

            # Time tracking and progress report
            elapsed_time = time.time() - start_time
            # Average over chunks handled in this run only; skipped chunks cost no time.
            chunks_done_this_run = chunk_idx + 1 - processed_chunks
            avg_time_per_chunk = elapsed_time / chunks_done_this_run
            remaining_chunks = total_chunks - (chunk_idx + 1)
            estimated_time_remaining = avg_time_per_chunk * remaining_chunks

            print(f"Processed chunk {chunk_idx + 1}/{total_chunks}")
            print(f"Time elapsed: {elapsed_time:.2f} seconds")
            print(f"Estimated time remaining: {estimated_time_remaining:.2f} seconds")

        except Exception as e:
            # Best effort: report the failure and move on to the next chunk.
            print(f"Error processing chunk {chunk_idx + 1}: {e}")
            failed_chunks.append(chunk_idx + 1)
            continue

if failed_chunks:
    print(f"Chunks that failed and are missing from the output: {failed_chunks}")

# After processing all chunks, rerun the excluded patients and try to include them
rerun_excluded_patients(excluded_patients_file, pregnancy_codes, output_file_path)

# Final time report
total_elapsed_time = time.time() - start_time
print(f"Total processing time: {total_elapsed_time:.2f} seconds.")





