In [1]:
# import libraries
import os
import pandas as pd
import glob

# define folder paths
PLT_DATA_PATH = "./Sepsis_plt/plt/PLT_TRUE/reduction_rate/"
PLT_LIST_PATH = "./Sepsis_plt/Sepsis plt/"
HN_FILE_PATH = "./Sepsis_plt/Sepsis plt/plt_ASEhn_ALL.txt"
DEMO_DATA_PATH = "./Sepsis_plt/demographic/"

In [2]:
def merge_SIC_HN(): # merge SIC HN lists
    files = glob.glob(PLT_LIST_PATH + "plt_ASEhn_p*.txt")
    if not files:
        print(f"No files found in {PLT_LIST_PATH}.")
        return
    
    all_lines = []
    for file in files:
        try:
            with open(file, 'r') as f:
                lines = f.readlines()
                all_lines.extend(lines)
                print(f"Read {len(lines)} lines from {file}.")
        except Exception as e:
            print(f"Error reading {file}: {e}")
    
    output_file = os.path.join(PLT_LIST_PATH, "plt_ASEhn_ALL.txt")
    with open(output_file, 'w') as f:
        for line in all_lines:
            f.write(line)
    print(f"Merged {len(files)} files into {output_file} with {len(all_lines)} total lines.")

In [3]:
def merge_plt_lab(test): # merge ASE patients lab data by year
    LAB_DATA_PATH = "./Sepsis_plt/All sepsis/allSepsis/" # not 25 ver
    OUTPUT_PATH = f"./Sepsis_plt/by_year/{test}/"
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    # match file names according to lab test
    match test:
        case "plt":
            LAB_DATA_PATH = PLT_DATA_PATH
            pattern = "*_Sepsis_PresumedInfection_Bili_Plt_"
        case "crea":
            pattern = "*_Sepsis_PresumedInfection_Crea_"
        case "bili":
            pattern = "*_Sepsis_PresumedInfection_Bili_Plt_"
        case "resp":
            pattern = "*_Sepsis_PresumedInfection_Procedure_"
        case "lact":
            pattern = "*_Sepsis_PresumedInfection_Lactate_"
    
    mdfs = []
    for year in range(2001, 2024):
        print(f"Processing year: {year}")
        files = glob.glob(LAB_DATA_PATH + f"{pattern}{year}*.csv")
        if not files:
            print(f"No files found for year {year} with pattern {pattern}")
            continue
        print(f"Found {len(files)} files for year {year}")
        dfs = []
        for file in files:
            try:
                df = pd.read_csv(file)
                if 'Year' not in df.columns: # add year column for identification
                    df['Year'] = year
                dfs.append(df)
                mdfs.append(df)
            except Exception as e:
                print(f"Error reading {file}: {e}")
                continue
        if dfs:
            combined_df = pd.concat(dfs, ignore_index=True)
            output_file = os.path.join(OUTPUT_PATH, f"{test}_{year}.csv")
            combined_df.to_csv(output_file, index=False)
            print(f"Saved combined data for year {year} to {output_file}, {len(combined_df)} total rows")

    master_df = pd.concat(mdfs, ignore_index=True) # combine all years
    output_file = os.path.join(OUTPUT_PATH, f"{test}_all.csv")
    master_df.to_csv(output_file, index=False)
    print(f"Saved master data to {output_file}, {len(master_df)} total rows")

In [4]:
def filter_SIC(test): # filter SIC patients lab data
    LAB_DATA_PATH = f"./Sepsis_plt/by_year/{test}/{test}_all.csv"
    OUTPUT_PATH = f"./Sepsis_plt/by_year/{test}/"
    os.makedirs(OUTPUT_PATH, exist_ok=True)

    if not os.path.exists(LAB_DATA_PATH):
        print(f"Lab data file {LAB_DATA_PATH} does not exist.")
        return
    if not os.path.exists(HN_FILE_PATH):
        print(f"HN list {HN_FILE_PATH} does not exist.")
        return
    
    try: # retrieve target HN numbers
        with open(HN_FILE_PATH, 'r') as f:
            lines = f.readlines()
            target_HN_numbers = set()
            for line in lines:
                line = line.strip()
                if line and "," in line:
                    HN_number = line.split(",")[-1].strip()
                    if HN_number:
                        target_HN_numbers.add(HN_number)
    except Exception as e:
        print(f"Error reading HN file: {e}")
        return

    try:
        df = pd.read_csv(LAB_DATA_PATH)
        original_rows = len(df)
        if 'HN Number' in df.columns:
            df['HN Number'] = df['HN Number'].astype(str)
            filtered_df = df[df['HN Number'].isin(target_HN_numbers)]
            output_file = os.path.join(OUTPUT_PATH, f"{test}_SIC_all.csv")
            filtered_df.to_csv(output_file, index=False)
            filtered_rows = len(filtered_df)
            print(f"Filtered data saved to {output_file}, {filtered_rows} rows out of {original_rows} original rows.")
    except Exception as e:
        print(f"Error reading lab data file: {e}")
        return

In [9]:
def merge_demographic(): # merge demographic data by year
    OUTPUT_PATH = "./Sepsis_plt/by_year/demographic/"
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    pattern = "*_presumedInfection_Demographic_"
    mdfs = []
    for year in range(2001, 2024):
        print(f"Processing year: {year}")
        files = glob.glob(DEMO_DATA_PATH + f"{pattern}{year}*.csv")
        if not files:
            print(f"No files found for year {year} with pattern {pattern}")
            continue
        print(f"Found {len(files)} files for year {year}")
        dfs = []
        for file in files:
            try:
                df = pd.read_csv(file)
                if 'Year' not in df.columns: # add year column for identification
                    df['Year'] = year
                dfs.append(df)
                mdfs.append(df)
            except Exception as e:
                print(f"Error reading {file}: {e}")
                continue
        if dfs:
            combined_df = pd.concat(dfs, ignore_index=True)
            output_file = os.path.join(OUTPUT_PATH, f"demographic_{year}.csv")
            combined_df.to_csv(output_file, index=False)
            print(f"Saved combined data for year {year} to {output_file}, {len(combined_df)} total rows")

    master_df = pd.concat(mdfs, ignore_index=True) # combine all years
    output_file = os.path.join(OUTPUT_PATH, f"demographic_all.csv")
    master_df.to_csv(output_file, index=False)
    print(f"Saved master data to {output_file}, {len(master_df)} total rows")


In [17]:
def filter_demographic_SIC(): # filter SIC patients demographic data
    DEMO_DATA_PATH = "./Sepsis_plt/by_year/demographic/demographic_all.csv"
    OUTPUT_PATH = "./Sepsis_plt/by_year/demographic/"
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    
    if not os.path.exists(HN_FILE_PATH):
        print(f"HN list {HN_FILE_PATH} does not exist.")
        return
    if not os.path.exists(DEMO_DATA_PATH):
        print(f"Demographic data file do not exist.")
        return
    
    try: # retrieve target HN numbers
        with open(HN_FILE_PATH, 'r') as f:
            lines = f.readlines()
            target_HN_numbers = set()
            for line in lines:
                line = line.strip()
                if line and "," in line:
                    HN_number = line.split(",")[-1].strip()
                    if HN_number:
                        target_HN_numbers.add(HN_number)
    except Exception as e:
        print(f"Error reading HN file: {e}")
        return
    
    try:
        df = pd.read_csv(DEMO_DATA_PATH)
        original_rows = len(df)
        if 'HN Number' in df.columns:
            df['HN Number'] = df['HN Number'].astype(str)
            filtered_df = df[df['HN Number'].isin(target_HN_numbers)]
            output_file = os.path.join(OUTPUT_PATH, f"demographic_SIC_all.csv")
            filtered_df.to_csv(output_file, index=False)
            filtered_rows = len(filtered_df)
            print(f"Filtered data saved to {output_file}, {filtered_rows} rows out of {original_rows} original rows.")
    except Exception as e:
        print(f"Error reading demographic data file: {e}")
        return

In [7]:
merge_SIC_HN()

Read 5000 lines from ./Sepsis_plt/Sepsis plt\plt_ASEhn_p1.txt.
Read 5000 lines from ./Sepsis_plt/Sepsis plt\plt_ASEhn_p10.txt.
Read 5000 lines from ./Sepsis_plt/Sepsis plt\plt_ASEhn_p11.txt.
Read 5000 lines from ./Sepsis_plt/Sepsis plt\plt_ASEhn_p12.txt.
Read 5000 lines from ./Sepsis_plt/Sepsis plt\plt_ASEhn_p13.txt.
Read 3722 lines from ./Sepsis_plt/Sepsis plt\plt_ASEhn_p14.txt.
Read 5000 lines from ./Sepsis_plt/Sepsis plt\plt_ASEhn_p2.txt.
Read 5000 lines from ./Sepsis_plt/Sepsis plt\plt_ASEhn_p3.txt.
Read 5000 lines from ./Sepsis_plt/Sepsis plt\plt_ASEhn_p4.txt.
Read 5000 lines from ./Sepsis_plt/Sepsis plt\plt_ASEhn_p5.txt.
Read 5000 lines from ./Sepsis_plt/Sepsis plt\plt_ASEhn_p6.txt.
Read 5000 lines from ./Sepsis_plt/Sepsis plt\plt_ASEhn_p7.txt.
Read 5000 lines from ./Sepsis_plt/Sepsis plt\plt_ASEhn_p8.txt.
Read 5000 lines from ./Sepsis_plt/Sepsis plt\plt_ASEhn_p9.txt.
Merged 14 files into ./Sepsis_plt/Sepsis plt/plt_ASEhn_ALL.txt with 68722 total lines.


In [None]:
test = "plt" # lab data type: plt, crea, bili, resp, lact
merge_plt_lab(test)
filter_SIC(test)

Processing year: 2001
Found 4 files for year 2001
Saved combined data for year 2001 to ./Sepsis_plt/plt_year/plt/plt_2001.csv, 57760 total rows
Processing year: 2002
Found 6 files for year 2002
Saved combined data for year 2002 to ./Sepsis_plt/plt_year/plt/plt_2002.csv, 72362 total rows
Processing year: 2003
Found 12 files for year 2003
Saved combined data for year 2003 to ./Sepsis_plt/plt_year/plt/plt_2003.csv, 245805 total rows
Processing year: 2004
Found 15 files for year 2004
Saved combined data for year 2004 to ./Sepsis_plt/plt_year/plt/plt_2004.csv, 271750 total rows
Processing year: 2005
Found 15 files for year 2005
Saved combined data for year 2005 to ./Sepsis_plt/plt_year/plt/plt_2005.csv, 270784 total rows
Processing year: 2006
Found 14 files for year 2006
Saved combined data for year 2006 to ./Sepsis_plt/plt_year/plt/plt_2006.csv, 267539 total rows
Processing year: 2007
Found 14 files for year 2007
Saved combined data for year 2007 to ./Sepsis_plt/plt_year/plt/plt_2007.csv, 

In [10]:
merge_demographic()
filter_demographic_SIC()

Processing year: 2001
Found 4 files for year 2001
Saved combined data for year 2001 to ./Sepsis_plt/by_year/demographic/demographic_2001.csv, 62826 total rows
Processing year: 2002
Found 9 files for year 2002
Saved combined data for year 2002 to ./Sepsis_plt/by_year/demographic/demographic_2002.csv, 153242 total rows
Processing year: 2003
Found 12 files for year 2003
Saved combined data for year 2003 to ./Sepsis_plt/by_year/demographic/demographic_2003.csv, 181091 total rows
Processing year: 2004
Found 14 files for year 2004
Saved combined data for year 2004 to ./Sepsis_plt/by_year/demographic/demographic_2004.csv, 199944 total rows
Processing year: 2005
Found 15 files for year 2005
Saved combined data for year 2005 to ./Sepsis_plt/by_year/demographic/demographic_2005.csv, 215447 total rows
Processing year: 2006
Found 14 files for year 2006
Saved combined data for year 2006 to ./Sepsis_plt/by_year/demographic/demographic_2006.csv, 211911 total rows
Processing year: 2007
Found 14 files 

  df = pd.read_csv(file)


Saved combined data for year 2010 to ./Sepsis_plt/by_year/demographic/demographic_2010.csv, 673596 total rows
Processing year: 2011
Found 18 files for year 2011


  df = pd.read_csv(file)


Saved combined data for year 2011 to ./Sepsis_plt/by_year/demographic/demographic_2011.csv, 581678 total rows
Processing year: 2012
Found 19 files for year 2012
Saved combined data for year 2012 to ./Sepsis_plt/by_year/demographic/demographic_2012.csv, 574000 total rows
Processing year: 2013
Found 19 files for year 2013
Saved combined data for year 2013 to ./Sepsis_plt/by_year/demographic/demographic_2013.csv, 509135 total rows
Processing year: 2014
Found 23 files for year 2014
Saved combined data for year 2014 to ./Sepsis_plt/by_year/demographic/demographic_2014.csv, 572146 total rows
Processing year: 2015
Found 24 files for year 2015
Saved combined data for year 2015 to ./Sepsis_plt/by_year/demographic/demographic_2015.csv, 575807 total rows
Processing year: 2016
Found 25 files for year 2016


  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)


Saved combined data for year 2016 to ./Sepsis_plt/by_year/demographic/demographic_2016.csv, 627617 total rows
Processing year: 2017
Found 27 files for year 2017


  df = pd.read_csv(file)


Saved combined data for year 2017 to ./Sepsis_plt/by_year/demographic/demographic_2017.csv, 748066 total rows
Processing year: 2018
Found 28 files for year 2018


  df = pd.read_csv(file)
  df = pd.read_csv(file)


Saved combined data for year 2018 to ./Sepsis_plt/by_year/demographic/demographic_2018.csv, 706258 total rows
Processing year: 2019
Found 34 files for year 2019
Saved combined data for year 2019 to ./Sepsis_plt/by_year/demographic/demographic_2019.csv, 733215 total rows
Processing year: 2020
Found 29 files for year 2020
Saved combined data for year 2020 to ./Sepsis_plt/by_year/demographic/demographic_2020.csv, 652867 total rows
Processing year: 2021
Found 31 files for year 2021
Saved combined data for year 2021 to ./Sepsis_plt/by_year/demographic/demographic_2021.csv, 690889 total rows
Processing year: 2022
Found 30 files for year 2022
Saved combined data for year 2022 to ./Sepsis_plt/by_year/demographic/demographic_2022.csv, 774238 total rows
Processing year: 2023
Found 31 files for year 2023
Saved combined data for year 2023 to ./Sepsis_plt/by_year/demographic/demographic_2023.csv, 653620 total rows
Saved master data to ./Sepsis_plt/by_year/demographic/demographic_all.csv, 11144530 t