In [9]:
# import pandas as pd
# 
# # Step 1: Load necessary MIMIC data files
# admissions = pd.read_csv("path_to_admissions.csv")
# patients = pd.read_csv("path_to_patients.csv")
# icustays = pd.read_csv("path_to_icustays.csv")


import pandas as pd

import warnings

warnings.filterwarnings('ignore')
path = "J:\\mimic-iii-clinical-database-1.4\\mimic-iii-clinical-database-1.4"
# load Patients and Diagnoses_icd
patients = pd.read_csv(path + '\\patients.csv')
diagnoses = pd.read_csv(path + '\\diagnoses_icd.csv')
admissions = pd.read_csv(path + '\\admissions.csv')
icustays = pd.read_csv(path + '\\icustays.csv')


# Step 2: Filter patients diagnosed with 'sepsis', 'severe sepsis', or 'septic shock'
sepsis_terms = ["sepsis", "severe sepsis", "septic shock"]
admissions = admissions[admissions['DIAGNOSIS'].str.contains('|'.join(sepsis_terms), case=False, na=False)]
print(f"Total patients diagnosed with sepsis: {len(admissions)}")

# Step 3: Exclude patients with multiple admission records (keep the first admission only)
admissions = admissions.sort_values(by=['SUBJECT_ID', 'ADMITTIME'])
admissions = admissions.drop_duplicates(subset='SUBJECT_ID', keep='first')
print(f"Patients after excluding multiple admissions: {len(admissions)}")

# Step 4: Further exclusions
# Merge admissions with patients and ICU stays for exclusions
merged_data = admissions.merge(patients, on='SUBJECT_ID', how='left')
merged_data = merged_data.merge(icustays, on='HADM_ID', how='left')

# Exclude patients under 18 years old
merged_data = merged_data[merged_data['AGE'] >= 18]
print(f"Patients aged 18 or older: {len(merged_data)}")

# Exclude patients without demographic data
merged_data = merged_data.dropna(subset=['gender', 'ethnicity'])
print(f"Patients with demographic data: {len(merged_data)}")

# Exclude patients without SOFA, qSOFA, or SAPS-II scores
# Assuming these scores are in icustays.csv or a derived scores table
merged_data = merged_data.dropna(subset=['sofa_score', 'qsofa_score', 'saps_ii_score'])
print(f"Patients with SOFA, qSOFA, and SAPS-II scores: {len(merged_data)}")

# Exclude patients with >20% missing variables
threshold = 0.8  # At least 80% of the variables must be non-missing
merged_data = merged_data.dropna(thresh=int(threshold * len(merged_data.columns)))
print(f"Patients with <=20% missing variables: {len(merged_data)}")

# Step 5: Final cohort
final_data = merged_data
print(f"Final number of patients for analysis: {len(final_data)}")

# Step 6: Separate survival outcomes within 30 days
# Assuming 'death_within_30_days' is derived from admission or discharge data
final_data['outcome'] = final_data['death_within_30_days'].apply(lambda x: 'Death' if x == 1 else 'Survival')

# Split data into two groups
death_within_30_days = final_data[final_data['outcome'] == 'Death']
survival_within_30_days = final_data[final_data['outcome'] == 'Survival']

print(f"Patients who died within 30 days: {len(death_within_30_days)}")
print(f"Patients who survived within 30 days: {len(survival_within_30_days)}")


Total patients diagnosed with sepsis: 1831
Patients after excluding multiple admissions: 1677


KeyError: 'AGE'