In [8]:
import warnings

# Install a blanket "ignore" filter: from here on no warning of any category
# is shown for the rest of the kernel session.
warnings.simplefilter('ignore')


In [104]:
import json
import numpy as np
import pandas as pd
from datetime import datetime

def load_patient_data_from_json(file_path):
    """Read a patient JSON file and return ``{patient_id: {field: value}}``.

    The file is expected to hold a JSON object whose values are themselves
    objects (one per patient). Each field of a patient is converted as follows:

    * a non-empty list whose first element is a dict becomes a DataFrame; when
      that frame has a 'charttime' column it is parsed with ``pd.to_datetime``;
    * any other list (empty, or non-empty but not starting with a dict) becomes
      an empty DataFrame with columns charttime / valuenum / valueuom -- the
      list's own contents are not kept;
    * ``None`` becomes ``np.nan``;
    * every other value is passed through unchanged.
    """
    with open(file_path, 'r') as json_file:
        raw_patients = json.load(json_file)

    def _convert_field(field_value):
        # Scalars: only None is rewritten
        if not isinstance(field_value, list):
            return np.nan if field_value is None else field_value

        # Lists of records -> DataFrame with parsed timestamps
        if field_value and isinstance(field_value[0], dict):
            frame = pd.DataFrame(field_value)
            if 'charttime' in frame.columns:
                frame['charttime'] = pd.to_datetime(frame['charttime'])
            return frame

        # Everything else list-like -> empty frame with the standard columns
        return pd.DataFrame(columns=['charttime', 'valuenum', 'valueuom'])

    return {
        patient_id: {
            field: _convert_field(field_value)
            for field, field_value in patient_info.items()
        }
        for patient_id, patient_info in raw_patients.items()
    }

# Load both cohorts from their JSON exports (absolute paths on the author's machine).
positive_json_path = 'C:/Users/Rui/Desktop/Data/positive_patients_data.json'
negative_json_path = 'C:/Users/Rui/Desktop/Data/negative_patients_data.json'

positive_patients_data = load_patient_data_from_json(positive_json_path)
negative_patients_data = load_patient_data_from_json(negative_json_path)


In [107]:
# Print, for every positive patient, the type of the object stored under 'pH'.
for patient_record in positive_patients_data.values():
    ph_entry = patient_record['pH']
    print(type(ph_entry))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pand

In [101]:
# (lower, upper) bounds per measurement name. The filtering cells further down
# look each measurement up here and compare its 'valuenum' column against them.
normal_range_dict_1 = {
    # --- bedside measurements ---
    'heart_rate': (20, 250),  # bpm
    'body_temperature': (27, 45),  # °C
    'SpO2': (50, 100),  # %
    'pressure_sys': (50, 300),  # mmHg
    'pressure_dias': (30, 180),  # mmHg
    'pressure_mean': (40, 230),  # mmHg
    'respiration_rate': (4, 65),  # breaths/min
    # --- chemistry ---
    'bilirubin_total': (0, 50),  # mg/dL
    'anion_gap': (1, 30),  # mEq/L
    'albumin': (1, 6),  # g/dL
    'serum_chloride': (70, 130),  # mEq/L
    'serum_potassium': (1.5, 10),  # mEq/L
    'serum_sodium': (100, 180),  # mEq/L
    'serum_lactate': (0, 20),  # mmol/L
    'serum_magnesium': (0.5, 5),  # mg/dL
    # --- clotting times ---
    'PTT': (15, 180),  # seconds
    'PT': (5, 60),  # seconds
    # --- BUN / creatinine ---
    'BUN': (2, 150),  # mg/dL
    'serum_creatinine': (0.1, 15),  # mg/dL
    # --- blood counts ---
    'hematocrit': (10, 70),  # %
    'hemoglobin': (3, 25),  # g/dL
    'platelets': (10, 2000),  # K/uL
    'WBC': (0.5, 200),  # K/uL
    'RBC': (1, 10),  # m/uL
    # --- bicarbonate / pH ---
    'serum_bicarbonate': (5, 50),  # mEq/L
    'pH': (6.8, 7.8),  # pH units
}

# Second table of (lower, upper) bounds over the same measurement names.
# No cell visible in this part of the notebook reads it.
normal_range_dict_2 = {
    # --- bedside measurements ---
    'heart_rate': (20, 250),  # beats per minute (bpm)
    'body_temperature': (27, 45),  # degrees Celsius (°C)
    'SpO2': (50, 100),  # percentage (%)
    'pressure_sys': (50, 300),  # systolic blood pressure, mmHg
    'pressure_dias': (30, 180),  # diastolic blood pressure, mmHg
    'pressure_mean': (40, 230),  # mean blood pressure, mmHg
    'respiration_rate': (4, 65),  # breaths per minute
    # --- chemistry ---
    'bilirubin_total': (0, 50),  # mg/dL
    'anion_gap': (0, 40),  # mEq/L
    'albumin': (1.0, 6.0),  # g/dL
    'serum_chloride': (70, 130),  # mEq/L
    'serum_potassium': (1.5, 9.0),  # mEq/L
    'serum_sodium': (110, 170),  # mEq/L
    'serum_lactate': (0, 15),  # mmol/L
    'serum_magnesium': (0.5, 6.0),  # mg/dL
    # --- clotting times ---
    'PTT': (15, 150),  # seconds
    'PT': (8, 50),  # seconds
    # --- BUN / creatinine ---
    'BUN': (2, 100),  # mg/dL
    'serum_creatinine': (0.1, 10),  # mg/dL
    # --- blood counts ---
    'hematocrit': (10, 70),  # percentage (%)
    'hemoglobin': (4.0, 22.0),  # g/dL
    'platelets': (10, 1000),  # thousands per microliter (K/uL)
    'WBC': (1, 100),  # thousands per microliter (K/uL)
    'RBC': (2, 7),  # millions per microliter (m/uL)
    # --- bicarbonate / pH ---
    'serum_bicarbonate': (5, 45),  # mEq/L
    'pH': (6.8, 7.8),  # pH units
}


In [108]:
import copy

# Snapshot of the positive cohort taken before the range filtering below.
# dict.copy() is shallow: the per-patient dicts would be shared, so when the
# filtering cell assigns positive_patients_data[key][item] = ... the "copy"
# would change as well. deepcopy makes the snapshot independent.
# (Despite the name, test_df is a dict of dicts, not a DataFrame.)
test_df = copy.deepcopy(positive_patients_data)

In [109]:
# Range-filter the positive cohort: for every measurement listed in
# normal_range_dict_1, keep only rows whose 'valuenum' lies within
# [lower_limit, upper_limit] (both ends inclusive). Rows whose valuenum is NaN
# fail both comparisons and are dropped as well.
test_items = normal_range_dict_1.keys()
for key in positive_patients_data.keys():
    for item in test_items:
        lower_limit, upper_limit = normal_range_dict_1[item]
        if not positive_patients_data[key][item].empty:
            df = positive_patients_data[key][item]
            # The second condition must check the UPPER bound; it previously
            # repeated the lower-bound test, so values above upper_limit were
            # never removed.
            positive_patients_data[key][item] = df[(df['valuenum'] >= lower_limit) &
                                                   (df['valuenum'] <= upper_limit)]


In [110]:
# Range-filter the negative cohort: for every measurement listed in
# normal_range_dict_1, keep only rows whose 'valuenum' lies within
# [lower_limit, upper_limit] (both ends inclusive). Rows whose valuenum is NaN
# fail both comparisons and are dropped as well.
for key in negative_patients_data.keys():
    for item, (lower_limit, upper_limit) in normal_range_dict_1.items():
        if not negative_patients_data[key][item].empty:
            df = negative_patients_data[key][item]
            # The second condition must check the UPPER bound; it previously
            # repeated the lower-bound test, so values above upper_limit were
            # never removed.
            negative_patients_data[key][item] = df[(df['valuenum'] >= lower_limit) &
                                                   (df['valuenum'] <= upper_limit)]

In [111]:
def convert_complex_types(o):
    """Fallback encoder for ``json.dump(..., default=convert_complex_types)``.

    ``json`` calls this only for objects it cannot serialise itself.

    Conversions:
        * ``pd.NaT``             -> ``None`` (written as JSON ``null``)
        * numpy boolean scalar   -> ``bool``
        * any numpy integer      -> ``int``   (not just int32/int64)
        * any numpy float        -> ``float`` (not just float32/float64)
        * ``pd.DataFrame``       -> list of row dicts (``orient='records'``)
        * ``pd.Timestamp``       -> ISO-8601 string

    Raises:
        TypeError: for any other type, matching the contract ``json`` expects
            from a ``default`` hook.
    """
    # NaT is not a Timestamp instance, so it needs its own check; without it a
    # single missing timestamp would abort the whole dump.
    if o is pd.NaT:
        return None
    # Handle numpy scalar types via their abstract base classes
    if isinstance(o, np.bool_):
        return bool(o)
    if isinstance(o, np.integer):
        return int(o)
    if isinstance(o, np.floating):
        return float(o)
    # Handle pandas DataFrame, converting to a list of per-row dictionaries
    if isinstance(o, pd.DataFrame):
        return o.to_dict(orient='records')
    # Handle pandas Timestamp
    if isinstance(o, pd.Timestamp):
        return o.isoformat()  # Convert Timestamp to ISO format string
    raise TypeError(f"Object of type {type(o)} is not JSON serializable")

# Write each filtered cohort to its own JSON file; DataFrames, Timestamps and
# numpy scalars are converted by convert_complex_types.
# NOTE(review): these names are relative to the working directory, whereas the
# CSV cells read 'C:/Users/Rui/Desktop/Data/processed_*_patients_data.json' --
# confirm the kernel is started from that folder.
for output_name, cohort in (
    ('processed_positive_patients_data.json', positive_patients_data),
    ('processed_negative_patients_data.json', negative_patients_data),
):
    with open(output_name, 'w') as json_file:
        json.dump(cohort, json_file, default=convert_complex_types, indent=4)

print("Processed Positive and negative patients data stored separately as JSON files.")


Processed Positive and negative patients data stored separately as JSON files.


# Process into .csv

In [112]:
# Flatten the filtered positive-cohort JSON into one row per stay_id:
# list-valued entries (heart_rate, SpO2, ...) are reduced to max / min / mean /
# median columns, and non-list entries are copied over unchanged.

import pandas as pd
import numpy as np

import json

# Filtered positive-cohort JSON (absolute path on the author's local Windows machine)
file_path = 'C:/Users/Rui/Desktop/Data/processed_positive_patients_data.json'

# Parse the whole file into plain Python objects (dicts / lists / scalars)
with open(file_path, 'r') as file:
    data = json.load(file)

def compute_stats(values):
    """Return ``(max, min, mean, median)`` of ``values``.

    A falsy ``values`` (e.g. an empty list) yields four NaNs.
    """
    if not values:
        return np.nan, np.nan, np.nan, np.nan

    observed = pd.Series(values)
    highest = observed.max()
    lowest = observed.min()
    average = observed.mean()
    middle = observed.median()
    return highest, lowest, average, middle

# One output row (dict) per stay
rows = []

for stay_id, patient_data in data.items():
    row = {'stay_id': stay_id}

    for key, value in patient_data.items():
        # Scalar fields are copied straight into the row
        if not isinstance(value, list):
            row[key] = value
            continue

        # List fields: collect 'valuenum' from every entry that has one and
        # summarise. An empty list yields no values, and compute_stats then
        # returns NaN for all four statistics.
        values = [entry['valuenum'] for entry in value if 'valuenum' in entry]
        stats = compute_stats(values)
        for suffix, stat in zip(('max', 'min', 'mean', 'median'), stats):
            row[f"{key}_{suffix}"] = stat

    rows.append(row)

# Assemble the table in one go from the collected rows
df = pd.DataFrame(rows)

# Write the table to CSV without the index column
output_path = 'C:/Users/Rui/Desktop/Data/processed_positive_patients_data.csv'
df.to_csv(output_path, index=False)

# Bare expression: only displayed by Jupyter when it is the last statement of the cell
output_path

# compute_stats, rows and df are already defined and built earlier in this
# cell. The code that used to follow was a verbatim repeat of that definition
# and loop, producing the identical DataFrame a second time, so only the final
# display is kept.
df


Unnamed: 0,stay_id,gender,age,race,height,weight,BMI,heart_rate_max,heart_rate_min,heart_rate_mean,...,CPD,diabetes,liver_disease,PUD,PVD,MI,CD,hiv,CDK,cancer
0,38875437,F,81,WHITE,,43.0,,120.0,93.0,105.207317,...,0,0,0,0,0,0,0,0,0,0
1,39635619,M,51,UNKNOWN,180.0,112.5,34.722222,107.0,82.0,95.352941,...,0,0,0,0,0,0,0,0,0,0
2,37005236,F,72,WHITE,163.0,63.1,23.749482,111.0,79.0,87.833333,...,0,0,0,0,0,0,0,0,0,0
3,32769810,F,70,BLACK/AFRICAN,,69.9,,114.0,101.0,106.750000,...,0,0,0,0,0,0,0,0,0,0
4,34107647,F,70,WHITE,163.0,96.0,36.132335,124.0,96.0,106.428571,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2146,32959861,F,78,UNKNOWN,160.0,57.0,22.265625,109.0,85.0,95.785714,...,0,0,0,0,0,0,0,0,0,0
2147,31629173,M,76,WHITE,180.0,134.5,41.512346,90.0,72.0,79.500000,...,0,0,0,0,0,0,0,0,0,0
2148,32568753,F,82,WHITE,,63.1,,102.0,85.0,95.846154,...,0,0,0,0,0,0,0,0,0,0
2149,32641669,F,72,BLACK/AFRICAN AMERICAN,,77.1,,81.0,59.0,70.440000,...,0,0,0,0,0,0,0,0,0,0


In [113]:
# Flatten the filtered negative-cohort JSON into one row per stay_id:
# list-valued entries (heart_rate, SpO2, ...) are reduced to max / min / mean /
# median columns, and non-list entries are copied over unchanged.

import pandas as pd
import numpy as np

import json

# Filtered negative-cohort JSON (absolute path on the author's local Windows machine)
file_path = 'C:/Users/Rui/Desktop/Data/processed_negative_patients_data.json'

# Parse the whole file into plain Python objects (dicts / lists / scalars)
with open(file_path, 'r') as file:
    data = json.load(file)

def compute_stats(values):
    """Return ``(max, min, mean, median)`` of ``values``.

    A falsy ``values`` (e.g. an empty list) yields four NaNs.
    """
    if not values:
        return np.nan, np.nan, np.nan, np.nan

    observed = pd.Series(values)
    highest = observed.max()
    lowest = observed.min()
    average = observed.mean()
    middle = observed.median()
    return highest, lowest, average, middle

# One output row (dict) per stay
rows = []

for stay_id, patient_data in data.items():
    row = {'stay_id': stay_id}

    for key, value in patient_data.items():
        # Scalar fields are copied straight into the row
        if not isinstance(value, list):
            row[key] = value
            continue

        # List fields: collect 'valuenum' from every entry that has one and
        # summarise. An empty list yields no values, and compute_stats then
        # returns NaN for all four statistics.
        values = [entry['valuenum'] for entry in value if 'valuenum' in entry]
        stats = compute_stats(values)
        for suffix, stat in zip(('max', 'min', 'mean', 'median'), stats):
            row[f"{key}_{suffix}"] = stat

    rows.append(row)

# Assemble the table in one go from the collected rows
df = pd.DataFrame(rows)

# Write the table to CSV without the index column
output_path = 'C:/Users/Rui/Desktop/Data/processed_negative_patients_data.csv'
df.to_csv(output_path, index=False)

# Bare expression: only displayed by Jupyter when it is the last statement of the cell
output_path

# compute_stats, rows and df are already defined and built earlier in this
# cell. The code that used to follow was a verbatim repeat of that definition
# and loop, producing the identical DataFrame a second time, so only the final
# display is kept.
df

Unnamed: 0,stay_id,gender,age,race,height,weight,BMI,heart_rate_max,heart_rate_min,heart_rate_mean,...,CPD,diabetes,liver_disease,PUD,PVD,MI,CD,hiv,CDK,cancer
0,37510196,F,77,BLACK/AFRICAN AMERICAN,157.0,65.0,26.370238,80.0,38.0,71.392857,...,0,0,0,0,0,0,0,0,0,0
1,31090461,F,82,WHITE,,48.0,,107.0,68.0,84.920000,...,0,0,0,0,0,0,0,0,0,0
2,32610785,F,77,WHITE,,41.6,,84.0,53.0,64.880000,...,0,0,0,0,0,0,0,0,0,1
3,35479615,F,81,WHITE,,48.4,,109.0,67.0,89.259259,...,0,0,0,0,0,0,0,0,0,0
4,35044219,M,53,WHITE,178.0,156.1,49.267769,150.0,73.0,93.928571,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4837,32880062,M,47,UNKNOWN,157.0,70.0,28.398718,86.0,72.0,79.625000,...,0,0,0,0,0,0,0,0,0,0
4838,37403074,M,72,WHITE,,62.2,,99.0,81.0,91.708333,...,0,0,0,0,0,0,0,0,0,0
4839,31417783,F,72,BLACK/AFRICAN AMERICAN,,77.6,,79.0,66.0,71.400000,...,0,0,0,0,0,0,0,0,0,0
4840,30988867,M,45,UNKNOWN,180.0,90.0,27.777778,120.0,43.0,86.272727,...,0,0,0,0,0,0,0,0,0,0
