# Creating groups of laboratory analyses

The table in `categorized_analyses.csv` was created with the help of a medical doctor.
We will use these groups of analyses to build additional data sources derived from laboratory data.

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the categorized analyses table, discard the French category labels and
# expose the English ones under the shorter column name `category`.
analyses = (
    pd.read_csv("categorized_analyses.csv")
    .drop(columns="category_fr")
    .rename(columns={"category_en": "category"})
)

In [4]:
# Preview the loaded table; pandas elides the middle rows when rendering it.
analyses

Unnamed: 0,label,itemid,event_count,category
0,Hematocrit,51221,3325700,hematology
1,Creatinine,50912,3282278,renal
2,Platelet Count,51265,3216656,hematology
3,Urea Nitrogen,51006,3189474,renal
4,Hemoglobin,51222,3188835,hematology
...,...,...,...,...
910,Voided Specimen,52313,1,
911,Young Cells,51459,1,
912,Young,52371,1,
913,Blasts,52294,1,


In [5]:
# Distinct category labels found in the table; analyses that were not assigned
# a category appear as a single NaN entry.
categories = analyses["category"].unique()
(categories, len(categories))

(array(['hematology', 'renal', 'metabolic', nan, 'hepatology', 'nutrition',
        'infectiology', 'cardiology', 'endocrine', 'muscular',
        'toxicology', 'reproduction', 'immunology_inflammation',
        'tumor_marker', 'body_fluids', 'pulmonary', 'hepatic_renal'],
       dtype=object),
 17)

In [51]:
MAX_ANALYSES_BY_GROUP = 20  # arbitrary cap, prevents groups with too many features
# Analyses with MIN_EVENT_COUNT events or fewer are discarded as too rare.
# The threshold was set so that each group retains at least one analysis.
MIN_EVENT_COUNT = 7000

# Map each category to its most frequent analyses (at most MAX_ANALYSES_BY_GROUP).
# The explicit descending sort guarantees that the cap keeps the analyses with the
# highest event counts even if the CSV rows are not already ordered that way;
# mergesort is stable, so rows with equal counts keep their file order.
# groupby skips rows whose category is NaN, so uncategorized analyses are left out.
categorized_analyses = {
    category: (
        group[group["event_count"] > MIN_EVENT_COUNT]
        .sort_values("event_count", ascending=False, kind="mergesort")
        .iloc[:MAX_ANALYSES_BY_GROUP]
    )
    for category, group in analyses.groupby("category")
}
categories = list(categorized_analyses.keys())

In [52]:
# Check which hepatic_renal analyses survived the filtering, most frequent first.
categorized_analyses["hepatic_renal"].sort_values(by="event_count", ascending=False)

Unnamed: 0,label,itemid,event_count,category
297,"Total Protein, Pleural",51059,7387,hepatic_renal


In [58]:
# Summary table with one row per category, built in a single aggregation pass:
#   event_count  - total number of events over all analyses of the category
#   max_count    - event count of the category's most frequent analysis
#   min_count    - event count of the category's least frequent analysis
#   num_elements - number of analyses in the category
# Rows are ordered from the largest to the smallest total event count.
grouped_analyses = (
    analyses.groupby("category")["event_count"]
    .agg(event_count="sum", max_count="max", min_count="min", num_elements="size")
    .sort_values("event_count", ascending=False)
)

In [59]:
# Display the per-category summary statistics.
grouped_analyses

Unnamed: 0_level_0,event_count,max_count,min_count,num_elements
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hematology,49761099,3325700,1,123
metabolic,23960259,2972827,25,46
renal,18531524,3282278,14,43
hepatology,6575882,1348729,4,13
infectiology,5244004,682923,115,78
nutrition,1876534,749944,53116,6
toxicology,952582,88720,98,25
cardiology,682891,359000,163,6
endocrine,549928,316395,3180,9
muscular,280892,280892,280892,1


In [60]:
# For every category, keep only the item identifiers of its selected analyses.
categorized_analyses_codes = {
    category: table["itemid"].tolist()
    for category, table in categorized_analyses.items()
}

In [61]:
import json

In [62]:
output_file = "categorized_analyses.json"

# Export the category -> item codes mapping as JSON with one category per line:
#     "category": [code,code,...]
# Each entry is produced by serializing a one-key dict and stripping its outer
# braces, so keys and values are escaped by the json module itself.
entries = [
    "    " + json.dumps({category: codes}, separators=(",", ": "))[1:-1]
    for category, codes in categorized_analyses_codes.items()
]

# Assemble the whole document in memory and write it once. Joining with ",\n"
# places commas only between entries, so no trailing comma has to be removed
# afterwards, and an empty mapping still yields a valid (empty) JSON object.
with open(output_file, "w") as f:
    f.write("{\n" + ",\n".join(entries) + "\n}\n")