# Dependencies

In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt

from utils import unique

In [None]:
# Load the extended MRI summary spreadsheet into a single DataFrame.
MRI_SUMMARY_PATH = "/home/simjo484/Desktop/link_to_xml_data/MRI_summary_extended.xlsx"
data = pd.read_excel(MRI_SUMMARY_PATH)

# Quick overview: table dimensions first, then the available column names.
table_shape = data.shape
print("Shape: ", table_shape, "\n")
print(data.columns)


# Notes
* gender: there are about 100,000 female, about 125,000 male, and fewer than 25,000 "Not available" entries.

* tumor_descriptor: Seemingly non-informative.

# Information about the pre-op patients

In [None]:
# Print the gender counts of pre-op patients for every diagnosis that has
# enough patients.

# Minimum number of de-duplicated pre-op rows a diagnosis needs to be kept.
MIN_PATIENTS_PER_DIAGNOSIS = 18

# Keep only the observations recorded before the operation.
data_one = data[data["session_status"] == "pre_op"]

# One row per (patient, diagnosis) pair, so that a patient is counted once per
# diagnosis (potentially a patient could have two diagnoses).
data_preop_noduplicates = data_one.drop_duplicates(subset=["subjetID", "diagnosis"])

# Count rows per diagnosis. `unique` is the project-local helper from `utils`;
# this cell reads the "Values" and "Counts" columns of its result.
data_two = unique(data_preop_noduplicates["diagnosis"])
diagnoses = data_two[data_two["Counts"] >= MIN_PATIENTS_PER_DIAGNOSIS]["Values"].tolist()

# Restrict the pre-op observations to the retained diagnoses.
data_one = data_one[data_one["diagnosis"].isin(diagnoses)]

# Print the number of each gender for each diagnosis, based on the
# de-duplicated rows so that repeated scans do not inflate the counts.
for diagnosis in diagnoses:
    has_diagnosis = data_preop_noduplicates["diagnosis"] == diagnosis
    counts = unique(data_preop_noduplicates[has_diagnosis]["gender"])

    print(diagnosis)
    print(counts)
    print("\n")

In [9]:
# Keep only the rows whose image type is one of the T2-weighted variants or ADC.
# `Series.isin` builds the boolean mask in one vectorised call instead of a
# Python-level loop over every row, matching how the diagnosis filter is written.
T2_AND_ADC_IMAGE_TYPES = ["T2W", "T2W_TRIM", "ADC", "T2W_FLAIR"]
data_t2_and_adc = data[data["image_type"].isin(T2_AND_ADC_IMAGE_TYPES)]

# Distribution of survival

In [None]:
# Restrict the table to the pre-operation observations.
is_pre_op = data["session_status"] == "pre_op"
data_preop = data[is_pre_op]

# Histogram of "survival", using one row per (patient, diagnosis) pair.
# The unit is assumed to be days: the values span roughly 0 to 10,000 days
# (about 27 years), with a mean of roughly 4,500 days (about 12 years).
survival_per_patient = data_preop.drop_duplicates(subset=["subjetID", "diagnosis"])["survival"]
survival_per_patient.plot(kind="hist")

# Atypical Teratoid Rhabdoid Tumor (ATRT)

In [None]:
# Select every row diagnosed as ATRT and report how many there are.
is_atrt = data["diagnosis"] == "Atypical Teratoid Rhabdoid Tumor (ATRT)"
data_atrt = data[is_atrt]
print(data_atrt.shape)

# Two stacked bar charts with independent x-axes: gender on top, ethnicity below.
fig, (ax_gender, ax_ethnicity) = plt.subplots(nrows=2, sharex=False)

# np.unique returns the distinct values and, with return_counts=True, how often each occurs.
gender_values, gender_counts = np.unique(data_atrt["gender"], return_counts=True)
ax_gender.bar(x=gender_values, height=gender_counts)
ax_gender.set_xlabel("Distribution of gender")

ethnicity_values, ethnicity_counts = np.unique(data_atrt["ethnicity"], return_counts=True)
ax_ethnicity.bar(x=ethnicity_values, height=ethnicity_counts)
ax_ethnicity.set_xlabel("Distribution of ethnicity")

# Figures

In [None]:
# Bar charts over the whole dataset: gender on top, ethnicity below,
# each with its own x-axis.
fig, (ax_gender, ax_ethnicity) = plt.subplots(nrows=2, sharex=False)

# np.unique returns the distinct values and, with return_counts=True, how often each occurs.
gender_values, gender_counts = np.unique(data["gender"], return_counts=True)
ax_gender.bar(x=gender_values, height=gender_counts)
ax_gender.set_xlabel("Distribution of gender")

ethnicity_values, ethnicity_counts = np.unique(data["ethnicity"], return_counts=True)
ax_ethnicity.bar(x=ethnicity_values, height=ethnicity_counts)
ax_ethnicity.set_xlabel("Distribution of ethnicity")