In [1]:
import pandas as pd
import numpy as np
import os
# load df_new
new_dataset_info = pd.read_excel('/home/ubuntu/tenerife/data/ICH_tabular_data/UPDATED_Hemorragias_Amaia_new_dataset_info.xlsx')

In [2]:
# Set data directory
directory = "/home/ubuntu/tenerife/data/ICH_nii"
# read images and corresponding labels from directory
goodPrognosis_images = sorted(os.listdir(os.path.join(directory, "GOOD_PROGNOSIS")))
print(f"Good prognosis images(0): {len(goodPrognosis_images)}")
PoorPrognosis_images = sorted(os.listdir(os.path.join(directory, "POOR_PROGNOSIS")))
print(f"Poor prognosis images (1): {len(PoorPrognosis_images)}")

# read CLINICAL DATA
clinical_data = pd.read_csv('/home/ubuntu/tenerife/data/ICH_tabular_data/CLINICAL_DATA_ICH.csv', delimiter=',')
clinical_data.columns
clinical_data = clinical_data.rename(columns={'PatientID': 'Patient'})
# drop row where Patient is 213
clinical_data_filtered = clinical_data[~clinical_data['Patient'].isin([213])]
clinical_data_filtered = clinical_data_filtered.reset_index(drop=True)

# reshape the input tensor to merge the depth dimension into the channel dimension to fit in timm model 
def reshape_input(x):
    batch_size, channels, depth, height, width = x.size()
    x = x.view(batch_size, channels * depth, height, width)
    return x

images_all=[]
labels_all=[]
# loop over Patient in clinical_data
for i in range(len(clinical_data)):
    # get patient ID
    patientID = clinical_data['Patient'][i]
    # get label
    label = clinical_data['Label (poor_prognosis)'][i]
    # read corresponding image
    if label==0:
        # check that image exists
        if os.path.isfile(os.path.join(directory, "GOOD_PROGNOSIS", str(patientID) + ".nii.gz")):
            images_all.append(os.path.join(directory, "GOOD_PROGNOSIS", str(patientID) + ".nii.gz"))
            labels_all.append(label)
    elif label==1:
        # check that image exists
        if os.path.isfile(os.path.join(directory, "POOR_PROGNOSIS", str(patientID) + ".nii.gz")):
            images_all.append(os.path.join(directory, "POOR_PROGNOSIS", str(patientID) + ".nii.gz"))
            labels_all.append(label)
    else:
        print("ERROR: for Patient", patientID, "label not found")
print("Number of images:", len(images_all))
print("Number of labels:", len(labels_all))
images_all=np.array(images_all)
labels_all=np.array(labels_all)

Good prognosis images(0): 99
Poor prognosis images (1): 162
Number of images: 261
Number of labels: 261


In [4]:
# create pandas df tostore for each image the ID, manufacturer, insitutionName and label
df_images = pd.DataFrame(columns=['Patient', 'Manufacturer', 'InstitutionName', 'Label'])

# iterate over images_all
for i in range(len(images_all)):
    # get image path
    image_path = images_all[i]
    # get image name
    image_name = os.path.basename(image_path)
    # get patient ID
    patientID = image_name.split(".")[0]
    # get label
    label = labels_all[i]
    # get image info from new_dataset_info
    image_info = new_dataset_info[new_dataset_info['PatientID']==int(patientID)]
    # get manufacturer for that image
    manufacturer = image_info['Manufacturer'].values[0]
    # get InstitutionName
    institutionName = image_info['InstitutionName'].values[0]
    # store in df_images
    df_images.loc[i] = [patientID, manufacturer, institutionName, label]


In [31]:
import nibabel as nib
import matplotlib.pyplot as plt
from monai.transforms import Compose, LoadImaged, Rotate90d, Flipd, NormalizeIntensityd, ThresholdIntensityd, Resized, SpatialPadd, Transposed

# Load the NIfTI file
ct_scan = nib.load(images_all[0])

# Get the image data from the NIfTI file
ct_data = ct_scan.get_fdata()

# Determine the number of slices in the CT scan
num_slices = ct_data.shape[-1]
image_shape=512
# Choose 8 slices evenly spaced throughout the scan (you can adjust this as needed)
selected_slices_indices = range(0, num_slices+1,4)

val_transforms = Compose([
    LoadImaged(keys="image", image_only=True, ensure_channel_first=True, reader="ITKReader"), 
    Rotate90d(keys="image", k=3), 
    Flipd(keys="image", spatial_axis=1),
    # NormalizeIntensityd(keys="image", subtrahend=15, divisor=85),
    # ThresholdIntensityd(keys="image", threshold=0, above=True, cval=0.0),
    # ThresholdIntensityd(keys="image", threshold=1, above=False, cval=0.0),
    Resized(keys="image", spatial_size=[image_shape, image_shape, -1]),
    SpatialPadd(keys="image", spatial_size=[-1, -1, -1], mode='constant', method='symmetric'),
    Transposed(keys="image", indices=[0, 3, 1, 2])
])

preprocessed_volume = val_transforms({'image': images_all[0]})

# Extract the preprocessed volume
preprocessed_data = preprocessed_volume['image']
image=preprocessed_data[0]

for i, slice_index in enumerate(selected_slices_indices): 
    print(i, slice_index)   
    # Plot and save the preprocessed image as PNG
    # plot preprocessed data for that slice
    plt.figure(figsize=(10, 10))
    plt.imshow(image[i, :, :], cmap='gray')
    plt.axis('off')
    plt.savefig(f'/home/ubuntu/tenerife/data/ICH_results/FiguresPresentation/BeforePreprocessing/BeforePreprocessing_slice_{slice_index}.png', bbox_inches='tight', pad_inches=0, dpi=300)
    plt.close()


0 0
1 4
2 8
3 12
4 16
5 20
6 24


torch.Size([24, 512, 512])

In [5]:
df_images.InstitutionName.value_counts()

InstitutionName
Hospital Valdecilla    62
HOSPITAL VALDECILLA    34
PMSTL                  21
Name: count, dtype: int64

In [6]:
df_images.Manufacturer.value_counts()

Manufacturer
GE MEDICAL SYSTEMS    163
Philips                98
Name: count, dtype: int64

In [9]:
# see distribution of df_images by manufacturer and label
df_images.groupby(['Manufacturer', 'Label']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Patient,InstitutionName
Manufacturer,Label,Unnamed: 2_level_1,Unnamed: 3_level_1
GE MEDICAL SYSTEMS,0,60,25
GE MEDICAL SYSTEMS,1,103,37
Philips,0,39,21
Philips,1,59,34


### Revise clinical data

In [6]:
# clinical data
df = clinical_data_filtered
# rename patient column
# df = df.rename(columns={'Patient': 'PatientID'})

In [14]:
from sklearn.preprocessing import OneHotEncoder
# Create the OneHotEncoder
one_hot_encoder = OneHotEncoder(sparse_output=False)

categorical_var=['Gender', 'Smoker', 'Alcohol', 'HT',
        'DM', 'Dyslipidemia', 'MH_ICH', 'MH_CVD', 'MH_neuro', 'MH_dementia',
        'MH_cancer', 'MH_hematho', 'MH_others', 'Anticoagulant',
        'Antiaggregant', 'Antihypertensive_drugs', 'Calcium_antag', 'Alpha_blockers', 'PE_neuro', 'Cause_head_trauma']
    

# Fit the encoder on the training data and transform the training, test, and validation data
encoded_train = one_hot_encoder.fit_transform(df[categorical_var])
# Convert these columns into a DataFrame
encoded_train_df = pd.DataFrame(encoded_train, columns=one_hot_encoder.get_feature_names_out(categorical_var))
encoded_train_df.head()

Unnamed: 0,Gender_0,Gender_1,Smoker_0,Smoker_1,Alcohol_0,Alcohol_1,HT_0,HT_1,DM_0,DM_1,...,Antihypertensive_drugs_0,Antihypertensive_drugs_1,Calcium_antag_0,Calcium_antag_1,Alpha_blockers_0,Alpha_blockers_1,PE_neuro_0,PE_neuro_1,Cause_head_trauma_0,Cause_head_trauma_1
0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
1,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
3,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
4,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0


In [25]:
b=np.array([1,2,3,4])
a=np.full((len(b)), 1)
combined = np.column_stack((b, a))

In [26]:
# create empty df with 2 columns
df_encoded = pd.DataFrame(columns=['b', 'a'])
df_prueba=pd.concat([df_encoded, pd.DataFrame(combined, columns=['b', 'a'])], ignore_index=True)
df_prueba

Unnamed: 0,b,a
0,1,1
1,2,1
2,3,1
3,4,1


In [28]:
df_prueba=pd.concat([df_prueba, pd.DataFrame(combined, columns=['b', 'a'])], ignore_index=True)
df_prueba

Unnamed: 0,b,a
0,1,1
1,2,1
2,3,1
3,4,1
4,1,1
5,2,1
6,3,1
7,4,1


In [121]:
df['GCS'].value_counts()

GCS
15    60
11    52
14    43
3     22
12    13
13    13
10    11
8      9
5      9
9      8
7      8
4      8
6      5
Name: count, dtype: int64

In [8]:
df.columns

Index(['PatientID', 'Label (poor_prognosis)', 'Gender', 'Age',
       'Antihypertensive_drugs', 'Smoker', 'Alcohol', 'HT', 'DM',
       'Dyslipidemia', 'MH_ICH', 'MH_CVD', 'MH_neuro', 'MH_dementia',
       'MH_cancer', 'MH_hematho', 'MH_others', 'Anticoagulant',
       'Antiaggregant', 'Calcium_antag', 'Alpha_blockers', 'Systolic_AP',
       'Dyastolic_AP', 'OxSat', 'Tº', 'HR', 'RF', 'PE_neuro', 'GCS', 'Glucose',
       'Creatinine', 'Urea', 'Sodium', 'Potassium', 'WBC', 'Hgb', 'Platelets',
       'MCV', 'RCDW', 'CMHC', 'MPV', 'INR', 'Fibrinogen'],
      dtype='object')

In [62]:
# read csv /home/ubuntu/tenerife/data/ICH_tabular_data/AcudePorTraumatismos.csv
acude_por_traumatismo = pd.read_csv('/home/ubuntu/tenerife/data/ICH_tabular_data/AcudePorTraumatismos.csv', delimiter=',')
causa_traumatismo = pd.read_csv('/home/ubuntu/tenerife/data/ICH_tabular_data/ALL_sinnormalizar_CausaTraumatismo.csv', delimiter=',')

In [66]:
acude_por_traumatismo = acude_por_traumatismo.rename(columns={'Marca temporal': 'PatientID', 'Acude por traumatismo': 'acude_por_traumatismo'})
acude_por_traumatismo.columns

Index(['PatientID', 'acude_por_traumatismo'], dtype='object')

In [71]:
causa_traumatismo = causa_traumatismo.rename(columns={'Marca temporal': 'PatientID'})
causa_traumatismo.columns

Index(['PatientID', 'Muerte', 'SEXO', 'Age', 'Tratamiento_antihipertensivo',
       'Fumador', 'Alcohol', 'HTA', 'DM', 'Dislipemia',
       'AP.de.hemorragias.intracraneales.previas',
       'AP.de.enfermedades.cardiovasculares',
       'AP.de.enfermedades.neurológicas', 'AP.deterioro.cognitivo.demencia',
       'AP.de.enfermedades.tumorales', 'AP.de.enfermedades.hematológicas',
       'AP.de.otras.enfermedades.mayores', 'Tratamiento.Anticoagulante',
       'Tratamiento.Antiagregante', 'Antihipertensivos.Calcioantagonistas',
       'Antihipertensivos.Alfa.bloqueantes', 'TA.sistólica', 'TA.diastólica',
       'Sat02.', 'Tº', 'FC', 'FR', 'EF_Alteraciones.neurológicas', 'GCS',
       'Número.de.días.de.ingreso.en.UCI', 'Causa.del.sangrado..Idiopática',
       'Causa.del.sangrado..Causa.tratable', 'Causa.del.sangrado..Traumatismo',
       'Causa.del.sangrado..Aneurisma', 'NeuroQx', 'Glucosa', 'Creatinina',
       'Urea', 'Sodio', 'Potasio', 'Leucocitos', 'Hb', 'Plaquetas', 'VCM',
       'A

In [67]:
# check if column acude_por_traumatismo in acude_por_traumatismo is the same as 
# Causa.del.sangrado..Traumatismo. in causa_traumatismo
acude_por_traumatismo.acude_por_traumatismo.value_counts()

acude_por_traumatismo
0.0    66
1.0    49
Name: count, dtype: int64

In [69]:
causa_traumatismo['Causa.del.sangrado..Traumatismo'].value_counts()

Causa.del.sangrado..Traumatismo
0    180
1    146
Name: count, dtype: int64

In [74]:
# see if all patients in df have a value in causa_traumatismo
df[df['PatientID'].isin(causa_traumatismo['PatientID'])==False]

Unnamed: 0,PatientID,Label (poor_prognosis),Gender,Age,Antihypertensive_drugs,Smoker,Alcohol,HT,DM,Dyslipidemia,...,Potassium,WBC,Hgb,Platelets,MCV,RCDW,CMHC,MPV,INR,Fibrinogen


In [113]:
# include column Causa.del.sangrado..Traumatismo in df merging only patients that are in df
df_updated = pd.merge(df, causa_traumatismo[['PatientID', 'Causa.del.sangrado..Traumatismo']].drop_duplicates(), on='PatientID', how='left')

In [114]:
df_updated['Causa.del.sangrado..Traumatismo'].value_counts()

Causa.del.sangrado..Traumatismo
0    149
1    112
Name: count, dtype: int64

In [115]:
df_updated.shape

(261, 44)

In [118]:
# rename column Causa.del.sangrado..Traumatismo to Cause_head_trauma
df_updated = df_updated.rename(columns={'Causa.del.sangrado..Traumatismo': 'Cause_head_trauma'})
df_updated

Unnamed: 0,PatientID,Label (poor_prognosis),Gender,Age,Antihypertensive_drugs,Smoker,Alcohol,HT,DM,Dyslipidemia,...,WBC,Hgb,Platelets,MCV,RCDW,CMHC,MPV,INR,Fibrinogen,Cause_head_trauma
0,1,1,1,74,1,0,0,1,1,1,...,5.1,13.3,107,85.0,17.0,33.0,9.0,4.21,344,1
1,3,0,0,78,0,0,0,0,0,1,...,6.1,12.2,219,91.0,14.0,33.1,8.7,3.16,298,0
2,4,1,0,79,0,0,0,0,0,1,...,6.9,12.7,290,86.0,15.8,34.1,7.6,1.09,344,0
3,6,1,0,88,0,0,0,0,0,0,...,5.5,14.5,217,98.0,13.4,32.8,7.9,0.98,332,0
4,7,0,0,43,0,0,0,0,0,0,...,10.0,13.3,335,86.0,13.1,34.2,8.1,1.13,534,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256,294,1,1,74,0,0,0,1,0,0,...,7.7,11.6,139,91.0,14.3,34.6,8.2,2.72,410,1
257,296,0,1,80,1,0,1,1,0,0,...,5.5,15.1,192,93.0,14.5,33.1,8.7,1.11,465,1
258,298,1,0,55,0,0,1,1,1,1,...,7.4,11.4,183,97.0,15.4,32.5,8.7,1.22,838,0
259,299,1,1,59,0,0,0,1,1,1,...,16.4,13.7,210,86.0,13.4,33.1,7.8,0.96,217,0


In [119]:
# update csv with df_updated
df_updated.to_csv('/home/ubuntu/tenerife/data/ICH_tabular_data/CLINICAL_DATA_ICH.csv', index=False)