# Experimental Data Preprocessing -> Considering image_level_labels

In [88]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import seaborn as sns
import os

In [3]:
# Load the image-level label table from the competition input folder and
# preview its first rows.
data_labels = pd.read_csv("/kaggle/input/rsna-2023-abdominal-trauma-detection/image_level_labels.csv")
data_labels.head()

Unnamed: 0,patient_id,series_id,instance_number,injury_name
0,10004,21057,362,Active_Extravasation
1,10004,21057,363,Active_Extravasation
2,10004,21057,364,Active_Extravasation
3,10004,21057,365,Active_Extravasation
4,10004,21057,366,Active_Extravasation


In [4]:
# Column dtypes and non-null counts of the label table.
data_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12029 entries, 0 to 12028
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   patient_id       12029 non-null  int64 
 1   series_id        12029 non-null  int64 
 2   instance_number  12029 non-null  int64 
 3   injury_name      12029 non-null  object
dtypes: int64(3), object(1)
memory usage: 376.0+ KB


In [5]:
# Rows whose injury_name is "Active_Extravasation".
data_labels.loc[data_labels["injury_name"] == "Active_Extravasation"]

Unnamed: 0,patient_id,series_id,instance_number,injury_name
0,10004,21057,362,Active_Extravasation
1,10004,21057,363,Active_Extravasation
2,10004,21057,364,Active_Extravasation
3,10004,21057,365,Active_Extravasation
4,10004,21057,366,Active_Extravasation
...,...,...,...,...
12024,9632,3750,155,Active_Extravasation
12025,9632,3750,156,Active_Extravasation
12026,9632,3750,157,Active_Extravasation
12027,9632,3750,158,Active_Extravasation


In [6]:
# Rows whose injury_name is "Bowel".
data_labels.loc[data_labels["injury_name"] == "Bowel"]

Unnamed: 0,patient_id,series_id,instance_number,injury_name
105,10065,37324,48,Bowel
106,10065,37324,49,Bowel
107,10065,37324,50,Bowel
108,10065,37324,51,Bowel
109,10065,37324,52,Bowel
...,...,...,...,...
11987,8684,38440,147,Bowel
11988,8684,38440,148,Bowel
11989,8684,38440,149,Bowel
11990,8684,38440,150,Bowel


In [7]:
# Distinct patient_id values in the label table, and how many there are.
patients = data_labels["patient_id"].unique().tolist()
len(patients)

246

In [8]:
# Distinct series_id values in the label table, and how many there are.
series = data_labels["series_id"].unique().tolist()
len(series)

330

### We focus only on the extravasation injury: patients with extravasation alone as well as patients with both extravasation and bowel injury, for a total of 200 patients.

In [70]:
# Keep only the Active_Extravasation rows, then count the distinct patients
# that appear among them.
Extravasation = data_labels.loc[data_labels["injury_name"] == "Active_Extravasation"]
patients_extravasation = Extravasation["patient_id"].unique().tolist()
len(patients_extravasation)

200

In [71]:
# Renumber the filtered rows from 0; drop=True discards the old row labels
# instead of keeping them as a column.
Extravasation = Extravasation.reset_index(drop=True)
Extravasation

Unnamed: 0,patient_id,series_id,instance_number,injury_name
0,10004,21057,362,Active_Extravasation
1,10004,21057,363,Active_Extravasation
2,10004,21057,364,Active_Extravasation
3,10004,21057,365,Active_Extravasation
4,10004,21057,366,Active_Extravasation
...,...,...,...,...
6365,9632,3750,155,Active_Extravasation
6366,9632,3750,156,Active_Extravasation
6367,9632,3750,157,Active_Extravasation
6368,9632,3750,158,Active_Extravasation


In [92]:
# One row per distinct (patient_id, series_id) combination, renumbered from 0.
unique_pairs = (
    Extravasation[['patient_id', 'series_id']]
    .drop_duplicates()
    .reset_index(drop=True)
)
unique_pairs

Unnamed: 0,patient_id,series_id
0,10004,21057
1,10004,51033
2,10217,16066
3,10292,14945
4,10494,65369
...,...,...
254,820,11921
255,820,38809
256,8263,30011
257,9528,1989


In [94]:
# NOTE(review): the output recorded under this cell is a single value (10004),
# not a column, so it looks stale relative to this expression; re-run to confirm.
unique_pairs["patient_id"]

10004

In [113]:
def extract_injury_instances(data, patient_id, series_id):
    """Return the instance numbers recorded for one patient/series pair.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``patient_id``, ``series_id`` and ``instance_number`` columns.
    patient_id, series_id
        Values to match against the columns of the same name.

    Returns
    -------
    list
        ``instance_number`` values of the matching rows, in row order;
        empty when nothing matches.
    """
    same_patient = data["patient_id"] == patient_id
    same_series = data["series_id"] == series_id
    return data.loc[same_patient & same_series, "instance_number"].tolist()

def image_label_data_cleaned(data, unique_pairs_ps):
    """Collapse image-level labels into one row per (patient_id, series_id) pair.

    Parameters
    ----------
    data : pandas.DataFrame
        Label rows; must provide ``patient_id``, ``series_id`` and
        ``instance_number`` columns.
    unique_pairs_ps : pandas.DataFrame
        The (``patient_id``, ``series_id``) pairs to report, one per row.
        Its index is ignored, so it does not need to be reset beforehand.

    Returns
    -------
    pandas.DataFrame
        Columns ``patient_id``, ``series_id``, ``instances`` (list of the
        ``instance_number`` values found in ``data`` for that pair, in row
        order; empty when the pair has no rows) and ``category`` (always 1).
        Rows follow the order of ``unique_pairs_ps`` on a fresh 0..n-1 index.
    """
    # Group the labels once rather than re-filtering `data` for every pair.
    grouped = data.groupby(["patient_id", "series_id"], sort=False)["instance_number"]
    instances_by_pair = {pair: numbers.tolist() for pair, numbers in grouped}

    # Walk the pairs by value. Looking series_id up with a positional counter
    # (`unique_pairs_ps["series_id"][i]`) is a label lookup and fails, or picks
    # the wrong series, whenever the index is not exactly 0..n-1.
    records = [
        # Copy the list so repeated pairs never share one mutable object.
        (patient_id, series_id, list(instances_by_pair.get((patient_id, series_id), ())), 1)
        for patient_id, series_id in zip(
            unique_pairs_ps["patient_id"], unique_pairs_ps["series_id"]
        )
    ]

    return pd.DataFrame(
        records, columns=["patient_id", "series_id", "instances", "category"]
    )

In [114]:
# Build one row per (patient_id, series_id) pair, each carrying the list of
# its extravasation-labelled instance numbers.
cleaned_data = image_label_data_cleaned(Extravasation, unique_pairs)

In [115]:
# Show the resulting per-series table.
cleaned_data

Unnamed: 0,patient_id,series_id,instances,category
0,10004,21057,"[362, 363, 364, 365, 366, 367, 368, 369, 370, ...",1
1,10004,51033,"[376, 377, 378, 379, 380, 381, 382, 383, 384, ...",1
2,10217,16066,"[256, 257, 258, 259, 260, 261, 262, 263, 264, ...",1
3,10292,14945,"[20, 21, 22, 23, 24, 25, 26]",1
4,10494,65369,"[292, 293, 294, 295, 296, 297, 298, 299, 300, ...",1
...,...,...,...,...
254,820,11921,"[86, 87, 88, 89, 90]",1
255,820,38809,"[205, 206, 207, 208]",1
256,8263,30011,"[120, 121, 123, 124]",1
257,9528,1989,"[207, 208, 209, 210, 211, 212, 213, 214, 215, ...",1
