In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
import random
import re

from tqdm import tqdm
import time

import pydicom as dicom
import nibabel as nib
import SimpleITK as sitk
import monai

import torch
import torch.nn as nn
import torch.optim as optim

from monai.networks.nets import EfficientNetBN
from monai.networks.nets import ResNet
#from efficientnet_pytorch import EfficientNet
import timm

import wandb


In [28]:
SEED = 344
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch RNGs and pin cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to `random`, `numpy.random` and `torch`
            (CPU and every CUDA device).

    Side effects:
        Sets the PYTHONHASHSEED environment variable, changes the global
        cuDNN flags and prints a confirmation line.
    """
    random.seed(seed)
    # PYTHONHASHSEED is only read at interpreter start-up, so this affects
    # child processes rather than the already-running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; silently ignored
    # when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True  # only pick deterministic cuDNN kernels
    # The cuDNN autotuner may select different algorithms from run to run,
    # so it has to be off for results to be repeatable.
    torch.backends.cudnn.benchmark = False
    print('Finish seeding with seed {}'.format(seed))


seed_everything(SEED)
print('Training on device {}'.format(device))

Finish seeding with seed 344
Training on device cuda


In [29]:
# DICOM header fields pulled from the tag parquet files; every other column
# in those files is skipped at read time.
dicom_tag_columns = [
    'Columns', 'ImageOrientationPatient', 'ImagePositionPatient',
    'InstanceNumber', 'PatientID', 'PatientPosition',
    'PixelSpacing', 'RescaleIntercept', 'RescaleSlope',
    'Rows', 'SeriesNumber', 'SliceThickness',
    'path', 'WindowCenter', 'WindowWidth',
]

# Per-slice DICOM tags (train / test).
train_dicom_tags = pd.read_parquet(
    'autodl-tmp/train_dicom_tags.parquet',
    columns=dicom_tag_columns,
)
test_dicom_tags = pd.read_parquet(
    'autodl-tmp/test_dicom_tags.parquet',
    columns=dicom_tag_columns,
)

# Per-series metadata (train / test).
train_series_meta = pd.read_csv('autodl-tmp/train_series_meta.csv')
test_series_meta = pd.read_csv('autodl-tmp/test_series_meta.csv')

# Per-patient label table.
train_csv = pd.read_csv('autodl-tmp/train.csv')

# Show the series metadata as the cell output.
train_series_meta

Unnamed: 0,patient_id,series_id,aortic_hu,incomplete_organ
0,10004,21057,146.00,0
1,10004,51033,454.75,0
2,10005,18667,187.00,0
3,10007,47578,329.00,0
4,10026,29700,327.00,0
...,...,...,...,...
4706,9961,2003,381.00,0
4707,9961,63032,143.75,0
4708,9980,40214,103.00,0
4709,9980,40466,135.00,0


In [30]:
# Attach a `series_num` column to the label table: the number of rows the
# patient has in train_series_meta, minus one (so 0 means a single series,
# and -1 means the patient has no series row at all).
patient_id_list = train_csv.patient_id.values

# Count series per patient once, instead of re-filtering the whole metadata
# frame for every patient.
series_per_patient = train_series_meta.patient_id.value_counts()

series_num = (
    pd.Series(patient_id_list)
    .map(series_per_patient)   # NaN for patients absent from the metadata
    .fillna(0)
    .sub(1)
    .to_numpy()
    .astype(patient_id_list.dtype)
)

# Build the column on train_csv's own index so the concat lines up row for
# row even if train_csv does not carry a default 0..n-1 index.
series_num_pd = pd.DataFrame({"series_num": series_num}, index=train_csv.index)
train_csv_count = pd.concat([train_csv, series_num_pd], axis=1, sort=False)
train_csv_count

Unnamed: 0,patient_id,bowel_healthy,bowel_injury,extravasation_healthy,extravasation_injury,kidney_healthy,kidney_low,kidney_high,liver_healthy,liver_low,liver_high,spleen_healthy,spleen_low,spleen_high,any_injury,series_num
0,10004,1,0,0,1,0,1,0,1,0,0,0,0,1,1,1
1,10005,1,0,1,0,1,0,0,1,0,0,1,0,0,0,0
2,10007,1,0,1,0,1,0,0,1,0,0,1,0,0,0,0
3,10026,1,0,1,0,1,0,0,1,0,0,1,0,0,0,1
4,10051,1,0,1,0,1,0,0,1,0,0,0,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3142,9951,1,0,1,0,1,0,0,1,0,0,1,0,0,0,0
3143,9960,1,0,1,0,1,0,0,1,0,0,1,0,0,0,0
3144,9961,1,0,1,0,1,0,0,1,0,0,1,0,0,0,1
3145,9980,1,0,1,0,1,0,0,1,0,0,0,0,1,1,1


In [46]:
# Row positions where train_series_meta is cut into four consecutive
# validation folds; None leaves the last slice open-ended.
# NOTE(review): the cuts are row positions in a table with one row per
# series, so a patient with several series could end up in two adjacent
# folds — confirm this is acceptable.
fold_bounds = [0, 1178, 2356, 3534, None]

val_df_fold1, val_df_fold2, val_df_fold3, val_df_fold4 = (
    train_series_meta[start:stop].reset_index(drop=True)
    for start, stop in zip(fold_bounds[:-1], fold_bounds[1:])
)

In [47]:
# Print, for each validation fold, how many of its patients carry a bowel
# injury label (one line per fold, in fold order).
for fold_df in (val_df_fold1, val_df_fold2, val_df_fold3, val_df_fold4):
    in_fold = train_csv.patient_id.isin(fold_df.patient_id)
    fold_labels = train_csv_count.loc[in_fold]
    print(np.sum(fold_labels.bowel_injury.values))

17
9
14
24


In [5]:
# print(np.sum(train_csv_count[0:1049].extravasation_injury.values))
# print(np.sum(train_csv_count[1049:2098].extravasation_injury.values))
# print(np.sum(train_csv_count[2098:3147].extravasation_injury.values))

In [6]:
# from sklearn.model_selection import StratifiedKFold
# skf = StratifiedKFold(n_splits=3)

# skf_y = train_csv_count.iloc[:, 1]

# skf_idx = {}

# for i, (train_idx, test_idx) in enumerate(skf.split(train_csv_count, skf_y)):
#     skf_idx[i] = test_idx

# print(np.sum(train_csv_count.iloc[skf_idx[0]].bowel_injury.values))
# print(np.sum(train_csv_count.iloc[skf_idx[1]].bowel_injury.values))
# print(np.sum(train_csv_count.iloc[skf_idx[2]].bowel_injury.values))

# print(np.sum(train_csv_count.iloc[skf_idx[0]].extravasation_injury.values))
# print(np.sum(train_csv_count.iloc[skf_idx[1]].extravasation_injury.values))
# print(np.sum(train_csv_count.iloc[skf_idx[2]].extravasation_injury.values))

# print(np.sum(train_csv_count.iloc[skf_idx[0]].kidney_low.values))
# print(np.sum(train_csv_count.iloc[skf_idx[1]].kidney_low.values))
# print(np.sum(train_csv_count.iloc[skf_idx[2]].kidney_low.values))

# print(np.sum(train_csv_count.iloc[skf_idx[0]].kidney_high.values))
# print(np.sum(train_csv_count.iloc[skf_idx[1]].kidney_high.values))
# print(np.sum(train_csv_count.iloc[skf_idx[2]].kidney_high.values))

# print(np.sum(train_csv_count.iloc[skf_idx[0]].liver_low.values))
# print(np.sum(train_csv_count.iloc[skf_idx[1]].liver_low.values))
# print(np.sum(train_csv_count.iloc[skf_idx[2]].liver_low.values))

# print(np.sum(train_csv_count.iloc[skf_idx[0]].liver_high.values))
# print(np.sum(train_csv_count.iloc[skf_idx[1]].liver_high.values))
# print(np.sum(train_csv_count.iloc[skf_idx[2]].liver_high.values))

# print(np.sum(train_csv_count.iloc[skf_idx[0]].spleen_low.values))
# print(np.sum(train_csv_count.iloc[skf_idx[1]].spleen_low.values))
# print(np.sum(train_csv_count.iloc[skf_idx[2]].spleen_low.values))

# print(np.sum(train_csv_count.iloc[skf_idx[0]].spleen_high.values))
# print(np.sum(train_csv_count.iloc[skf_idx[1]].spleen_high.values))
# print(np.sum(train_csv_count.iloc[skf_idx[2]].spleen_high.values))

# print(np.sum(train_csv_count.iloc[skf_idx[0]].series_num.values))
# print(np.sum(train_csv_count.iloc[skf_idx[1]].series_num.values))
# print(np.sum(train_csv_count.iloc[skf_idx[2]].series_num.values))

In [7]:
# fold1_1phase_pd = (train_csv_count.iloc[skf_idx[0]]).loc[train_csv_count.series_num == 0]
# fold1_2phase_pd = (train_csv_count.iloc[skf_idx[0]]).loc[train_csv_count.series_num == 1]
# print(np.sum(fold1_1phase_pd.bowel_injury.values))
# print(np.sum(fold1_2phase_pd.bowel_injury.values))

# print(np.sum(fold1_1phase_pd.kidney_high.values))
# print(np.sum(fold1_2phase_pd.kidney_high.values))

# print(np.sum(fold1_1phase_pd.liver_high.values))
# print(np.sum(fold1_2phase_pd.liver_high.values))

# print(np.sum(fold1_1phase_pd.spleen_high.values))
# print(np.sum(fold1_2phase_pd.spleen_high.values))

In [8]:
# fold2_1phase_pd = (train_csv_count.iloc[skf_idx[1]]).loc[train_csv_count.series_num == 0]
# fold2_2phase_pd = (train_csv_count.iloc[skf_idx[1]]).loc[train_csv_count.series_num == 1]
# print(np.sum(fold2_1phase_pd.bowel_injury.values))
# print(np.sum(fold2_2phase_pd.bowel_injury.values))

# print(np.sum(fold2_1phase_pd.kidney_high.values))
# print(np.sum(fold2_2phase_pd.kidney_high.values))

# print(np.sum(fold2_1phase_pd.liver_high.values))
# print(np.sum(fold2_2phase_pd.liver_high.values))

# print(np.sum(fold2_1phase_pd.spleen_high.values))
# print(np.sum(fold2_2phase_pd.spleen_high.values))

In [9]:
# fold3_1phase_pd = (train_csv_count.iloc[skf_idx[2]]).loc[train_csv_count.series_num == 0]
# fold3_2phase_pd = (train_csv_count.iloc[skf_idx[2]]).loc[train_csv_count.series_num == 1]
# print(np.sum(fold3_1phase_pd.bowel_injury.values))
# print(np.sum(fold3_2phase_pd.bowel_injury.values))

# print(np.sum(fold3_1phase_pd.kidney_high.values))
# print(np.sum(fold3_2phase_pd.kidney_high.values))

# print(np.sum(fold3_1phase_pd.liver_high.values))
# print(np.sum(fold3_2phase_pd.liver_high.values))

# print(np.sum(fold3_1phase_pd.spleen_high.values))
# print(np.sum(fold3_2phase_pd.spleen_high.values))

In [10]:
# fold1_1phase_pd = (train_csv_count.iloc[skf_idx[0]]).loc[train_csv_count.series_num == 0]
# fold1_1phase_pd = fold1_1phase_pd.reset_index()
# fold1_1phase_pd.to_csv('autodl-tmp/train_3fold/fold1_1phase.csv', index=None)
# fold1_1phase_pd

In [11]:
# fold1_2phase_pd = (train_csv_count.iloc[skf_idx[0]]).loc[train_csv_count.series_num == 1]
# fold1_2phase_pd = fold1_2phase_pd.reset_index()
# fold1_2phase_pd.to_csv('autodl-tmp/train_3fold/fold1_2phase.csv', index=None)
# fold1_2phase_pd

In [12]:
# fold2_1phase_pd = (train_csv_count.iloc[skf_idx[1]]).loc[train_csv_count.series_num == 0]
# fold2_1phase_pd = fold2_1phase_pd.reset_index()
# fold2_1phase_pd.to_csv('autodl-tmp/train_3fold/fold2_1phase.csv', index=None)
# fold2_1phase_pd

In [13]:
# fold2_2phase_pd = (train_csv_count.iloc[skf_idx[1]]).loc[train_csv_count.series_num == 1]
# fold2_2phase_pd = fold2_2phase_pd.reset_index()
# fold2_2phase_pd.to_csv('autodl-tmp/train_3fold/fold2_2phase.csv', index=None)
# fold2_2phase_pd

In [14]:
# fold3_1phase_pd = (train_csv_count.iloc[skf_idx[2]]).loc[train_csv_count.series_num == 0]
# fold3_1phase_pd = fold3_1phase_pd.reset_index()
# fold3_1phase_pd.to_csv('autodl-tmp/train_3fold/fold3_1phase.csv', index=None)
# fold3_1phase_pd

In [15]:
# fold3_2phase_pd = (train_csv_count.iloc[skf_idx[2]]).loc[train_csv_count.series_num == 1]
# fold3_2phase_pd = fold3_2phase_pd.reset_index()
# fold3_2phase_pd.to_csv('autodl-tmp/train_3fold/fold3_2phase.csv', index=None)
# fold3_2phase_pd

In [16]:
# Load the six pre-split tables from autodl-tmp/train_3fold: three folds for
# the "1phase" files followed by three folds for the "2phase" files.
(
    fold1_1phase_csv,
    fold2_1phase_csv,
    fold3_1phase_csv,
    fold1_2phase_csv,
    fold2_2phase_csv,
    fold3_2phase_csv,
) = map(
    pd.read_csv,
    [
        'autodl-tmp/train_3fold/fold1_1phase.csv',
        'autodl-tmp/train_3fold/fold2_1phase.csv',
        'autodl-tmp/train_3fold/fold3_1phase.csv',
        'autodl-tmp/train_3fold/fold1_2phase.csv',
        'autodl-tmp/train_3fold/fold2_2phase.csv',
        'autodl-tmp/train_3fold/fold3_2phase.csv',
    ],
)

In [26]:
# Pick the patient stored at row label 20 of fold1_2phase_csv.
picked_row = fold1_2phase_csv.loc[20, ["patient_id"]]
patient_id = picked_row.values[0].astype('int')

# Take the first metadata row for that patient; the integer cast also drops
# the fractional part of aortic_hu.
patient_rows = train_series_meta.patient_id == patient_id
first_series = train_series_meta.loc[patient_rows, ["series_id", 'aortic_hu']].values[0]
series_id, hu = first_series.astype('int')

patient_id, series_id, hu

(10722, 46573, 371)