In [1]:
import os
from PIL import Image
import csv
import pandas as pd
import json
from tqdm.notebook import tqdm as tq
import torch
from torch.utils import data
import numpy as np

In [2]:
# Load the label map, the dataset split and the two exclusion lists from JSON.
# NOTE: all four files are written by cells further down in this notebook
# (cxr8_labels.json, data_split.json, problem_files.json, no_finding.json),
# so those cells must have been run at least once before this one can succeed.

img_path = '/home/rakshith/Datasets/CXR8/images/images/'


def _read_json(json_path):
    """Return the parsed contents of the JSON file at `json_path`."""
    with open(json_path) as json_handle:
        return json.load(json_handle)


# image file name -> list of finding labels
label_json_file = _read_json('cxr8_labels.json')
# {'train': [...], 'val': [...], 'test': [...]} lists of image file names
data_split_file = _read_json('data_split.json')
# image file names to exclude from training
problem_images = _read_json('problem_files.json')
# image file names whose first label is 'No Finding' (capped subset)
no_findings = _read_json('no_finding.json')

In [3]:
# Data Loader


class ChestDataLoader(data.Dataset):
    """Dataset yielding one 14-element multi-hot label vector per file name.

    Args:
        label_json: mapping from file name to an iterable of label strings.
        name_list: sequence of file names; defines the length and indexing
            of the dataset.
        is_transform: stored on the instance; not used anywhere in this class.

    Indexing returns only the label tensor; no image is loaded.
    """

    # Position of every recognised finding inside the output vector.
    _LABEL_ORDER = (
        'Atelectasis', 'Cardiomegaly', 'Effusion', 'Infiltration',
        'Mass', 'Nodule', 'Pneumonia', 'Pneumothorax',
        'Consolidation', 'Edema', 'Emphysema', 'Fibrosis',
        'Pleural_Thickening', 'Hernia',
    )

    def __init__(self, label_json, name_list, is_transform=True):
        self.label_json = label_json
        self.files = name_list
        self.is_transform = is_transform

    def __len__(self):
        """Number of file names in the dataset."""
        return len(self.files)

    def __getitem__(self, index):
        """Return the float multi-hot label tensor for the file at `index`.

        Labels that are not one of the 14 recognised findings leave the
        vector untouched and are reported on stdout.
        """
        filename = self.files[index]
        multi_hot = [0] * len(self._LABEL_ORDER)

        for label in self.label_json[filename]:
            try:
                position = self._LABEL_ORDER.index(label)
            except ValueError:
                # Unknown label: report it, contribute nothing to the vector.
                print(f'Filename:{filename}| Label:{label}')
            else:
                multi_hot[position] = 1

        return torch.FloatTensor(np.array(multi_hot))

# Build the training file list: every file of the 'train' split that is
# neither a known problem image nor one of the held-out 'No Finding' samples.
# Both exclusion lists come from JSON as plain lists, so testing membership
# against them directly is a linear scan per file (quadratic overall); a
# single set makes each lookup O(1) and yields the same list in the same order.
excluded_files = set(problem_images) | set(no_findings)
train_list = [file for file in data_split_file['train'] if file not in excluded_files]

# Label-only dataset over the filtered training files.
trainDset = ChestDataLoader(label_json=label_json_file,
                            name_list=train_list,
                            is_transform=True)

# Shuffled loader delivering batches of 1000 label vectors.
trainDataLoader = data.DataLoader(
    trainDset, batch_size=1000, shuffle=True,
    num_workers=10, pin_memory=True)


In [5]:
# Sanity check: print every training file that still carries 'No Finding'.
# Each entry of label_json_file is a *list* of label strings (it is built by
# splitting the CSV 'Finding Labels' column), so this has to be a membership
# test; comparing the list itself to a string is always False and made the
# check silently report nothing.
for file_name in tq(train_list):
    label = label_json_file[file_name]
    if 'No Finding' in label:
        print(file_name)

  0%|          | 0/59907 [00:00<?, ?it/s]

In [None]:
# Count how often each of the 14 classes occurs in the first training batches.
# Falls back to the CPU when no GPU is present instead of failing on .cuda().
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
class_sum = torch.zeros(14, device=device)
for i, label in enumerate(tq(trainDataLoader)):
    # ChestDataLoader.__getitem__ returns only the label tensor, so a batch is
    # one tensor rather than an (image, label) pair; the previous
    # `img, label = data_sample` tried to unpack that tensor along its batch
    # dimension and failed.
    label = label.to(device)
    # Sum over the batch dimension -> per-class positive counts.
    class_sum += torch.sum(label, dim=0)
    # Only a sample of the data is inspected: stop after batch index 26.
    if i > 25:
        break
print(class_sum)

In [None]:
# Collect the names of up to 10000 images whose first label is 'No Finding'
# (in label_json_file key order) and save them to no_finding.json.
key_list = list(label_json_file.keys())

no_finding_names = [
    name for name in key_list
    if label_json_file[name][0] == 'No Finding'
]
# Keep only the first 10000 matches.
label_list = no_finding_names[:10000]

with open('no_finding.json', 'w') as out_file:
    json.dump(label_list, out_file)

In [None]:
# Compute the mean and standard deviation over every pixel value of every
# image in image_dir, and record which images have 4 channels.
from torchvision import transforms
import math

image_dir = '/home/rakshith/Datasets/CXR8/images/images/'
images_list = os.listdir(image_dir)
# Names of 4-channel images found during the scan. This is the list that the
# later cell writes to problem_files.json, so it has to be filled here;
# previously the detection was commented out and the list stayed empty.
problem_images = []

to_tensor = transforms.ToTensor()  # built once instead of once per image
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Accumulate in float64: float32 running sums over this many values lose
# precision long before the last image is reached.
pixel_sum = torch.zeros((), dtype=torch.float64, device=device)
pixel_sq_sum = torch.zeros((), dtype=torch.float64, device=device)
# Count the values actually seen rather than assuming 1024*1024 per image,
# which is wrong for any image with more than one channel or another size.
pixel_count = 0

# Single pass: sum(x) and sum(x^2) are enough for both statistics, so each
# image is decoded once instead of twice.
for image_name in tq(images_list):
    with Image.open(os.path.join(image_dir, image_name)) as image_file:
        image_sample = to_tensor(image_file).to(device)
    if image_sample.shape[0] == 4:
        problem_images.append(image_name)
    image_sample = image_sample.double()
    pixel_sum += image_sample.sum()
    pixel_sq_sum += (image_sample ** 2).sum()
    pixel_count += image_sample.numel()

if pixel_count == 0:
    raise ValueError(f'No pixel data found in {image_dir}')

mean = (pixel_sum / pixel_count).item()
# Var[x] = E[x^2] - E[x]^2; clamp at 0 to guard against tiny negative
# round-off before taking the square root.
std = math.sqrt(max((pixel_sq_sum / pixel_count).item() - mean ** 2, 0.0))

print('Mean:', mean)
print('SD:', std)

In [None]:
# Build labels_dict: image file name -> list of finding labels, taken from the
# 'Image Index' and 'Finding Labels' columns of the metadata CSV.
csv_file = pd.read_csv('/home/rakshith/Datasets/CXR8/Data_Entry_2017_v2020.csv')

labels_dict = {}

name_label_pairs = zip(csv_file['Image Index'], csv_file['Finding Labels'])
for image_name, label in tq(name_label_pairs, total=len(csv_file)):
    # Multiple findings are '|'-separated; split() already returns a
    # one-element list when there is no separator.
    labels_dict[image_name] = label.split('|')


In [None]:
# Save the current problem_images list so the loading cell can read it back.
with open('problem_files.json', 'w') as problem_out:
    json.dump(problem_images, problem_out)

In [None]:
# Save the file-name -> labels mapping so the loading cell can read it back.
with open('cxr8_labels.json', 'w') as labels_out:
    json.dump(labels_dict, labels_out)

In [None]:

# Read the official train/val and test file lists and cut the train/val list
# 80/20 (in file order) into train_list and val_list.
with open('/home/rakshith/Datasets/CXR8/train_val_list.txt') as f:
    lines = f.readlines()

with open('/home/rakshith/Datasets/CXR8/test_list.txt') as f2:
    lines2 = f2.readlines()

# strip() instead of line[0:-1]: dropping the last character unconditionally
# corrupts the final file name when the file has no trailing newline and
# leaves a stray '\r' on CRLF files. Blank lines are skipped.
train_val_names = [line.strip() for line in lines if line.strip()]
test_list = [line.strip() for line in lines2 if line.strip()]

# First 80% of the train/val names go to training, the remainder to validation.
split_at = int(0.8 * len(train_val_names))
train_list = train_val_names[:split_at]
val_list = train_val_names[split_at:]

print(len(train_list))
print(len(val_list))
print(len(test_list))


In [None]:
import json

# Bundle the three file-name lists under their split names and save them to
# data_split.json so the loading cell can read them back.
split_dict = dict(train=train_list, val=val_list, test=test_list)

with open('data_split.json', 'w') as split_out:
    json.dump(split_dict, split_out)