In [1]:
import tarfile
import pandas as pd
import os
from shutil import copyfile
from torchvision import datasets, transforms

In [2]:
# Root folder that holds all project data. The default is the location used
# when the notebook was written; set the EE5610_DATA_DIR environment variable
# to run on another machine without editing every path.
DATA_DIR = os.environ.get(
    'EE5610_DATA_DIR',
    'C:/Users/rjuya/OneDrive/Desktop/github stuff/EE5610/Project/Data',
)
# Folder containing the metadata CSV and the split list files.
CXR8_DIR = f'{DATA_DIR}/CXR8/CXR8'

# Read the CSV file (its 'Image Index' and 'Finding Labels' columns are used
# later to map each image file name to a label).
csv_file_path = f'{CXR8_DIR}/Data_Entry_2017_v2020.csv'
csv_data = pd.read_csv(csv_file_path)

# Paths of the text files listing the train/val and test image names.
# They are only defined here; the files are read in a later cell.
train_indices_file = f'{CXR8_DIR}/train_val_list.txt'
test_indices_file = f'{CXR8_DIR}/test_list.txt'

In [3]:
def _read_lines(list_path):
    """Return the lines of the text file at ``list_path`` without line endings."""
    with open(list_path, 'r') as list_handle:
        return list_handle.read().splitlines()


# One list entry per line of each split file; train is read first, then test.
train_indices = _read_lines(train_indices_file)
test_indices = _read_lines(test_indices_file)

In [4]:
def extract_first_class(label):
    """Return the part of ``label`` that precedes the first '|' separator.

    Parameters
    ----------
    label : str
        One or more names joined by '|', e.g. 'A|B|C'.

    Returns
    -------
    str
        The text before the first '|', or the whole string when it contains
        no '|'. An empty string is returned for an empty input or when the
        string starts with '|'.
    """
    first_class, _separator, _remaining = label.partition('|')
    return first_class

In [5]:
# Mapping image filenames to their labels from the CSV file.
# Only two columns are needed, so walk them in parallel instead of building a
# Series for every row with iterrows(), which is slow on large frames.
# If a file name occurs more than once, the last row wins (same as a plain
# dict assignment in a loop).
image_label_map = {
    image_index: extract_first_class(finding_label)
    for image_index, finding_label in zip(
        csv_data['Image Index'], csv_data['Finding Labels']
    )
}

In [6]:
def extract_images_from_tar(tar_filenames, indices):
    """Collect the raw bytes of selected files from one or more tar archives.

    Parameters
    ----------
    tar_filenames : iterable of str
        Paths of the archives to scan, processed in the given order. They are
        opened with mode 'r', so compression is detected by ``tarfile``.
    indices : sized collection of str
        Base file names (no directory part) of the files to extract.

    Returns
    -------
    dict
        Maps base file name -> file content as ``bytes``. A member is included
        only if it is a regular file, its base name is in ``indices`` and its
        archive path contains 'images/'. If the same base name is found more
        than once, the last occurrence wins.

    Notes
    -----
    All extracted content is kept in memory in the returned dict, so the
    caller needs enough RAM for every selected file.
    Progress is printed each time at least 5 more percent of ``len(indices)``
    files have been extracted.
    """
    total_images = len(indices)
    # Membership is tested once per archive member; a set makes each test O(1)
    # instead of a linear scan through the list of names.
    wanted_names = set(indices)
    extracted_images = {}
    images_extracted = 0
    percent = 0

    for tar_filename in tar_filenames:
        with tarfile.open(tar_filename, 'r') as tar:
            # Iterate lazily: getmembers() would read the whole (compressed)
            # archive to build the member list before anything is extracted.
            for member in tar:
                if not member.isfile():
                    continue

                image_name = member.name.split('/')[-1]
                if image_name not in wanted_names:
                    continue
                # Considering 'images' as the folder name within tar
                if 'images/' not in member.name:
                    continue

                # Close the member's file object as soon as it has been read.
                with tar.extractfile(member) as img:
                    extracted_images[image_name] = img.read()
                images_extracted += 1

                # Calculate percentage completion and display progress every 5%
                current_percent = int((images_extracted / total_images) * 100)
                if current_percent >= percent + 5:
                    percent = current_percent
                    print(f"{percent}% of images extracted")

    return extracted_images

In [10]:
# Root folder that holds all project data. The default is the location used
# when the notebook was written; set the EE5610_DATA_DIR environment variable
# to run on another machine without editing every path.
DATA_DIR = os.environ.get(
    'EE5610_DATA_DIR',
    'C:/Users/rjuya/OneDrive/Desktop/github stuff/EE5610/Project/Data',
)

# List of paths to all the tar files: images_001.tar.gz ... images_012.tar.gz
NUM_IMAGE_ARCHIVES = 12
tar_files = [
    f'{DATA_DIR}/CXR8/CXR8/images/images_{archive_number:03d}.tar.gz'
    for archive_number in range(1, NUM_IMAGE_ARCHIVES + 1)
]

# Create train and test folders to organize extracted images.
# Done before the extraction so a bad output path fails immediately instead
# of after the archives have been processed.
train_folder = f'{DATA_DIR}/dataset_2/train'
test_folder = f'{DATA_DIR}/dataset_2/test'

os.makedirs(train_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)

# Extract train and test images from all tar files.
# NOTE(review): each call walks every archive, so the archives are read twice,
# and both returned dicts keep the raw image bytes in memory until they are
# written to disk in the following cells.
train_images = extract_images_from_tar(tar_files, train_indices)
test_images = extract_images_from_tar(tar_files, test_indices)

5% of images extracted
10% of images extracted
15% of images extracted
20% of images extracted
25% of images extracted
30% of images extracted
35% of images extracted
40% of images extracted
45% of images extracted
50% of images extracted
55% of images extracted
60% of images extracted
65% of images extracted
70% of images extracted
75% of images extracted
80% of images extracted
85% of images extracted
90% of images extracted
95% of images extracted
100% of images extracted
5% of images extracted
10% of images extracted
15% of images extracted
20% of images extracted
25% of images extracted
30% of images extracted
35% of images extracted
40% of images extracted
45% of images extracted
50% of images extracted
55% of images extracted
60% of images extracted
65% of images extracted
70% of images extracted
75% of images extracted
80% of images extracted
85% of images extracted
90% of images extracted
95% of images extracted
100% of images extracted


In [None]:
# Save each extracted training image under train_folder/<label>/<file name>,
# creating the label folder the first time it is needed.
for image_name, image_bytes in train_images.items():
    label_dir = os.path.join(train_folder, image_label_map[image_name])
    os.makedirs(label_dir, exist_ok=True)
    image_path = os.path.join(label_dir, image_name)
    with open(image_path, 'wb') as image_file:
        image_file.write(image_bytes)

In [11]:
# Save each extracted test image under test_folder/<label>/<file name>,
# creating the label folder the first time it is needed.
for image_name, image_bytes in test_images.items():
    label_dir = os.path.join(test_folder, image_label_map[image_name])
    os.makedirs(label_dir, exist_ok=True)
    image_path = os.path.join(label_dir, image_name)
    with open(image_path, 'wb') as image_file:
        image_file.write(image_bytes)

In [None]:

# Preprocessing applied to every image when it is loaded from disk
transform = transforms.Compose([
    transforms.Resize((256, 256)),  # bring every image to a fixed 256x256 size
    transforms.ToTensor(),          # convert the loaded image to a tensor
    # Add more transformations as needed
])

# Create ImageFolder datasets for train and test; each sub-folder of the root
# is treated as one class.
train_dataset = datasets.ImageFolder(root=train_folder, transform=transform)
test_dataset = datasets.ImageFolder(root=test_folder, transform=transform)

# ImageFolder assigns class indices separately for each root, based on the
# sub-folders found there. If a label is missing from one split, the same
# integer would stand for different classes in train and test, so stop here
# rather than train/evaluate with mismatched labels.
assert train_dataset.class_to_idx == test_dataset.class_to_idx, (
    'train and test folders do not contain the same class sub-folders; '
    f'classes present in only one split: '
    f'{sorted(set(train_dataset.classes) ^ set(test_dataset.classes))}'
)


In [None]:
# Sanity check: look at the first training sample and show the shape of its
# image part (element 0 of the sample).
first_sample = train_dataset[0]
first_sample[0].shape

torch.Size([3, 256, 256])