# Processing Data Entry csv file

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import time
import random
import pandas as pd
from collections import defaultdict

In [None]:
csv_file_path = '/content/drive/My Drive/MLOPs Project/Data_Entry.csv'
csv_data = pd.read_csv(csv_file_path)

In [None]:
label_freq = defaultdict(int)
images_per_label = defaultdict(list)

for _, row in csv_data.iterrows():
    image_index = row['Image Index']
    labels = row['Finding Labels'].split('|')

    for label in labels:
        label = label.strip()
        label_freq[label] += 1
        images_per_label[label].append(image_index)

print("Label Frequencies:")
for label, freq in label_freq.items():
    print(f"{label}: {freq}")

label_freq_df = pd.DataFrame(list(label_freq.items()), columns=['Label', 'Frequency'])
label_freq_df = label_freq_df.sort_values(by='Frequency')
print(label_freq_df)

Label Frequencies:
Cardiomegaly: 2776
Emphysema: 2516
Effusion: 13317
No Finding: 60361
Hernia: 227
Infiltration: 19894
Mass: 5782
Nodule: 6331
Atelectasis: 11559
Pneumothorax: 5302
Pleural_Thickening: 3385
Pneumonia: 1431
Fibrosis: 1686
Edema: 2303
Consolidation: 4667
                 Label  Frequency
4               Hernia        227
11           Pneumonia       1431
12            Fibrosis       1686
13               Edema       2303
1            Emphysema       2516
0         Cardiomegaly       2776
10  Pleural_Thickening       3385
14       Consolidation       4667
9         Pneumothorax       5302
6                 Mass       5782
7               Nodule       6331
8          Atelectasis      11559
2             Effusion      13317
5         Infiltration      19894
3           No Finding      60361


In [None]:
sorted_labels = sorted(label_freq, key=label_freq.get)
print("Labels sorted by frequency:", sorted_labels)

Labels sorted by frequency: ['Hernia', 'Pneumonia', 'Fibrosis', 'Edema', 'Emphysema', 'Cardiomegaly', 'Pleural_Thickening', 'Consolidation', 'Pneumothorax', 'Mass', 'Nodule', 'Atelectasis', 'Effusion', 'Infiltration', 'No Finding']


In [None]:
target_sample_size = 900

selected_images = set()
selected_images_per_label = defaultdict(list)
label_count = defaultdict(int)

start= time.time()

for label in sorted_labels:
    images = images_per_label[label]
    required_samples = min(target_sample_size - label_count[label], len(images))

    if required_samples > 0:
        sampled_images = random.sample(images, required_samples)

        selected_images.update(sampled_images)
        selected_images_per_label[label].extend(sampled_images)

        for sampled_image in sampled_images: # Updating the label count for the other classes
            image_labels = csv_data[csv_data['Image Index'] == sampled_image]['Finding Labels'].values[0].split('|')
            for image_label in image_labels:
                image_label = image_label.strip()
                label_count[image_label] += 1

end = time.time()
print(f"Time taken for sampling through the csv file: {end - start} seconds")

print("Selected Images Count:")
for label, count in label_count.items():
    print(f"{label}: {count}")

print(f"Total selected images: {len(selected_images)}")

Time taken for sampling through the csv file: 98.20579957962036 seconds
Selected Images Count:
Hernia: 244
Atelectasis: 1056
Pneumothorax: 958
Nodule: 900
Consolidation: 1001
Infiltration: 1755
Pneumonia: 1071
Pleural_Thickening: 1035
Mass: 957
Effusion: 1480
Emphysema: 1046
Cardiomegaly: 960
Fibrosis: 999
Edema: 989
No Finding: 900
Total selected images: 7501


In [None]:
"""
In the previous step, the duplicates are already handled. However, let's ensure the counts and selected images per label are consistent.
I can actually change the target size if I want. Let's see how much time it takes for everything. Don't forget to a timer thingy to the
above step. For now, manually write it.

"""

selected_images_list = list(selected_images)

#final dictionary of selected images per label without duplicates
final_selected_images_per_label = defaultdict(list)
final_label_count = defaultdict(int)

for image in selected_images_list:
    image_labels = csv_data[csv_data['Image Index'] == image]['Finding Labels'].values[0].split('|')
    for image_label in image_labels:
        image_label = image_label.strip()
        final_selected_images_per_label[image_label].append(image)
        final_label_count[image_label] += 1

print("Final Selected Images Count:")
for label, count in final_label_count.items():
    print(f"{label}: {count}")

print(f"Total final selected images: {len(selected_images_list)}")


Final Selected Images Count:
No Finding: 900
Pneumonia: 962
Consolidation: 928
Fibrosis: 938
Infiltration: 1660
Mass: 896
Emphysema: 977
Pneumothorax: 897
Edema: 893
Atelectasis: 1001
Nodule: 852
Cardiomegaly: 907
Pleural_Thickening: 953
Hernia: 227
Effusion: 1401
Total final selected images: 7501


In [None]:
#adjusting sampling for the oversampled ones.

target_count_revised = 1000

filtered_df = pd.DataFrame([(img, label) for label, images in final_selected_images_per_label.items() for img in images], columns=['Image Index', 'Label'])

target_count_revised = 900
oversampled_labels = {'Infiltration', 'Effusion'}

final_selected_images = set(selected_images)
label_counts = defaultdict(int, label_count)

for label in oversampled_labels:
    images_with_label = filtered_df[
        filtered_df['Label'].str.contains(label) &
        filtered_df['Label'].apply(lambda x: set(x.split('|')).issubset(oversampled_labels))
    ]['Image Index'].unique()

    for image_id in images_with_label:
        if label_counts[label] <= target_count_revised:
            break

        final_selected_images.discard(image_id)
        label_counts[label] -= 1

print("Final Selected Images Count:")
for label, count in label_counts.items():
    print(f"{label}: {count}")

print(f"Total selected images: {len(final_selected_images)}")


final_selected_indices_path = '/content/drive/My Drive/MLOPs Project/final_selected_indices.txt'
with open(final_selected_indices_path, 'w') as f:
    for index in final_selected_images:
        f.write(f"{index}\n")

print(f"Final selected indices saved to {final_selected_indices_path}")


Final Selected Images Count:
Hernia: 244
Atelectasis: 1056
Pneumothorax: 958
Nodule: 900
Consolidation: 1001
Infiltration: 900
Pneumonia: 1071
Pleural_Thickening: 1035
Mass: 957
Effusion: 900
Emphysema: 1046
Cardiomegaly: 960
Fibrosis: 999
Edema: 989
No Finding: 900
Total selected images: 6241
Final selected indices saved to /content/drive/My Drive/MLOPs Project/final_selected_indices.txt


# nihc Images actual sampling

In [None]:
import os
import shutil
import time
import urllib.request
import tarfile

In [None]:
def download_and_extract(zip_link, zip_file_name, extraction_folder, log_file):
    log_message = f'Downloading {zip_file_name}...\n'
    with open(log_file, 'a') as log:
        log.write(log_message)

    urllib.request.urlretrieve(zip_link, zip_file_name)

    log_message = 'Download complete.\n'
    with open(log_file, 'a') as log:
        log.write(log_message)

    log_message = f'Extracting {zip_file_name}...\n'
    with open(log_file, 'a') as log:
        log.write(log_message)

    os.makedirs(extraction_folder, exist_ok=True)
    with tarfile.open(zip_file_name, 'r:gz') as tar:
        tar.extractall(extraction_folder)

    log_message = 'Extraction complete.\n'
    with open(log_file, 'a') as log:
        log.write(log_message)


def load_selected_indices(file_path):
    with open(file_path, 'r') as f:
        return set(line.strip() for line in f)


def save_matching_images(extracted_folder, selected_indices, save_folder, log_file):
    os.makedirs(save_folder, exist_ok=True)
    saved_count = 0

    for image_id in selected_indices:
        image_name = image_id #already saved in .png format
        extra= f"images/{image_id}"
        image_path = os.path.join(extracted_folder, extra)

        if os.path.exists(image_path):
          shutil.copy(image_path, os.path.join(save_folder, image_name))
          saved_count += 1
          log_message = f'Saved image: {image_name}\n'
          with open(log_file, 'a') as log:
              log.write(log_message)

    log_message = f'Total images saved: {saved_count}\n'
    with open(log_file, 'a') as log:
        log.write(log_message)

    return saved_count


def delete_extracted_images(extraction_folder, zip_file_path, log_file):
    shutil.rmtree(extraction_folder)  # Removing the entire extracted folder - saving space. I ran out of memory while trying to download from hugging face
    log_message = f'Extracted images deleted from {extraction_folder}\n'

    if os.path.exists(zip_file_path):
        os.remove(zip_file_path)
        log_message += f'Deleted zip file: {zip_file_path}\n'
    else:
        log_message += f'Zip file not found: {zip_file_path}\n'

    with open(log_file, 'a') as log:
        log.write(log_message)


def get_directory_size(directory):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(directory):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total_size += os.path.getsize(fp)
    return total_size

In [None]:
drive_save_folder = '/content/drive/My Drive/MLOPs Project/sampled_data' #folder where the final sampled data goes

#these are mentioned in the nihc images folder in a file on how to access batches
"""zip_links = [
    'https://nihcc.box.com/shared/static/vfk49d74nhbxq3nqjg0900w5nvkorp5c.gz']"""

zip_links = [
    'https://nihcc.box.com/shared/static/vfk49d74nhbxq3nqjg0900w5nvkorp5c.gz',
    'https://nihcc.box.com/shared/static/i28rlmbvmfjbl8p2n3ril0pptcmcu9d1.gz',
    'https://nihcc.box.com/shared/static/f1t00wrtdk94satdfb9olcolqx20z2jp.gz',
	'https://nihcc.box.com/shared/static/0aowwzs5lhjrceb3qp67ahp0rd1l1etg.gz',
    'https://nihcc.box.com/shared/static/v5e3goj22zr6h8tzualxfsqlqaygfbsn.gz',
	'https://nihcc.box.com/shared/static/asi7ikud9jwnkrnkj99jnpfkjdes7l6l.gz',
	'https://nihcc.box.com/shared/static/jn1b4mw4n6lnh74ovmcjb8y48h8xj07n.gz',
    'https://nihcc.box.com/shared/static/tvpxmn7qyrgl0w8wfh9kqfjskv6nmm1j.gz',
	'https://nihcc.box.com/shared/static/upyy3ml7qdumlgk2rfcvlb9k6gvqq2pj.gz',
	'https://nihcc.box.com/shared/static/l6nilvfa9cg3s28tqv1qc1olm3gnz54p.gz',
	'https://nihcc.box.com/shared/static/hhq8fkdgvcari67vfhs7ppg2w6ni4jze.gz',
	'https://nihcc.box.com/shared/static/ioqwiy20ihqwyr8pf4c24eazhh281pbu.gz'
]


selected_indices_file = '/content/drive/My Drive/MLOPs Project/final_selected_indices.txt'
selected_indices = load_selected_indices(selected_indices_file)


log_file_path = '/content/drive/My Drive/MLOPs Project/image_sampling_log.txt'

In [None]:
#os.makedirs(log_file_path, exist_ok=True) -- not required coz this is for creating a directory if it doesn't exist. Not for creating files.
#by using with open like below, it automatically creates the log text file if it's not already present.

with open(log_file_path, 'w') as log_file:
    log_file.write('Image Processing Log\n')
    log_file.write('====================\n')


for idx, zip_link in enumerate(zip_links):
    zip_file_name = f'images_{idx + 1:02d}.tar.gz'
    extraction_folder = f'/content/extracted_images_{idx + 1:02d}'

    start_time = time.time()

    download_and_extract(zip_link, zip_file_name, extraction_folder, log_file_path)
    saved_count = save_matching_images(extraction_folder, selected_indices, drive_save_folder, log_file_path)

    colab_used_space = get_directory_size('/content')
    drive_used_space = get_directory_size(drive_save_folder)

    log_message = f'Total space used in Colab: {colab_used_space / (1024 ** 2):.2f} MB\n'
    log_message += f'Total space used in Drive save folder: {drive_used_space / (1024 ** 2):.2f} MB\n'

    with open(log_file_path, 'a') as log:
        log.write(log_message)

    delete_extracted_images(extraction_folder, zip_file_name, log_file_path)

    end_time = time.time()
    elapsed_time = end_time - start_time
    log_message = f'Time taken to process {zip_file_name}: {elapsed_time:.2f} seconds\n'
    log_message += '====================\n\n'

    with open(log_file_path, 'a') as log:
        log.write(log_message)

    selected_indices = {img_id for img_id in selected_indices if f"{img_id}.png" not in os.listdir(drive_save_folder)}


final_count_message = f'Final count of images saved: {len(os.listdir(drive_save_folder))}\n'
with open(log_file_path, 'a') as log:
    log.write(final_count_message)


In [None]:
#12th file download code, as it was doing an infinite loop previously.
import os
import urllib.request
import tarfile
import shutil

selected_indices_path = '/content/drive/My Drive/MLOPs Project/final_selected_indices.txt'
save_folder = '/content/drive/My Drive/MLOPs Project/sampled_data'
os.makedirs(save_folder, exist_ok=True)

with open(selected_indices_path, 'r') as file:
    selected_indices = set(line.strip() for line in file)

last_url = 'https://nihcc.box.com/shared/static/ioqwiy20ihqwyr8pf4c24eazhh281pbu.gz'
zip_file_name = 'images_12.tar.gz'

print(f'Downloading {zip_file_name}...')
urllib.request.urlretrieve(last_url, zip_file_name)

print(f'Extracting {zip_file_name}...')
with tarfile.open(zip_file_name, 'r:gz') as tar:
    tar.extractall('/content/temp_images')


extracted_folder = '/content/temp_images/images'
for image_name in os.listdir(extracted_folder):
    if image_name in selected_indices:
        shutil.copy(os.path.join(extracted_folder, image_name), os.path.join(save_folder, image_name))
        selected_indices.remove(image_name)


shutil.rmtree('/content/temp_images')
os.remove(zip_file_name)

print(f'Finished processing {zip_file_name}. Remaining images to save: {len(selected_indices)}')
print("Process complete.")


Downloading images_12.tar.gz...
Extracting images_12.tar.gz...
Finished processing images_12.tar.gz. Remaining images to save: 5852
Process complete.


In [None]:
#debugging things

def list_extracted_files(extracted_folder):
  print("Available files in extracted folder:")
  extracted_files = os.listdir(extracted_folder)

  i= 0
  for filename in extracted_files:
    file_path = os.path.join(extracted_folder, filename)
    i += 1
    if i < 5:
      print("filename:", filename)
      print(file_path)


    if filename == "00000782_000.png":
        print(f"Found specific file: {file_path}")


list_extracted_files('/content/extracted_images_01/images')


Available files in extracted folder:
filename: 00000137_001.png
/content/extracted_images_01/images/00000137_001.png
filename: 00001029_016.png
/content/extracted_images_01/images/00001029_016.png
filename: 00001187_005.png
/content/extracted_images_01/images/00001187_005.png
filename: 00001092_000.png
/content/extracted_images_01/images/00001092_000.png
Found specific file: /content/extracted_images_01/images/00000782_000.png


# Final csv with sampeled data labels

In [None]:
import os
import csv

In [None]:
saved_folder= '/content/drive/My Drive/MLOPs Project/sampled_data'
saved_images_list= os.listdir(saved_folder)
saved_images_list.sort()

csv_file_path = '/content/drive/My Drive/MLOPs Project/Data_Entry.csv'
csv_data = pd.read_csv(csv_file_path)

In [None]:
all_images= csv_data['Image Index'].to_list()
all_labels= csv_data['Finding Labels'].to_list()

one_hot_encoding= {
    'No Finding': 0,
    'Atelectasis': 1,
    'Cardiomegaly': 2,
    'Effusion': 3,
    'Infiltration': 4,
    'Mass': 5,
    'Nodule': 6,
    'Pneumonia': 7,
    'Pneumothorax': 8,
    'Consolidation': 9,
    'Edema': 10,
    'Emphysema': 11,
    'Fibrosis': 12,
    'Pleural_Thickening': 13,
    'Hernia': 14
}

saved_labels_list= []
i1= 0
for i in saved_images_list:
  if i in all_images:
    i1+=1
    labels= all_labels[all_images.index(i)].strip().split('|')
    encoded_labels= [one_hot_encoding[i] for i in labels]
    saved_labels_list.append(encoded_labels)
print("mathced images:", i1)

rows= zip(saved_images_list, saved_labels_list)
new_csv_path= '/content/drive/My Drive/MLOPs Project/sampled_data_entry.csv'

with open(new_csv_path, 'w') as infile:
  writer= csv.writer(infile)
  writer.writerow(["Image Index", "Labels"])
  writer.writerows(rows)



In [None]:
all_images = csv_data['Image Index'].to_list()
all_labels = csv_data['Finding Labels'].to_list()
all_genders = csv_data['Patient Gender'].to_list()
all_ages = csv_data['Patient Age'].to_list()


one_hot_encoding = {
    'No Finding': 0,
    'Atelectasis': 1,
    'Cardiomegaly': 2,
    'Effusion': 3,
    'Infiltration': 4,
    'Mass': 5,
    'Nodule': 6,
    'Pneumonia': 7,
    'Pneumothorax': 8,
    'Consolidation': 9,
    'Edema': 10,
    'Emphysema': 11,
    'Fibrosis': 12,
    'Pleural_Thickening': 13,
    'Hernia': 14
}

saved_data_list = []
i1 = 0


for i in saved_images_list:
    if i in all_images:
        i1 += 1
        index = all_images.index(i)
        labels = all_labels[index].strip().split('|')
        gender = all_genders[index]
        age = all_ages[index]

        encoded_labels = [one_hot_encoding[label] for label in labels]

        one_hot_vector = [0] * len(one_hot_encoding)
        for label in encoded_labels:
            one_hot_vector[label] = 1

        saved_data_list.append([i, labels, gender, age, one_hot_vector])

print("Matched images:", i1)

new_csv_path = '/content/drive/My Drive/MLOPs Project/sampled_data_entry.csv'

with open(new_csv_path, 'w', newline='') as infile:
    writer = csv.writer(infile)
    writer.writerow(["Image Index", "Labels", "Gender", "Age", "One-Hot Encoding"])
    writer.writerows(saved_data_list)

print(f"CSV file saved at {new_csv_path}")


Matched images: 6241
CSV file saved at /content/drive/My Drive/MLOPs Project/sampled_data_entry.csv


# Creating Label - Indices file

In [None]:
import csv
import json
from ast import literal_eval

In [None]:
new_csv_path= '/content/drive/My Drive/MLOPs Project/sampled_data_entry.csv'
json_path= '/content/drive/My Drive/MLOPs Project/labels_to_indices.json'

label_to_indices = {}

with open(new_csv_path, "r") as infile:
    reader = csv.DictReader(infile)
    for row in reader:
        index = row["Image Index"]
        labels = literal_eval(row["Labels"])  # Converts string representation of list to an actual list

        for label in labels:
            if label not in label_to_indices:
                label_to_indices[label] = []
            label_to_indices[label].append(index)


with open(json_path, mode="w") as json_file:
    json.dump(label_to_indices, json_file, indent=4)