# Cleaning File Names

In [None]:
import pandas as pd
from google.colab import drive
import os
import zipfile
from tqdm import tqdm
# Mount Google Drive at /content/drive so the paths used below resolve.
# This will prompt for authorization to access your Google Drive from Colab.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Directory on the mounted Drive where the CSV files are located
folder_path = '/content/drive/My Drive/Capstone/Data/'

# Load the image metadata table. os.path.join is used instead of an f-string
# because folder_path already ends with a separator, which previously produced
# a doubled "//" in the resulting path.
data = pd.read_csv(os.path.join(folder_path, 'Data_Entry_2017_v2020.csv'))

In [None]:
# Display the loaded metadata table
data

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y]
0,00000001_000.png,Cardiomegaly,0,1,57,M,PA,2682,2749,0.143,0.143
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168
3,00000002_000.png,No Finding,0,2,80,M,PA,2500,2048,0.171,0.171
4,00000003_001.png,Hernia,0,3,74,F,PA,2500,2048,0.168,0.168
...,...,...,...,...,...,...,...,...,...,...,...
112115,00030801_001.png,Mass|Pneumonia,1,30801,38,M,PA,2048,2500,0.168,0.168
112116,00030802_000.png,No Finding,0,30802,28,M,PA,2048,2500,0.168,0.168
112117,00030803_000.png,No Finding,0,30803,42,F,PA,2048,2500,0.168,0.168
112118,00030804_000.png,No Finding,0,30804,29,F,PA,2048,2500,0.168,0.168


## Extracting the resized images from final_images.zip


In [None]:
# Source archive on the mounted Drive and the directory it is unpacked into
zip_file_path = '/content/drive/My Drive/Capstone/Data/final_images.zip'
extraction_path = '/content/drive/My Drive/Capstone/Data/Images'

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    # One entry per member stored in the archive
    zip_contents = zip_ref.infolist()

    # Extract member by member so tqdm can report progress
    progress = tqdm(zip_contents, desc='Extracting files')
    for member in progress:
        zip_ref.extract(member, path=extraction_path)

Extracting files: 100%|██████████| 112121/112121 [10:56<00:00, 170.85it/s]


## Renaming the files by removing the first three characters of each name

In [None]:
# Directory containing the extracted images; later cells list its contents.
directory_path = '/content/drive/My Drive/Capstone/Data/Images/resized_images'
# The renaming helper below is commented out, so running this cell only
# defines directory_path.
# NOTE(review): presumably the rename was a one-off step that has already been
# applied to the files on Drive -- confirm before re-enabling it.
#
# The helper strips the first three characters from every 19-character file
# name, which yields 16-character names of the same form as the values in the
# 'Image Index' column (e.g. 00000001_000.png).
#
# def rename_files(directory_path):
#     # Rename the files in the directory
#     for filename in os.listdir(directory_path):
#         # Check if the filename length is 19 characters
#         if len(filename) == 19:
#             # Compute the new name by removing the first three characters
#             new_name = filename[3:]
#             # Compute the full old and new file paths
#             old_file_path = os.path.join(directory_path, filename)
#             new_file_path = os.path.join(directory_path, new_name)
#             # Rename the file
#             os.rename(old_file_path, new_file_path)
#             print(f"Renamed {filename} to {new_name}")

# # Call the rename function
# rename_files(directory_path)

In [None]:
# Function to list all files in a directory
def list_files_in_directory(directory):
    """Return the names of the regular files directly inside ``directory``.

    Subdirectories are skipped and the listing is not recursive. Names are
    returned without the directory prefix, in the arbitrary order reported by
    the operating system.

    Args:
        directory: Path of the directory to inspect.

    Returns:
        list[str]: File names (not full paths).

    Raises:
        OSError: If ``directory`` cannot be listed (for example, it does not
            exist).
    """
    # os.scandir reports each entry's type along with its name, which usually
    # avoids the extra per-entry stat call that os.listdir + os.path.isfile
    # needs -- noticeable on directories with many files.
    with os.scandir(directory) as entries:
        return [entry.name for entry in entries if entry.is_file()]


In [None]:
# Names of the files currently stored in the image directory on Drive
google_drive_filenames = list_files_in_directory(directory_path)

In [None]:
# Number of files found in the image directory
len(google_drive_filenames)

112120

In [None]:
# Sorted copy of the file names; the original list is left unchanged
sortedfiles = sorted(google_drive_filenames)

In [None]:
# Image file names recorded in the metadata table
image_index_list = data['Image Index'].tolist()

# Build sets so the two collections can be diffed in either direction
image_index_set = set(image_index_list)
sortedfiles_set = set(sortedfiles)

# Present on disk but absent from the 'Image Index' column
missing_in_image_index = sortedfiles_set.difference(image_index_set)

# Listed in the 'Image Index' column but absent from disk
missing_in_sortedfiles = image_index_set.difference(sortedfiles_set)

In [None]:
# Files on disk that have no row in the metadata table
missing_in_image_index

set()

In [None]:
# Metadata rows whose image file is not on disk
missing_in_sortedfiles

set()

In [None]:
# Show the first rows of the metadata table
data.head()

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y]
0,00000001_000.png,Cardiomegaly,0,1,57,M,PA,2682,2749,0.143,0.143
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168
3,00000002_000.png,No Finding,0,2,80,M,PA,2500,2048,0.171,0.171
4,00000003_001.png,Hernia,0,3,74,F,PA,2500,2048,0.168,0.168


## Verification for any inconsistencies

In [None]:
import os
import pandas as pd


image_directory_path = '/content/drive/My Drive/Capstone/Data/Images/resized_images'

# Names of every entry in the image directory, held in a set for fast lookup
existing_files = set(os.listdir(image_directory_path))

# Collect each 'Image Index' value that has no matching entry in the directory
missing_files = [
    image_name
    for image_name in data['Image Index']
    if image_name not in existing_files
]

# Report the outcome
if not missing_files:
    print("All files are present in the directory.")
else:
    print(f"The following files are missing: {missing_files}")


All files are present in the directory.
