In [2]:
import hashlib
import pandas as pd

from pathlib import Path
from tqdm.auto import tqdm

In [4]:
def md5(file_path):
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is opened in binary mode and consumed in 4096-byte blocks, so
    the full content is never held in memory at once.

    Args:
        file_path: Anything accepted by ``pathlib.Path`` (str or path-like).

    Returns:
        The digest as a lowercase hexadecimal string.
    """
    digest = hashlib.md5()
    with Path(file_path).open(mode='rb') as stream:
        while True:
            block = stream.read(4096)
            if block == b"":
                # An empty read marks end-of-file.
                break
            digest.update(block)
    return digest.hexdigest()

def _list_files(dirs):
    """Return the regular files found directly inside each directory of ``dirs``.

    Directories are visited in the given order and entries are sorted within
    each directory. Entries that are not regular files (sub-directories,
    broken symlinks) are skipped, because ``md5`` can only hash something it
    can open for reading.
    """
    return [
        file_path
        for dir_path in dirs
        for file_path in sorted(Path(dir_path).iterdir())
        if file_path.is_file()
    ]


def _hash_files(file_paths):
    """Build a ``file``/``md5`` table for ``file_paths``, showing a progress bar.

    The ``file`` column holds only the base name of each path, so the table
    can later be indexed by file name.
    """
    return pd.DataFrame(data={
        'file': [file_path.name for file_path in file_paths],
        'md5': [md5(file_path) for file_path in tqdm(file_paths)],
    })


# Source directories of the images and of their masks.
img_dirs = [
    '/media/jonas/Windhager/DeepIMC/datasets/basel/tiff',
    '/media/jonas/Windhager/DeepIMC/datasets/zurich/tiff',
]
mask_dirs = [
    '/media/jonas/Windhager/DeepIMC/datasets/basel/masks',
    '/media/jonas/Windhager/DeepIMC/datasets/zurich/masks',
]

# Hash every image and persist the table to the current working directory.
img_file_paths = _list_files(img_dirs)
img_hashes = _hash_files(img_file_paths)
img_hashes.to_csv('img_hashes.csv', index=False)

# Same procedure for the masks.
mask_file_paths = _list_files(mask_dirs)
mask_hashes = _hash_files(mask_file_paths)
mask_hashes.to_csv('mask_hashes.csv', index=False)

100%|██████████| 418/418 [04:05<00:00,  1.70it/s]
100%|██████████| 418/418 [00:06<00:00, 68.00it/s] 


In [1]:

def md5_of_jonas_img(img_file_path):
    """Look up the recorded MD5 of an image in ``img_hashes.csv``.

    Only the base name of ``img_file_path`` is used; it is first normalised
    with ``_fix_file_name`` and then searched in the ``file`` column of the
    CSV, which is read from the current working directory on every call.

    Args:
        img_file_path: Path (str or path-like) of the image to look up.

    Returns:
        The ``md5`` entry stored for the normalised file name, or the string
        ``'skipping the file'`` when the name is not listed in the CSV.

    Raises:
        ValueError: Propagated from ``_fix_file_name`` for names with an
            unrecognised prefix.
    """
    fixed_name = _fix_file_name(Path(img_file_path).name)
    hashes = pd.read_csv('img_hashes.csv', index_col='file')
    if fixed_name not in hashes.index:
        return 'skipping the file'
    return hashes.loc[fixed_name, 'md5']

def md5_of_jonas_masks(mask_file_path):
    """Look up the recorded MD5 of a mask in ``mask_hashes.csv``.

    Only the base name of ``mask_file_path`` is used; it is first normalised
    with ``_fix_file_name`` and then searched in the ``file`` column of the
    CSV, which is read from the current working directory on every call.

    Args:
        mask_file_path: Path (str or path-like) of the mask to look up.

    Returns:
        The ``md5`` entry stored for the normalised file name, or the string
        ``'skipping the file'`` when the name is not listed in the CSV.

    Raises:
        ValueError: Propagated from ``_fix_file_name`` for names with an
            unrecognised prefix.
    """
    fixed_name = _fix_file_name(Path(mask_file_path).name)
    hashes = pd.read_csv('mask_hashes.csv', index_col='file')
    if fixed_name not in hashes.index:
        return 'skipping the file'
    return hashes.loc[fixed_name, 'md5']

def _fix_file_name(file_name):
    if file_name.startswith('BaselTMA'):
        return _fix_basel_file_name(file_name)
    if file_name.startswith('ZTMA208'):
        return _fix_zurich_file_name(file_name)
    raise ValueError('Unknown file name')

def _fix_basel_file_name(file_name):
    file_name = file_name.replace('_20179015_', '_20170915_')
    file_name = file_name.replace('_a0_full.tiff', '.tiff')
    file_name = file_name.replace('_a0_full_maks.tiff', '_mask.tiff')
    return file_name

def _fix_zurich_file_name(file_name):
    file_name = file_name.replace('_a0_full.tiff', '.tiff')
    file_name = file_name.replace('_a0_full_maks.tiff', '_mask.tiff')
    return file_name