In [None]:
import hashlib
import os
from itertools import groupby
from pathlib import Path

from icecream import ic
from tqdm import tqdm

In [None]:
# TODO: point this at the directory that should be scanned for duplicates.
# Defaults to the directory the notebook kernel was started in.
root_path = Path.cwd()
root_path

In [317]:
def get_file_hash(path, chunk_size: int = 2 ** 13, algorithm: str = "md5") -> str:
    """Return the hexadecimal digest of the file stored at ``path``.

    The file is opened in binary mode and consumed in fixed-size chunks, so
    its content never has to be held in memory all at once.

    Args:
        path: Location of the file; anything accepted by ``open``.
        chunk_size: Number of bytes read per iteration (default 8 KiB).
            Must be positive.
        algorithm: Name of a fixed-length ``hashlib`` algorithm. Defaults to
            ``"md5"``, which is used here as a content fingerprint only,
            not for any security purpose.

    Returns:
        The digest as a lowercase hexadecimal string.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``algorithm`` is
            not known to ``hashlib``.
        OSError: If the file cannot be opened or read.
    """
    # read(0) returns b"" immediately, which would silently yield the digest
    # of empty input for every file; reject such sizes up front.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    digest = hashlib.new(algorithm)
    with open(path, 'rb') as f:
        # iter() with a sentinel stops as soon as read() reports end of file.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

In [318]:
def filter_and_print(results):
    """Print each group of paths that qualifies as a set of duplicates.

    ``results`` maps a key to a list of paths. A group is reported only when
    it contains more than one path and the first path in it refers to a
    non-empty file (checked through ``stat().st_size``). All groups are
    filtered first; printing starts only after the filtering is complete.
    """
    filtered_results = {
        key: group
        for key, group in results.items()
        if len(group) > 1 and group[0].stat().st_size > 0
    }
    for selected_paths in filtered_results.values():
        # The argument name is kept as-is: icecream includes the expression
        # text in what it prints.
        ic(selected_paths)


In [319]:
# Collect every regular file below root_path, skipping hidden files and
# everything that lives inside a hidden directory.
# Only the components *relative to root_path* are inspected: looking at all
# of path.parents would also test the ancestors of root_path itself, so a
# notebook located under e.g. "~/.local/..." would end up with no files.
# The relative parts include the file name, so one check covers both the
# directories and the file.
paths: list = [
    path for path in root_path.glob("**/*")
    if path.is_file() and
       not any(part.startswith(".") for part in path.relative_to(root_path).parts)
]

# Naive Solution (Slow)

1. Hash every file
2. Remove all groups with only one file
3. Print out the duplicates

In [None]:
# Hash every collected file and bucket the paths by digest; buckets holding
# more than one path are duplicate candidates.
results = {}
for path in tqdm(paths):
    p_hash = get_file_hash(path)
    results.setdefault(p_hash, []).append(path)

filter_and_print(results)

# Complex Solution (Fast)

1. Sort by size
2. Group by size (files that have the same content also have the same size)
3. Group by suffix (we assume that suffix is the same and name could be changed)
4. Remove all groups with one file
5. Calculate the hash of each remaining file and print all duplicates found

Steps 1–4 are cheap and reduce the number of files that need to be hashed.

In [321]:
# Stat every file exactly once and carry the size along with the path, so
# the sort and the grouping below work on the same cached value instead of
# hitting the file system twice per file.
sized_paths = sorted(
    ((path.stat().st_size, path) for path in paths),
    key=lambda sized: sized[0],
)
sorted_paths = [path for _, path in sized_paths]

# groupby only merges *adjacent* items, which is why the input is sorted by size.
grouped_by_size = groupby(sized_paths, key=lambda sized: sized[0])
size_to_paths_map = {
    size: [path for _, path in sized_group]
    for size, sized_group in grouped_by_size
}

# A file with a unique size cannot have a duplicate, so it is never hashed.
# NOTE: size is the only pre-filter applied here; file suffixes are not
# compared, so duplicates with different extensions are still detected.
duplicate_size_groups = [
    (size, same_size_paths)
    for size, same_size_paths in size_to_paths_map.items()
    if len(same_size_paths) > 1
]

hash_to_paths_map = {}

for _, paths_with_same_size in tqdm(duplicate_size_groups):
    for file_path in paths_with_same_size:
        file_hash = get_file_hash(file_path)
        hash_to_paths_map.setdefault(file_hash, []).append(file_path)

filter_and_print(hash_to_paths_map)

100%|██████████| 6/6 [00:00<00:00, 141.52it/s]
