# Benchmarking Speed

Results vary with hardware, but throughput is typically above 100,000 hashes per second.

In [2]:
import csv, secrets, pathlib
from typing import List

def generate_dummy_csvs(
    folder: str | pathlib.Path = "data",
    *,
    num_files: int = 20,
    rows_per_file: int = 100,
    id_length: int = 7,
    header: str = "ID",
) -> List[pathlib.Path]:
    """
    Create *folder* (if absent) and fill it with `num_files` CSV files.
    Each file has `rows_per_file` random IDs of fixed length `id_length`.
    
    Returns a list of the CSV paths written.
    
    Example
    -------
    >>> generate_dummy_csvs(num_files=3, rows_per_file=10)
    [PosixPath('data/file_01.csv'), PosixPath('data/file_02.csv'), ...]
    """
    folder = pathlib.Path(folder)
    folder.mkdir(parents=True, exist_ok=True)

    written: list[pathlib.Path] = []
    fmt = "{:0" + str(id_length) + "d}"          # zero-pad to fixed width

    for i in range(1, num_files + 1):
        path = folder / f"file_{i:02d}.csv"
        with path.open("w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow([header])            # header row

            for _ in range(rows_per_file):
                rand_id = secrets.randbelow(10 ** id_length)
                writer.writerow([fmt.format(rand_id)])

        written.append(path)

    return written



In [None]:
import secrets, string, time

from hash_ids import hash_student_id
from hash_csvs import hash_all_csvs

_ALPHA_NUM = string.ascii_uppercase + string.digits   # A-Z followed by 0-9 (36 symbols)


def _random_id_8() -> str:
    """Build a random 8-character ID of uppercase letters and digits.

    Each character is drawn independently from `_ALPHA_NUM` with
    ``secrets.choice``; a result looks like '2G9K4TQZ'.
    """
    picked = [secrets.choice(_ALPHA_NUM) for _ in range(8)]
    return "".join(picked)

def benchmark(bench_func, iterations: int = 10_000, *args) -> None:
    """
    Call ``bench_func(*args)`` `iterations` times and print throughput stats.

    Parameters
    ----------
    bench_func : callable invoked once per iteration.
    iterations : number of calls to time (default 10,000).
    *args      : positional arguments forwarded unchanged to every call.

    Prints the raw iteration count, then the formatted iteration count, the
    elapsed wall-clock time measured with ``time.perf_counter``, and the
    resulting number of calls per second.
    """
    # Echo the count before starting the clock so console I/O is not timed.
    print(iterations)

    t0 = time.perf_counter()
    for _ in range(iterations):
        bench_func(*args)
    elapsed = time.perf_counter() - t0

    # A zero reading would otherwise raise ZeroDivisionError.
    rate = iterations / elapsed if elapsed > 0 else float("inf")

    print(f"Iterations : {iterations:,}")
    print(f"Elapsed    : {elapsed:.4f} s")
    # Two decimals so slow benchmarks (< 1 call/s) are not reported as "0".
    print(f"iterations/s   : {rate:,.2f}")




if __name__ == "__main__":
    # Fill ./data with the default set of dummy CSVs, then time a single
    # pass of hash_all_csvs over that folder.
    generate_dummy_csvs()
    benchmark(hash_all_csvs, 1, "data")

1
Iterations : 1
Elapsed    : 3.3690 s
iterations/s   : 0


# Hashing data

In [None]:
from hash_csvs import hash_column_in_csv, hash_all_csvs

# 1. One file: hash the "ID" column of a single CSV.
# NOTE(review): presumably the file is overwritten in place — confirm in hash_csvs.
hash_column_in_csv("Example_ULA_Applications_dummy.csv", id_column="ID")


# 2. Whole folder: hash the "student_id" column for the "data" folder.
# NOTE(review): presumably writes *_hashed.csv next to each source — confirm in hash_csvs.
# NOTE(review): generate_dummy_csvs writes an "ID" header by default, so dummy
# files in data/ will not have a "student_id" column — verify the column name.
hash_all_csvs("data", id_column="student_id")

PosixPath('Example_ULA_Applications_dummy.csv')