# Use Python to Generate Checksums 
Presented with a whole directory of files, how would one generate a checksum for each file AND preserve the file structure in a format that makes it easy to subsequently compare manifests and then download only the appropriate files?

This function, using 4 common libraries, shows how to checksum 1490 files in a matter of seconds (about 9 seconds in the run below). It collects each file path together with its SHA-1 and SHA-256 digests into three lists, builds a pandas DataFrame from them, and we then save that DataFrame as a CSV for further use.



In [1]:
import pandas as pd 
import hashlib
import os
import datetime


def generate_checksums(directory, chunk_size=1024 * 1024):
    """Generate SHA-1 and SHA-256 checksums for every file under a directory.

    Walks ``directory`` recursively with ``os.walk`` and reads each file
    once in binary mode, feeding the same chunks to both hashers. Exactly
    one row is recorded per file, including empty files (whose digests are
    those of zero bytes of input).

    Args:
        directory: Root directory to scan; all subdirectories are included.
        chunk_size: Maximum number of bytes to read per ``read`` call.
            Must be a positive integer. Bounds memory use per file.

    Returns:
        A pandas DataFrame with the columns ``filename`` (full path as
        produced by joining the walk root and the file name),
        ``hash_SHA1`` and ``hash_SHA256`` (hex digests), in ``os.walk``
        order.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
        OSError: If a file cannot be opened or read.
    """
    if chunk_size <= 0:
        # read(0) returns b'' immediately, which would silently hash nothing.
        raise ValueError("chunk_size must be a positive integer")

    started = datetime.datetime.now()
    filenames, sha1_digests, sha256_digests = [], [], []

    for root, _, files in os.walk(directory):
        for file in files:
            filepath = os.path.join(root, file)

            # SHA-1 is kept for compatibility with manifests that use it;
            # SHA-256 is the collision-resistant digest.
            sha1 = hashlib.sha1()
            sha256 = hashlib.sha256()
            with open(filepath, 'rb') as f:
                # iter() with a sentinel stops at the first empty read (EOF).
                for chunk in iter(lambda: f.read(chunk_size), b''):
                    sha1.update(chunk)
                    sha256.update(chunk)

            # Record once per file, after the whole file has been hashed,
            # so multi-chunk files get one final digest and empty files
            # are not dropped from the manifest.
            filenames.append(filepath)
            sha1_digests.append(sha1.hexdigest())
            sha256_digests.append(sha256.hexdigest())

    print("Duration in Seconds: ", datetime.datetime.now() - started)
    return pd.DataFrame({
        'filename': filenames,
        'hash_SHA1': sha1_digests,
        'hash_SHA256': sha256_digests,
    })
if __name__ == "__main__":
    # Root of the dataset to fingerprint.
    directory_to_scan = "/kaggle/input/traffic-signs-dataset-in-yolo-format"
    # `f` stays a module-level name: later notebook cells read it.
    f = generate_checksums(directory=directory_to_scan)
    # Persist the manifest (index column included) for later comparison.
    f.to_csv(path_or_buf='fileout.csv')

Duration in Seconds:  0:00:08.651496


In [2]:
# let's see how many files were processed
f.shape[0]

1490

In [3]:
# what does the output file look like?
!head fileout.csv

,filename,hash_SHA1,hash_SHA256
0,/kaggle/input/traffic-signs-dataset-in-yolo-format/yolov3_ts_train.cfg,b0c01731b0a62279ba762628e8d0e8d07723cf83,cdf1f1d65f81875e3378fc4de80a97b61f066dc955bf67a320dc022f3b503388
1,/kaggle/input/traffic-signs-dataset-in-yolo-format/getting-full-path.py,39af898b7f324be95764cdabd6eb5f2c3b4d27f8,4cd280a8e57e25736ff01d0238f96bcbe985c1fb0ff30df745c35ec01f43e215
2,/kaggle/input/traffic-signs-dataset-in-yolo-format/test.txt,7bed41b39d419650091a04a790d1f16aea24bc42,e6b24c8f74469a4ad790f6b136540fa4bc38e4be0b695fac0c9159587fa1a6c9
3,/kaggle/input/traffic-signs-dataset-in-yolo-format/ts_data.data,c019c9ac21bb511b9cb236db5555386a112791f6,4082abba3abea1765ce1817154b6344b61e89af09424a290be295123b576b32e
4,/kaggle/input/traffic-signs-dataset-in-yolo-format/train.txt,8fe225c4f9cdd8c2a515fb8384e4f56b28f7cac3,3ed48dacf41e2918815eede88fa51153c243e71f8ef1f085cc815f055f5ae871
5,/kaggle/input/traffic-signs-dataset-in-yolo-format/traffic-sign-to-test.mp4,14533d9287dc4212