In [1]:
import hashlib
import os
import pathlib
import tarfile
import urllib.request

In [2]:
# A SHA-256 hash is a 64-character hexadecimal string uniquely representing the contents of a file.


def comp_sha256(file_name):
    hash_sha256 = hashlib.sha256()
    with pathlib.Path(file_name).open(mode="rb") as f_temp: # opening the file in binary mode (rb - read binary)
        for _seq in iter(lambda: f_temp.read(65536), b""):  # reading file in chunks (65,536 bytes - 64kb) per time, it avoids loading the whole file into the RAM.
            hash_sha256.update(_seq)
    sha256_res = hash_sha256.hexdigest()
    return sha256_res

In [3]:
# Creating a level 0 directory
core_path = os.getcwd()
pathlib.Path(os.path.join(core_path, "data/lvl0/")).mkdir(parents=True, exist_ok=True)

In [4]:
# Setting up a dictionary that contains the taxonomy classifications data and corresponding SHA256 values

files_to_dl = \
    {'file1': {'url': 'http://smass.mit.edu/data/smass/Bus.Taxonomy.txt',
               'sha256': '0ce970a6972dd7c49d512848b9736d00b621c9d6395a035bd1b4f3780d4b56c6'},
     'file2': {'url': 'http://smass.mit.edu/data/smass/smass2data.tar.gz',
               'sha256': 'dacf575eb1403c08bdfbffcd5dbfe12503a588e09b04ed19cc4572584a57fa97'}}

In [None]:
# iterating through the dictionary to download the files
for dl_key in files_to_dl:
    # splitting the url and parsing it on basis of '/'
    split = urllib.parse.urlsplit(files_to_dl[dl_key]["url"])
    filename = pathlib.Path(os.path.join(core_path, "data/lvl0/", split.path.split("/")[-1]))

    # if file is not found, then download it
    if not filename.is_file():
        print(f"Downloading the file now: {files_to_dl[dl_key]['url']}")

        # downloading the file and reteriving the location path
        downl_file_path, _= urllib.request.urlretrieve(url=files_to_dl[dl_key]['url'], filename=filename)
        
        # checking for the hash of the files to compare
        tax_hash = comp_sha256(downl_file_path)
        assert tax_hash == files_to_dl[dl_key]["sha256"]

Downloading the file now: http://smass.mit.edu/data/smass/Bus.Taxonomy.txt
Downloading the file now: http://smass.mit.edu/data/smass/smass2data.tar.gz


In [None]:
# Unzipping the tar files.

tar = tarfile.open(os.path.join(core_path, "data/lvl0/", "smass2data.tar.gz"), "r:gz")
tar.extractall(os.path.join(core_path, "data/lvl0/"))
tar.close

<bound method TarFile.close of <tarfile.TarFile object at 0x000001FF7E592CF0>>