In [None]:
import os
import pathlib
import tarfile
import zipfile

import zipline

In [ ]:
def _read_if_mz(stream):
    """Return the full content of *stream* if it starts with b'MZ', else None.

    Only the two magic bytes are read first, so non-matching files are not
    loaded into memory.
    """
    head = stream.read(2)
    if head != b'MZ':
        return None
    return head + stream.read()


def _read_zip_member(archive, info, start_ix, location):
    """Read one ZIP member, guessing its password from ZIP_PASSWORDS if needed.

    Returns ``(content, password_index)`` where ``password_index`` is the
    index that worked (or ``start_ix`` unchanged for unencrypted members), so
    the caller can try that password first for the next member.

    Raises ``Exception`` when the member is encrypted and no entry of
    ZIP_PASSWORDS decrypts it.
    """
    # Bit 0 of the general-purpose flags marks an encrypted member; plain
    # members are read directly and never touch ZIP_PASSWORDS.
    if not info.flag_bits & 0x1:
        return archive.read(info), start_ix

    # Try the last password that worked first, then every other candidate,
    # so archives mixing several known passwords are still readable.
    candidates = list(range(start_ix, len(ZIP_PASSWORDS))) + list(range(start_ix))
    for ix in candidates:
        try:
            return archive.read(info, pwd=ZIP_PASSWORDS[ix]), ix
        except RuntimeError:
            # zipfile reports a wrong password as RuntimeError.
            continue
    raise Exception(
        f"Unable to guess ZIP encryption passwords for {location}")


def file_bytes_generator(location, maxsize, return_filename=True):
    """Yield the content of files that begin with the b'MZ' magic bytes.

    Works for ZIP archives, tar archives (``.tar``, ``.tar.bz2``, ``.tar.gz``,
    ``.tgz``) and directories (top-level entries only, no recursion).

    Args:
        location: Path (``str`` or ``os.PathLike``) of an archive or directory.
        maxsize: Files whose (uncompressed) size exceeds this many bytes are
            skipped.
        return_filename: When true, yield ``(name, content)`` tuples;
            otherwise yield only ``content`` (bytes).

    Yields:
        ``(name, content)`` or ``content``. For archives, ``name`` is
        ``location`` joined with the member name; for directories it is the
        bare file name.

    Raises:
        Exception: An encrypted ZIP member could not be decrypted with any
            password in ZIP_PASSWORDS.
    """
    path = pathlib.Path(location)
    # Normalise once so that pathlib.Path arguments work like plain strings.
    location_str = os.fspath(location)
    lowered = location_str.lower()

    def emit(name, content):
        return (name, content) if return_filename else content

    if path.is_file():
        if lowered.endswith('.zip'):
            pwd_ix = 0
            with zipfile.ZipFile(location_str, 'r') as archive:
                for info in archive.infolist():
                    if info.file_size > maxsize:
                        continue
                    content, pwd_ix = _read_zip_member(
                        archive, info, pwd_ix, location)
                    if content.startswith(b'MZ'):
                        yield emit(
                            os.path.join(location_str, info.filename), content)

        elif lowered.endswith(('.tar', '.tar.bz2', '.tar.gz', '.tgz')):
            # mode='r' lets tarfile detect the compression transparently.
            with tarfile.open(location_str, mode='r') as tar:
                for member in tar:
                    if member.size > maxsize:
                        continue
                    extracted = tar.extractfile(member)
                    if extracted is None:
                        # Not a regular file (directory, device, ...).
                        continue
                    with extracted:
                        content = _read_if_mz(extracted)
                    if content is not None:
                        yield emit(
                            os.path.join(location_str, member.name), content)

    elif path.is_dir():
        for entry in path.glob('*'):
            try:
                if not entry.is_file() or entry.stat().st_size > maxsize:
                    continue
                with open(entry, 'rb') as infile:
                    content = _read_if_mz(infile)
            except PermissionError:
                # Best effort: unreadable files are silently skipped.
                continue
            # Yield only after the file is closed and outside the try block,
            # so consumer-side errors are never swallowed here.
            if content is not None:
                # NOTE(review): only the bare file name is yielded here,
                # unlike the archive branches which yield a joined path;
                # kept as-is for backward compatibility - confirm intent.
                yield emit(entry.name, content)