In [0]:
import os
import re
import zipfile
from concurrent.futures import ThreadPoolExecutor, as_completed

In [0]:
%sql

-- Make google_fit the current catalog for the statements that follow.
use catalog google_fit


In [0]:
%sql
-- Ensure the bronze schema exists; the name is fully qualified with the google_fit catalog.
create schema if not exists google_fit.bronze

In [0]:
%sql
-- Make bronze the current schema so that unqualified object names resolve inside it.
use schema bronze

In [0]:
%sql

-- Ensure the landing_zone volume exists. The name is unqualified, so it is
-- created in whatever catalog and schema are current when this cell runs.
create volume if not exists landing_zone

In [0]:
# S3 location that from_ext_s3_to_landing_zone() lists when looking for
# "takeout...zip" files to move.
EXTERNAL_LOC = "s3://google-fit-data-raw-12e34974-eb44-4fb4-ad47-49a2b7455430"
# Destination passed to dbutils.fs.mv, and the directory later listed for .zip
# files. Keep the trailing slash: create_io_config_for_zip_files() builds the
# extraction directories by plain string concatenation onto this value.
VOLUME_PATH = "/Volumes/google_fit/bronze/landing_zone/"

In [0]:
def from_ext_s3_to_landing_zone():
    """Move Takeout zip archives from EXTERNAL_LOC into VOLUME_PATH.

    Lists EXTERNAL_LOC with ``dbutils.fs.ls`` and calls
    ``dbutils.fs.mv(path, VOLUME_PATH)`` for every entry whose file name
    contains ``takeout`` and ends with ``.zip``.

    The function is best-effort: errors are printed, never raised. A failure
    to list the source aborts the run; a failure to move one file is reported
    and the remaining files are still attempted.

    Returns:
        None
    """
    # Anchored on the extension so that only files which
    # create_io_config_for_zip_files() will later pick up (it filters on
    # ``.endswith(".zip")``) are moved into the landing zone.
    takeout_zip = re.compile(r"takeout.*\.zip$")

    try:
        candidates = dbutils.fs.ls(EXTERNAL_LOC)
    except Exception as exc:
        print(exc)
        return

    for file_info in candidates:
        # Match on the last path component only, so the bucket or a parent
        # prefix can never make an unrelated file look like a Takeout export.
        file_name = file_info.path.split("/")[-1]
        if not takeout_zip.search(file_name):
            continue
        try:
            dbutils.fs.mv(file_info.path, VOLUME_PATH)
        except Exception as exc:
            # Keep going: one bad file should not strand the others.
            print(exc)

def unzip_file(file_path, output_path):
    """Extract one zip archive into ``output_path`` unless it was already extracted.

    Args:
        file_path: Local (POSIX-style) path of the zip archive to read.
        output_path: Directory to extract into; created by ``extractall`` if
            it does not exist.

    Returns:
        None. If ``output_path`` already exists and contains at least one
        entry, the archive is treated as already extracted and is not opened.

    Raises:
        zipfile.BadZipFile, OSError: propagated from opening or extracting
            the archive.
    """
    print(f"Started extracting files from the path : {file_path}")
    print(f"Extracting to the path: {output_path}")

    # Check with os.* instead of dbutils.fs.ls: ls raises when the directory
    # does not exist yet, which is exactly the first-run case, so extraction
    # would never be reached. The archive is already read through the local
    # path, so the same path style is used for the existence check.
    if os.path.isdir(output_path) and os.listdir(output_path):
        print(f"Extracted files already exits for the file :{file_path}")
        return

    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        zip_ref.extractall(output_path)

def create_io_config_for_zip_files():
    """Map every zip archive in VOLUME_PATH to its extraction directory.

    Returns:
        dict: ``{zip_path: output_dir}`` where ``zip_path`` is the listed path
        with a leading ``dbfs:`` scheme removed, and ``output_dir`` is
        ``VOLUME_PATH + <file name without the .zip suffix> + "/extracted/"``.
        Entries whose last path component does not end in ``.zip`` are skipped.
    """
    scheme = "dbfs:"
    zip_suffix = ".zip"
    io_config_for_zip_files = {}

    for fileinfo in dbutils.fs.ls(VOLUME_PATH):
        file_name = fileinfo.path.split("/")[-1]
        if not file_name.endswith(zip_suffix):
            continue

        # Drop only the extension. Cutting at the first dot would send
        # "a.part1.zip" and "a.part2.zip" to the same output directory.
        archive_stem = file_name[: -len(zip_suffix)]

        # Strip the scheme only when it is a prefix, never from the middle
        # of the path.
        if fileinfo.path.startswith(scheme):
            local_zip_path = fileinfo.path[len(scheme):]
        else:
            local_zip_path = fileinfo.path

        io_config_for_zip_files[local_zip_path] = VOLUME_PATH + archive_stem + "/extracted/"

    return io_config_for_zip_files

def extract_all_zip_files(io_config_for_zip_files: dict):
    """Extract every archive in the mapping concurrently, one thread per archive.

    Args:
        io_config_for_zip_files: ``{zip_path: output_dir}`` pairs; each pair
            is passed to ``unzip_file(zip_path, output_dir)``.

    Returns:
        None. Extraction failures are printed together with the archive path
        and do not stop the other extractions.
    """
    if not io_config_for_zip_files:
        # ThreadPoolExecutor raises ValueError for max_workers=0, so an empty
        # landing zone has to be handled before the pool is created.
        return

    with ThreadPoolExecutor(len(io_config_for_zip_files)) as executor:
        # Remember which archive each future belongs to, so that a failure
        # can be reported with its source path.
        future_to_zip = {
            executor.submit(unzip_file, file_path, output_path): file_path
            for file_path, output_path in io_config_for_zip_files.items()
        }
        for future in as_completed(future_to_zip):
            try:
                future.result()
            except Exception as exc:
                print(f"Failed to extract {future_to_zip[future]}: {exc}")



In [0]:
def main():
    """Run the ingestion steps in order: move archives in, then extract them."""
    # Step 1: bring the archives from the external location into the volume.
    from_ext_s3_to_landing_zone()

    # Step 2: work out where each archive should be unpacked, then unpack.
    zip_io_config = create_io_config_for_zip_files()
    extract_all_zip_files(zip_io_config)

In [0]:
# Execute the pipeline defined by main().
main()