# In [None]:
import urllib.request
import gzip
import shutil
import uuid
import os
from datetime import datetime
from azure.identity import DefaultAzureCredential
from azure.storage.blob import BlobServiceClient, BlobBlock

# Number of bytes read from disk and staged per block during upload (4 MiB).
CHUNK_SIZE = 4 * 1024 * 1024

# Storage account endpoint and the credential used to authenticate against it.
account_url = "https://storageaccimdbdl.blob.core.windows.net"
credential = DefaultAzureCredential()

# Client scoped to the container that receives the uploaded files.
container_name = "bronze"
blob_service_client = BlobServiceClient(account_url, credential=credential)
container_client = blob_service_client.get_container_client(container_name)

# Run date (DD-MM-YYYY); it names both the local scratch folder and the
# blob name prefix used for this run.
today_date = f"{datetime.today():%d-%m-%Y}"
download_path = "./tmp/" + today_date + "/"
os.makedirs(download_path, exist_ok=True)

# Base URL of the IMDb dumps and the files fetched from it; each one is
# downloaded with a ".gz" suffix appended to the name listed here.
imdb_dataset_domain = "https://datasets.imdbws.com/"
all_dataset_filenames = [
    "name.basics.tsv",
    "title.akas.tsv",
    "title.basics.tsv",
    "title.episode.tsv",
    "title.principals.tsv",
    "title.ratings.tsv",
]

def decompress_gz_to_file(gz_path, output_path):
    """Decompress a gzip archive into a plain file on disk.

    The payload is streamed through a 1 MiB buffer instead of being loaded
    into memory in one piece. ``output_path`` is created, or overwritten if
    it already exists.

    Args:
        gz_path: Path of the gzip-compressed input file.
        output_path: Path the decompressed bytes are written to.
    """
    copy_buffer_size = 1024 * 1024
    with gzip.open(gz_path, 'rb') as compressed:
        with open(output_path, 'wb') as plain:
            shutil.copyfileobj(compressed, plain, copy_buffer_size)

def upload_large_file_in_chunks(blob_client, file_path):
    """Upload a local file to a blob as a sequence of staged blocks.

    The file is read ``CHUNK_SIZE`` bytes at a time. Every chunk is staged
    under a freshly generated UUID block id, and once the whole file has
    been read the collected ids are committed in the order they were read.
    Progress is printed after each staged block and again on completion.

    Args:
        blob_client: Client for the destination blob; it must provide
            ``stage_block`` and ``commit_block_list``.
        file_path: Path of the local file to upload.
    """
    staged_blocks = []

    with open(file_path, 'rb') as source:
        # iter(callable, sentinel) keeps reading until read() returns b"" (EOF).
        chunks = iter(lambda: source.read(CHUNK_SIZE), b"")
        for block_num, chunk in enumerate(chunks, start=1):
            block_id = str(uuid.uuid4())
            staged_blocks.append(BlobBlock(block_id=block_id))
            blob_client.stage_block(block_id=block_id, data=chunk)
            print(f"Uploaded block {block_num} ({len(chunk)} bytes)")

    blob_client.commit_block_list(staged_blocks)
    print(f"Completed upload: {file_path}")

# Download, decompress and upload each IMDb dataset in turn. The try/finally
# guarantees the local scratch directory is removed even when a download,
# decompression or upload step raises, so failed runs do not leave large
# temporary files behind.
try:
    for filename in all_dataset_filenames:
        gz_path = os.path.join(download_path, f"{filename}.gz")
        tsv_path = os.path.join(download_path, filename)

        urllib.request.urlretrieve(f"{imdb_dataset_domain}{filename}.gz", gz_path)

        decompress_gz_to_file(gz_path, tsv_path)
        # The archive is not needed once decompressed; remove it before the
        # upload so it does not occupy disk space alongside the TSV.
        os.remove(gz_path)

        # Blob names are prefixed with the run date: "<date>/<filename>".
        blob_client = container_client.get_blob_client(blob=f"{today_date}/{filename}")
        upload_large_file_in_chunks(blob_client, tsv_path)

        # Free the disk space before the next dataset is downloaded.
        os.remove(tsv_path)
finally:
    shutil.rmtree("./tmp")