In [2]:
# EDA on MER Zip from Azure Blob
from azure.storage.blob import BlobServiceClient
from huggingface_hub import hf_hub_download
from pathlib import Path
import zipfile, random, os
from dotenv import load_dotenv

# pip install adlfs requests tqdm
import os, requests
from tqdm import tqdm
import adlfs

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# load from .env
load_dotenv()

CONN_STR  = os.getenv("AZURE_CONN_STR")
CONTAINER = os.getenv("AZURE_CONTAINER", "merdata-23")
BLOB_NAME = os.getenv("AZURE_BLOB", "mer2023train.zip")
ACCOUNT_NAME = os.getenv("AZURE_ACCOUNT_NAME")      
ACCOUNT_KEY  = os.getenv("AZURE_ACCOUNT_KEY")      
HF_TOKEN   = os.getenv("HF_TOKEN")

print("Container:", CONTAINER)
print("Blob:", BLOB_NAME)
print("ACCOUNT NAME:", ACCOUNT_NAME)

Container: merdata-23
Blob: mer2023train.zip
ACCOUNT NAME: mymlprojectsstorage


In [4]:
# Hugging Face raw URLs (add all parts you need)
HF_BASE = "https://huggingface.co/datasets/MERChallenge/MER2023/resolve/main/"
FILES = [
    "mer2023train.z01", "mer2023train.z02", "mer2023train.z03",
    "mer2023train.z04", "mer2023train.z05", "mer2023train.z06",
    "mer2023train.zip",
]

In [6]:
#Acess Azure storage account and then upload files there 

fs = adlfs.AzureBlobFileSystem(account_name=ACCOUNT_NAME, account_key=ACCOUNT_KEY)

def copy_http_to_blob(http_url: str, blob_path: str, chunk=1024*1024):
    headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
    with requests.get(http_url, headers=headers, stream=True, allow_redirects=True) as r:
        r.raise_for_status()
        with fs.open(blob_path, "wb") as out:
            for c in tqdm(r.iter_content(chunk_size=chunk), desc=os.path.basename(blob_path)):
                if c:
                    out.write(c)

for fname in FILES:
    src = HF_BASE + fname
    dst = f"az://{CONTAINER}/{fname}"
    print("Uploading →", dst)
    copy_http_to_blob(src, dst)

print("✅ Uploaded all parts directly to Blob")



Uploading → az://merdata-23/mer2023train.z01


mer2023train.z01: 17545it [1:22:19,  3.55it/s]


ChunkedEncodingError: ('Connection broken: IncompleteRead(18398227132 bytes read, 3076609348 more expected)', IncompleteRead(18398227132 bytes read, 3076609348 more expected))

In [14]:
# --- 1. Download if not cached ---
def download_blob():
    if LOCAL_ZIP.exists():
        print(f"[cache hit] {LOCAL_ZIP}")
        return LOCAL_ZIP
    LOCAL_ZIP.parent.mkdir(parents=True, exist_ok=True)
    svc = BlobServiceClient.from_connection_string(CONN_STR)
    cont = svc.get_container_client(CONTAINER)
    blob = cont.get_blob_client(BLOB_NAME)
    with open(LOCAL_ZIP, "wb") as f:
        f.write(blob.download_blob().readall())
    print(f"[downloaded] {BLOB_NAME} -> {LOCAL_ZIP}")
    return LOCAL_ZIP

zip_path = download_blob()


[cache hit] data/raw/mer2023train.zip


In [15]:
zip_path = Path("data/raw/mer2023train.zip")

with zipfile.ZipFile(zip_path, "r") as zf:
    all_files = zf.namelist()
    print("Total files:", len(all_files))
    print("First 20:", all_files[:20])

BadZipFile: zipfiles that span multiple disks are not supported

In [None]:
import subprocess, re, random
from pathlib import Path

def sample_files(files, ext, k=3):
    subset = [f for f in files if f.lower().endswith(ext)]
    return random.sample(subset, min(k, len(subset)))


txts = sample_files(all_files, ".txt")
wavs = sample_files(all_files, ".wav")
mp4s = sample_files(all_files, ".mp4")

print("Sample TXT:", txts)
print("Sample WAV:", wavs)
print("Sample MP4:", mp4s)


In [None]:
sample_dir = Path("data/sample")
sample_dir.mkdir(parents=True, exist_ok=True)

with zipfile.ZipFile(zip_path, "r") as zf:
    for f in txts[:1] + wavs[:1] + mp4s[:1]:
        zf.extract(f, path=sample_dir)
        print("Extracted:", f)


In [None]:
# --- 4. Peek inside a transcript without extracting all ---
with zipfile.ZipFile(zip_path, "r") as zf:
    with zf.open(txt_files[0]) as f:
        content = f.read().decode("utf-8", errors="ignore")
        print("Transcript preview:", content[:300])


In [None]:
# --- 5. Extract just one audio + one video for inspection ---
extract_dir = Path("data/sample")
extract_dir.mkdir(parents=True, exist_ok=True)

with zipfile.ZipFile(zip_path, "r") as zf:
    for fname in wav_files[:1] + mp4_files[:1]:
        zf.extract(fname, path=extract_dir)
        print(f"Extracted {fname} to {extract_dir}")