In [1]:
import pandas as pd
# Load the wnid/label lookup table and preview its first five rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which usually means
# the CSV was written together with its index. Passing `index_col=0` would
# drop it -- confirm nothing downstream relies on that column first.
dataset = pd.read_csv(filepath_or_buffer='data.csv')
dataset.head(n=5)

Unnamed: 0,wnid,label
0,n00004475,"organism, being"
1,n00005787,benthos
2,n00006024,heterotroph
3,n00006484,cell
4,n00007846,"person, individual, someone, somebody, mortal,..."


In [2]:
import os
import tarfile
from tqdm import tqdm
from PIL import Image
import io

def _safe_member_path(dest_root: str, member_name: str):
    """Return the resolved on-disk path for a tar member, or None if it would land outside dest_root.

    `dest_root` must already be an absolute, symlink-resolved path.
    """
    # realpath collapses ".." segments and follows symlinks, so a member such
    # as "../../etc/passwd" is compared by where it would really be written.
    candidate = os.path.realpath(os.path.join(dest_root, member_name))
    try:
        inside = os.path.commonpath([dest_root, candidate]) == dest_root
    except ValueError:
        # Raised when the paths cannot be compared (e.g. different drives on Windows).
        inside = False
    if not inside or candidate == dest_root:
        return None
    return candidate


def _extract_member(tar, member, dest_folder: str) -> None:
    """Write one tar member below dest_folder, resized to 256x256 when it decodes as an image."""
    member_path = _safe_member_path(dest_folder, member.name)
    if member_path is None:
        # Security: never write outside the destination folder.
        return

    extracted = tar.extractfile(member)
    if extracted is None:
        return

    # Read the payload once so the raw-copy fallback does not depend on
    # rewinding the tar stream.
    with extracted:
        payload = extracted.read()

    os.makedirs(os.path.dirname(member_path), exist_ok=True)
    try:
        with Image.open(io.BytesIO(payload)) as img:
            resized = img.convert("RGB").resize((256, 256), Image.Resampling.LANCZOS)
        resized.save(member_path, format="JPEG", quality=95)
    except Exception:
        # Deliberate best-effort fallback: anything that cannot be processed
        # as an image is written out unchanged.
        with open(member_path, "wb") as f:
            f.write(payload)


def extract_tar(part_id: str, tar_path: str, target_path: str = None) -> None:
    """
    Extracts a .tar file into a structured directory and resizes all images to 256x256.

    If target_path is given (absolute or relative to the current directory):
        -> target_path/extracted/<part_id>/
    Otherwise:
        -> <directory_of_tar>/extracted/<part_id>/

    Every regular file in the archive is decoded with PIL, converted to RGB,
    resized to 256x256 and saved as JPEG (quality 95) under its original member
    name. Files that cannot be processed as images are copied unchanged.
    Members whose path would resolve outside the destination folder are skipped.

    Errors while opening or reading the archive are printed, not raised.

    Args:
        part_id (str): ImageNet class ID (e.g., 'n02124075')
        tar_path (str): Path to the .tar file
        target_path (str, optional): Root directory for extraction
    """
    # Determine base extraction directory
    if target_path:
        base_extract_dir = os.path.join(os.path.abspath(target_path), "extracted")
    else:
        base_extract_dir = os.path.join(os.path.dirname(os.path.abspath(tar_path)), "extracted")

    # Resolve once so every member path is checked against the same real root.
    dest_folder = os.path.realpath(os.path.join(base_extract_dir, part_id))
    os.makedirs(dest_folder, exist_ok=True)

    try:
        with tarfile.open(tar_path, "r:*") as tar:
            members = [m for m in tar.getmembers() if m.isfile()]

            with tqdm(total=len(members), unit="files", desc=f"Extracting {part_id}", ncols=100, leave=False) as pbar:
                for member in members:
                    _extract_member(tar, member, dest_folder)
                    # Count skipped members too, so the bar always reaches its total.
                    pbar.update(1)
    except Exception as e:
        print(f"✗ Failed to extract {tar_path}: {e}")

In [3]:
import os
import requests
from tqdm import tqdm

def download_tar(parent_dir: str, main_folder_name: str, part_id: str, main_link="https://image-net.org/data/winter21_whole") -> "str | None":
    """
    Downloads a .tar file from ImageNet (winter21_whole) into a new folder inside parent_dir,
    then extracts it with extract_tar.

    The archive is streamed to "<part_id>.tar.part" and only renamed to
    "<part_id>.tar" once the transfer finished, so an interrupted download never
    leaves a truncated archive under the final name. Displays a tqdm progress bar
    and appends any failure to "<parent_dir>/failed_downloads.log".

    Args:
        parent_dir (str): Path to the parent directory.
        main_folder_name (str): Name of the folder to create inside parent_dir.
        part_id (str): ImageNet part ID (e.g., 'n02124075').
        main_link (str): Base URL of ImageNet archive.

    Returns:
        str: Path to the downloaded .tar file if successful, None if failed.
    """
    dest_folder = os.path.join(parent_dir, main_folder_name)
    os.makedirs(dest_folder, exist_ok=True)
    log_file = os.path.join(parent_dir, "failed_downloads.log")

    filename = f"{part_id}.tar"
    # Tolerate a trailing slash on the base URL instead of producing "//".
    tar_url = f"{main_link.rstrip('/')}/{filename}"
    tar_path = os.path.join(dest_folder, filename)
    partial_path = tar_path + ".part"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/122.0 Safari/537.36",
        "Accept": "application/octet-stream",
        "Connection": "keep-alive",
    }

    try:
        with requests.get(tar_url, stream=True, timeout=60, headers=headers) as r:
            r.raise_for_status()

            # A missing/zero Content-Length means "unknown"; tqdm expects None for that.
            total_size = int(r.headers.get("Content-Length", 0)) or None
            chunk_size = 65536

            with open(partial_path, "wb") as f, tqdm(
                total=total_size,
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
                desc=f"Downloading {part_id}",
                leave=False,
                ncols=100,
            ) as pbar:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
                        pbar.update(len(chunk))

        # Publish the archive under its final name only after a complete transfer.
        os.replace(partial_path, tar_path)
        extract_tar(part_id, tar_path)

        return tar_path

    except Exception as e:
        # Best-effort cleanup of the incomplete download.
        if os.path.exists(partial_path):
            try:
                os.remove(partial_path)
            except OSError:
                pass
        with open(log_file, "a", encoding="utf-8") as log:
            log.write(f"{part_id} - {str(e)}\n")
        print(f"Failed to download {part_id}: {e}")
        return None


# Root directory for downloaded archives. Defaults to the original local path;
# set the IMAGENET_PARENT_DIR environment variable to run this notebook on
# another machine without editing the cell.
parent_dir = os.environ.get("IMAGENET_PARENT_DIR", "/home/ramachandra/Videos/Imagenet/")
main_folder = "imagenet_class"
part_id = "n02352591"

# Last expression of the cell: shows the path of the downloaded archive (None on failure).
download_tar(parent_dir, main_folder, part_id)

                                                                                                    

'/home/ramachandra/Videos/Imagenet/imagenet_class/n02352591.tar'