In [1]:
# dependency
!pip install py7zr

Collecting py7zr
  Downloading py7zr-1.1.2-py3-none-any.whl.metadata (17 kB)
Collecting brotli>=1.2.0 (from py7zr)
  Downloading brotli-1.2.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (6.1 kB)
Collecting backports.zstd>=1.0.0 (from py7zr)
  Downloading backports_zstd-1.3.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (6.9 kB)
Collecting pyppmd>=1.3.1 (from py7zr)
  Downloading pyppmd-1.3.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (5.4 kB)
Collecting pybcj>=1.0.6 (from py7zr)
  Downloading pybcj-1.0.7-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (3.9 kB)
Collecting multivolumefile>=0.2.3 (from py7zr)
  Downloading multivolumefile-0.2.3-py3-none-any.whl.metadata (6.3 kB)
Collecting inflate64>=1.0.4 (from py7zr)
  Downloading inflate64-1.0.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86

In [2]:
# Unzipping dirs
import os
import py7zr

# Each pair maps a .7z archive to the directory it is extracted into.
pairs = [
    ("/kaggle/input/cifar-10/train.7z", "/kaggle/working/train"),
    ("/kaggle/input/cifar-10/test.7z",  "/kaggle/working/test"),
]

for file_path, out_dir in pairs:
    # Fail fast with a clear message when the archive is not available.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Missing archive: {file_path}")

    os.makedirs(out_dir, exist_ok=True)

    # If directory already has files, skip it.
    # The scandir iterator is only partially consumed by any(), so it is
    # closed explicitly through the context manager instead of being left
    # for the garbage collector.
    # NOTE: any non-empty directory counts as "already extracted", so the
    # leftovers of an interrupted extraction are skipped as well.
    with os.scandir(out_dir) as entries:
        already_populated = any(entries)
    if already_populated:
        print(f"Skipping (already extracted): {out_dir}")
        continue

    print(f"Extracting {file_path} -> {out_dir}")
    with py7zr.SevenZipFile(file_path, mode="r") as z:
        z.extractall(path=out_dir)


Extracting /kaggle/input/cifar-10/train.7z -> /kaggle/working/train
Extracting /kaggle/input/cifar-10/test.7z -> /kaggle/working/test


In [3]:
import os, re, glob
import pandas as pd
from PIL import Image
from tqdm.auto import tqdm
import torch
from transformers import pipeline

# Directory that is searched (recursively) for the test images.
TEST_DIR = "/kaggle/working/test/test"
# Path of the CSV file that receives the predictions.
OUT_CSV = "/kaggle/working/sampleSubmission.csv"

# Class names handed to the zero-shot classifier as candidate labels.
CANDIDATE_LABELS = [
    "airplane",
    "automobile",
    "bird",
    "cat",
    "deer",
    "dog",
    "frog",
    "horse",
    "ship",
    "truck",
]
# Text template passed to the classifier as hypothesis_template;
# "{}" is the placeholder for a candidate label.
HYPOTHESIS_TEMPLATE = "a photo of a {}"

def numeric_stem(path: str):
    """Return the first run of digits in a file's base name as an int.

    The directory part and the final extension of ``path`` are discarded
    before searching, so digits in parent folders are ignored.

    Args:
        path: File path or bare file name.

    Returns:
        The integer value of the first digit sequence found in the stem,
        or ``None`` when the stem contains no digits.
    """
    filename = os.path.basename(path)
    stem, _extension = os.path.splitext(filename)
    digits = re.search(r"\d+", stem)
    if digits is None:
        return None
    return int(digits.group())

def chunks(lst, n):
    """Yield consecutive slices of ``lst`` containing at most ``n`` items.

    Args:
        lst: A sequence that supports ``len()`` and slicing.
        n: Maximum number of items per chunk; must be at least 1.

    Yields:
        ``lst[i:i+n]`` for ``i = 0, n, 2n, ...``; the last chunk may be
        shorter than ``n``. An empty ``lst`` yields nothing.

    Raises:
        ValueError: If ``n`` is smaller than 1. Because this is a generator,
            the error surfaces when iteration starts.
    """
    # A negative step would make range() empty and silently drop every item,
    # and a zero step fails with an unrelated-looking range() message.
    if n < 1:
        raise ValueError(f"chunk size must be a positive integer, got {n!r}")
    for i in range(0, len(lst), n):
        yield lst[i:i+n]

# Collect images: one recursive glob under TEST_DIR per supported extension.
img_paths = []
for ext in ("png", "jpg", "jpeg", "webp", "bmp"):
    img_paths += glob.glob(os.path.join(TEST_DIR, f"**/*.{ext}"), recursive=True)

# Nothing to classify means the extraction step did not produce the images.
if len(img_paths) == 0:
    raise FileNotFoundError(f"No images found under {TEST_DIR}")

# Order by the numeric id embedded in each file name; as soon as one file
# has no such id, fall back to plain lexicographic path order.
if any(numeric_stem(p) is None for p in img_paths):
    img_paths.sort()
else:
    img_paths.sort(key=numeric_stem)

# 0 when CUDA is available, otherwise -1.
# NOTE(review): this value is not passed to the pipeline call below, which
# relies on device_map="auto" for placement instead.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Number of images sent to the classifier per call; the same value is also
# given to the pipeline as its batch_size.
BATCH_SIZE = 64

# Zero-shot image classifier used to label the test images.
clf = pipeline(
    model="openai/clip-vit-large-patch14-336",
    task="zero-shot-image-classification",
    batch_size=BATCH_SIZE,
    device_map="auto",
)

def _load_rgb(path):
    """Decode the image at ``path`` as RGB and close the source file."""
    # The context manager closes the opened file deterministically, also when
    # decoding fails; convert() hands back a separate in-memory image, so the
    # result stays usable after the source has been closed.
    with Image.open(path) as img:
        return img.convert("RGB")


# One {"id", "label"} record per test image, in img_paths order.
rows = []
for paths_batch in tqdm(list(chunks(img_paths, BATCH_SIZE)), desc="Predicting", unit="batch"):
    imgs = [_load_rgb(p) for p in paths_batch]

    outs = clf(
        imgs,
        candidate_labels=CANDIDATE_LABELS,
        hypothesis_template=HYPOTHESIS_TEMPLATE,
        top_k=1,
    )

    for p, out in zip(paths_batch, outs):
        # The first entry of each per-image result is used as the prediction.
        # numeric_stem() already returns None for names without digits, so
        # its result is stored as the id directly.
        rows.append({"id": numeric_stem(p), "label": out[0]["label"]})

# Turn the collected prediction records into a table.
sub = pd.DataFrame(rows)

# If filenames aren't numeric, fallback to 1..N: a single missing id causes
# every row to be renumbered in its current order.
if not sub["id"].notna().all():
    sub["id"] = range(1, len(sub) + 1)

# Keep only the two submission columns, ordered by id, and write them out
# without the DataFrame index.
sub = sub[["id", "label"]].sort_values(by="id")
sub.to_csv(OUT_CSV, index=False)

print("Wrote:", OUT_CSV)
print(sub.head())


2026-01-19 16:10:41.316475: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1768839041.558140      23 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1768839041.624760      23 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1768839042.171146      23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768839042.171178      23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768839042.171181      23 computation_placer.cc:177] computation placer alr

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Device set to use cuda:0


Predicting:   0%|          | 0/4688 [00:00<?, ?batch/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Wrote: /kaggle/working/sampleSubmission.csv
   id       label
0   1        bird
1   2        bird
2   3  automobile
3   4        ship
4   5    airplane
