In [1]:
!pip install transformers

!pip install timm  
!pip install pillow numpy matplotlib
!pip install opencv-python
!pip install pandas

!pip install datasets  



In [2]:
import io

import requests
import torch
from PIL import Image
from transformers import ViTImageProcessor, ViTForImageClassification

# Smoke test: report the PyTorch/CUDA environment, load the pretrained ViT
# classifier, and classify one image downloaded from the COCO validation set.

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
print(f"GPU count: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    print(f"GPU name: {torch.cuda.get_device_name(0)}")

print("\nLoading ViT model...")
processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224')

# Use the first GPU when one is visible; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    model = model.to(device)
    print(f"Model moved to: {device}")
else:
    device = torch.device("cpu")
    print("CUDA not available, using CPU")

# Explicitly select inference mode, matching the later VizWiz cell.
model.eval()

print(f"Using device: {device}")

# Download a test image. The timeout prevents the cell from hanging forever,
# and raise_for_status() turns an HTTP error into a clear exception instead of
# letting PIL fail on an error page.
url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
response = requests.get(url, timeout=30)
response.raise_for_status()
# Convert to RGB so grayscale/palette/alpha images are handled the same way
# as in the VizWiz cell.
image = Image.open(io.BytesIO(response.content)).convert('RGB')

# Preprocess the image and move every resulting tensor to the selected device.
inputs = processor(images=image, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}

print("\nRunning inference...")

# Gradients are not needed for a forward-only pass.
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# Turn logits into probabilities for the single image and keep the best five.
probs = torch.nn.functional.softmax(logits, dim=-1)[0]
top5_prob, top5_idx = torch.topk(probs, 5)

print("\nTop 5 predictions:")
# One .tolist() per tensor instead of one .item() call per element.
for i, (class_idx, prob) in enumerate(zip(top5_idx.tolist(), top5_prob.tolist())):
    class_name = model.config.id2label[class_idx]
    print(f"{i+1}. {class_name}: {prob*100:.2f}%")

print("\nViT test successful!")

# Report how much GPU memory PyTorch has allocated / reserved on device 0.
if torch.cuda.is_available():
    print(f"\nGPU Memory allocated: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")
    print(f"GPU Memory cached: {torch.cuda.memory_reserved(0) / 1024**2:.2f} MB")



PyTorch version: 2.9.0+cu128
CUDA available: True
CUDA version: 12.8
GPU count: 1
GPU name: NVIDIA GeForce RTX 5090

Loading ViT model...
Model moved to: cuda:0
Using device: cuda:0

Running inference...

Top 5 predictions:
1. Egyptian cat: 93.74%
2. tabby, tabby cat: 3.84%
3. tiger cat: 1.44%
4. lynx, catamount: 0.33%
5. Siamese cat, Siamese: 0.07%

ViT test successful!

GPU Memory allocated: 339.94 MB
GPU Memory cached: 390.00 MB


In [3]:
import zipfile
import os
from pathlib import Path

# Create data directory
data_dir = Path("data")
data_dir.mkdir(exist_ok=True)


def extract_split(split, data_dir, archive_dir=Path(".")):
    """Extract ``<archive_dir>/<split>.zip`` into ``<data_dir>/<split>`` once.

    A marker file ``<data_dir>/.<split>.extracted`` is written only after the
    archive has been fully extracted. Re-running the cell therefore skips
    finished splits, while a split whose extraction was interrupted (no
    marker) is extracted again. The marker lives next to, not inside, the
    split directory so that directory contains only archive contents.
    Delete the marker to force a fresh extraction.

    Args:
        split: Split name, e.g. ``'train'``; the archive is ``f"{split}.zip"``.
        data_dir: Directory that receives one sub-directory per split.
        archive_dir: Directory holding the zip archives (default: current dir).

    Returns:
        ``"cached"`` if the split was already extracted, ``"missing"`` if the
        archive does not exist, ``"extracted"`` after a successful extraction.
    """
    zip_path = Path(archive_dir) / f"{split}.zip"
    extract_path = Path(data_dir) / split
    done_marker = Path(data_dir) / f".{split}.extracted"

    if done_marker.exists():
        print(f"{split} already extracted to {extract_path}, skipping")
        return "cached"

    if not zip_path.exists():
        print(f"{split}.zip not found")
        return "missing"

    print(f"Extracting {split}.zip...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    # Only reached when extractall() finished without raising/interruption.
    done_marker.touch()
    print(f"{split} extracted to {extract_path}")
    return "extracted"


# Extract files
print("Extracting datasets...")
for split in ['train', 'val', 'test']:
    extract_split(split, data_dir)

print("\nExtraction complete!")

Extracting datasets...
Extracting train.zip...


KeyboardInterrupt: 

In [3]:
import json
from pathlib import Path
from collections import Counter

# Read the validation annotations; the file is decoded explicitly as UTF-8.
annotations_file = Path('data/annotations/val.json')
val_data = json.loads(annotations_file.read_text(encoding='utf-8'))

total_samples = len(val_data)
print(f"Total validation samples: {total_samples}")

# Tally how many entries fall under each answer type.
answer_types = [entry['answer_type'] for entry in val_data]
type_counts = Counter(answer_types)

print("\nAnswer type distribution:")
for kind, occurrences in type_counts.items():
    share = occurrences / total_samples * 100
    print(f"  {kind}: {occurrences} ({share:.1f}%)")

# Print the first entry as an example of the annotation schema.
print("\nSample entry:")
sample = val_data[0]
first_answers = sample['answers'][:3]  # only the first three annotator answers
print(f"Image: {sample['image']}")
print(f"Question: {sample['question']}")
print(f"Answerable: {sample['answerable']}")
print(f"Answer type: {sample['answer_type']}")
print(f"Answers: {first_answers}")

Total validation samples: 4319

Answer type distribution:
  unanswerable: 1385 (32.1%)
  other: 2691 (62.3%)
  yes/no: 195 (4.5%)
  number: 48 (1.1%)

Sample entry:
Image: VizWiz_val_00000000.jpg
Question: Ok. There is another picture I hope it is a better one.
Answerable: 0
Answer type: unanswerable
Answers: [{'answer': 'unanswerable', 'answer_confidence': 'yes'}, {'answer': 'unanswerable', 'answer_confidence': 'yes'}, {'answer': 'unanswerable', 'answer_confidence': 'yes'}]


In [6]:
import json
import random
from collections import Counter
from pathlib import Path

import torch
from PIL import Image
from transformers import ViTImageProcessor, ViTForImageClassification

# Qualitative check: run the ImageNet-pretrained ViT classifier on a few
# answerable VizWiz validation images and print its top predictions next to
# the question and the most frequent annotator answer.

print("Loading ViT model...")
processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224')

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()
print(f"Using device: {device}\n")

# Load validation annotations
print("Loading VizWiz validation data...")
with open('data/annotations/val.json', 'r', encoding='utf-8') as f:
    val_data = json.load(f)

# Filter answerable questions only
answerable_data = [item for item in val_data if item['answerable'] == 1]
print(f"Found {len(answerable_data)} answerable questions\n")

# Draw the samples from a dedicated, seeded generator so the cell prints the
# same images on every run without touching the global `random` state.
num_samples = 5
sample_seed = 42
rng = random.Random(sample_seed)
# random.sample raises ValueError when asked for more items than exist.
samples = rng.sample(answerable_data, min(num_samples, len(answerable_data)))

print(f"Testing ViT on {len(samples)} random VizWiz images:\n")
print("="*80)

for idx, item in enumerate(samples, 1):
    image_path = Path('data/val') / item['image']

    # Check if image exists
    if not image_path.exists():
        print(f"Sample {idx}: Image not found - {image_path}")
        continue

    # Best-effort per sample: a failure on one image is reported and the loop
    # moves on to the next one.
    try:
        image = Image.open(image_path).convert('RGB')

        print(f"\nSample {idx}:")
        print(f"Image: {item['image']}")
        print(f"Question: {item['question']}")

        # Majority answer among annotators, ignoring 'unanswerable' votes.
        # Counter.most_common keeps first-seen order for ties, i.e. the same
        # tie-breaking as max() over an insertion-ordered dict.
        answers = [ans['answer'] for ans in item['answers'] if ans['answer'] != 'unanswerable']
        if answers:
            most_common, votes = Counter(answers).most_common(1)[0]
            print(f"Ground truth: {most_common} ({votes}/{len(item['answers'])} annotators)")

        # ViT prediction
        inputs = processor(images=image, return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            probs = torch.nn.functional.softmax(logits, dim=-1)[0]
            top3_prob, top3_idx = torch.topk(probs, 3)

        print("ViT Top 3 predictions:")
        # One .tolist() per tensor instead of one .item() call per element.
        for i, (class_idx, prob) in enumerate(zip(top3_idx.tolist(), top3_prob.tolist())):
            class_name = model.config.id2label[class_idx]
            print(f"  {i+1}. {class_name}: {prob*100:.2f}%")

        print("-"*80)

    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        continue

print("\nQuick test complete!")

Loading ViT model...
Using device: cuda

Loading VizWiz validation data...
Found 2934 answerable questions

Testing ViT on 5 random VizWiz images:


Sample 1:
Image: VizWiz_val_00001395.jpg
Question: What is on this shelf?
Ground truth: beer (4/10 annotators)
ViT Top 3 predictions:
  1. grocery store, grocery, food market, market: 83.68%
  2. shopping basket: 2.73%
  3. packet: 2.57%
--------------------------------------------------------------------------------

Sample 2:
Image: VizWiz_val_00003203.jpg
Question: What color is this shirt?
Ground truth: purple (5/10 annotators)
ViT Top 3 predictions:
  1. velvet: 76.11%
  2. wool, woolen, woollen: 1.16%
  3. letter opener, paper knife, paperknife: 0.80%
--------------------------------------------------------------------------------

Sample 3:
Image: VizWiz_val_00001087.jpg
Question: What is the computer screen showing?
Ground truth: restore (2/10 annotators)
ViT Top 3 predictions:
  1. web site, website, internet site, site: 86.77%
  