In [2]:
import torch
from PIL import Image
from transformers import ViTFeatureExtractor, ViTForImageClassification
import warnings
warnings.filterwarnings('ignore')

# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the pre-trained Vision Transformer model and its matching feature extractor
model_name = "google/vit-base-patch16-224"
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)
model = ViTForImageClassification.from_pretrained(model_name)

# Move the model to the selected device (GPU or CPU)
model = model.to(device)

# Path to the image
image_path = "../data/01.jpeg"

# Decode the image inside a context manager so the file handle is released
# promptly, and normalise to RGB so grayscale / RGBA / palette files still
# produce a 3-channel input.
with Image.open(image_path) as image_file:
    image = image_file.convert("RGB")

# Preprocess the image into tensors and move them to the same device as the model
inputs = feature_extractor(images=image, return_tensors="pt").to(device)

# Run the forward pass without autograd bookkeeping; this is inference only
with torch.no_grad():
    outputs = model(**inputs)

# Move the logits back to the CPU to process the output
logits = outputs.logits.cpu()

# Index of the highest-scoring class, mapped to its label via the model config
predicted_class_idx = logits.argmax(-1).item()
predicted_label = model.config.id2label[predicted_class_idx]

# Labels may be comma-separated lists of synonyms; keep only the first entry
food_name = predicted_label.split(',')[0]

# Print the food name
print(food_name)


Using device: cuda
Granny Smith


In [5]:
import torch
from PIL import Image
from transformers import ViTFeatureExtractor, ViTForImageClassification
import warnings
warnings.filterwarnings('ignore')

# Prefer the GPU when PyTorch reports CUDA support; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# One-time setup: build the extractor and the classifier for a checkpoint
def load_model_and_extractor(model_name="google/vit-base-patch16-224"):
    """Load a ViT image classifier and its feature extractor.

    Args:
        model_name: Checkpoint identifier passed to both ``from_pretrained``
            calls.

    Returns:
        tuple: ``(model, feature_extractor)``, with the model already moved
        to the module-level ``device``.
    """
    extractor = ViTFeatureExtractor.from_pretrained(model_name)
    # Instantiate the classifier and place it on the selected device in one step
    classifier = ViTForImageClassification.from_pretrained(model_name).to(device)
    return classifier, extractor

# Prediction function
def predict_image(image_path, model, feature_extractor):
    """Classify a single image file and return the short form of its top label.

    Args:
        image_path: Location of the image, in any form ``PIL.Image.open``
            accepts.
        model: Classification model. It is called with the extractor's
            tensors and must expose ``.logits`` on its output, a ``.device``
            attribute, and ``config.id2label``.
        feature_extractor: Callable that converts a PIL image into model
            inputs when invoked with ``images=...`` and
            ``return_tensors="pt"``.

    Returns:
        str: The text before the first comma of the predicted label.
    """
    # Decode inside a context manager so the file handle is released promptly,
    # and normalise to RGB so grayscale / RGBA / palette files still produce
    # a 3-channel input.
    with Image.open(image_path) as image_file:
        image = image_file.convert("RGB")

    # Place the inputs on whichever device the supplied model lives on, so the
    # function does not depend on a module-level device variable.
    inputs = feature_extractor(images=image, return_tensors="pt").to(model.device)

    # Inference only: skip autograd bookkeeping
    with torch.no_grad():
        outputs = model(**inputs)

    # Move the logits back to the CPU for processing
    logits = outputs.logits.cpu()

    # Index of the highest-scoring class, mapped to its label via the model config
    predicted_class_idx = logits.argmax(-1).item()
    predicted_label = model.config.id2label[predicted_class_idx]

    # Labels may be comma-separated lists of synonyms; keep only the first entry
    food_name = predicted_label.split(',')[0]

    return food_name

# One-time setup: the returned pair is reused for every prediction below
model, feature_extractor = load_model_and_extractor()

# Classify a sample image; further calls can reuse the same model and extractor
image_path = "../data/01.jpeg"
food_name = predict_image(
    image_path=image_path,
    model=model,
    feature_extractor=feature_extractor,
)

# Show the predicted name
print(food_name)


Using device: cuda
Granny Smith
