<a href="https://colab.research.google.com/github/SabeehAnsari/Pytorch-Image-classifier/blob/main/Multi_modal_Homework_1820232042.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required libraries
!pip install torch torchvision torchaudio transformers

In [None]:
# Import necessary libraries
from transformers import CLIPProcessor, CLIPModel
import torch
from PIL import Image
import os

# Use the GPU when PyTorch reports that CUDA is available; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Load the CLIP processor and model for the same Hugging Face checkpoint;
# the model is moved to the device selected earlier.
clip_checkpoint = "openai/clip-vit-base-patch16"
processor = CLIPProcessor.from_pretrained(clip_checkpoint)
model = CLIPModel.from_pretrained(clip_checkpoint).to(device)

In [None]:
# Re-create the processor for the same checkpoint, this time passing use_fast=True.
# This rebinds the `processor` name used by the cells that follow.
processor = CLIPProcessor.from_pretrained(
    "openai/clip-vit-base-patch16",
    use_fast=True,
)

In [None]:
# Function to read text from the corresponding text file
def read_text_file(text_file_path):
    """Return the contents of a text file with surrounding whitespace removed.

    Args:
        text_file_path: Path of the text file to read.

    Returns:
        str: The whole file content, stripped of leading/trailing whitespace.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(text_file_path, "r", encoding="utf-8") as file:
        return file.read().strip()

In [None]:
# Function to process images and text and get similarity
def process_image_and_text(image_path, text_description):
    """Score how well one or more text descriptions match an image with CLIP.

    Relies on the notebook-level ``processor``, ``model`` and ``device``.

    Args:
        image_path: Path of the image file to open with PIL.
        text_description: A description string, or a list of candidate
            descriptions; it is handed to the CLIP processor unchanged.

    Returns:
        torch.Tensor: ``logits_per_image`` after a softmax over dim 1 (the
        description axis). With a single description the softmax runs over
        one logit, so the result is always 1.0.
    """
    # Open inside a context manager so the file handle is released, and
    # normalise to RGB so grayscale/RGBA/palette images are handled alike.
    with Image.open(image_path) as img:
        image = img.convert("RGB")

    # Prepare the image and text for CLIP. truncation=True clips descriptions
    # that exceed the tokenizer's maximum length instead of failing in the model.
    inputs = processor(
        text=text_description,
        images=image,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )

    # Move inputs to the correct device (GPU or CPU)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Inference only: no gradients are needed
    with torch.no_grad():
        outputs = model(**inputs)

    # Image-text similarity scores, normalised across the candidate descriptions
    logits_per_image = outputs.logits_per_image
    probs = logits_per_image.softmax(dim=1)

    return probs

In [19]:
# Directory holding both the images and their description files (adjust this if needed)
image_folder = "/content/Images/"

# Each subject has a "<name>.jpg" image and a "<name>.txt" description,
# so both file lists are derived from one list of subject names.
subject_names = (
    "Airplane",
    "Architecture",
    "Bird",
    "Butterfly",
    "Car",
    "Cat",
    "Child",
    "Eagle",
    "Flag",
    "Ship",
)

image_paths = [f"{name}.jpg" for name in subject_names]
text_paths = [f"{name}.txt" for name in subject_names]


In [20]:
# Process each image and generate the best matching description
# Load every candidate description once. Scoring an image against only its own
# description is uninformative: the softmax over a single logit is always 1.0,
# so that description would "win" regardless of its content.
descriptions = [
    read_text_file(os.path.join(image_folder, text_file))
    for text_file in text_paths
]

for image_file in image_paths:
    # Full path to the image
    image_path = os.path.join(image_folder, image_file)

    # Probabilities of this image against all candidate descriptions
    probs = process_image_and_text(image_path, descriptions)

    # Index of the description with the highest similarity to this image
    best_description_index = torch.argmax(probs).item()
    best_description = descriptions[best_description_index]

    print(f"Best description for {image_file}: {best_description}\n")

Best description for Airplane.jpg: A large commercial airplane on a runway, with its engines roaring and wheels touching the ground, preparing for takeoff under a clear blue sky.

Best description for Architecture.jpg: A view of the Colosseum in Rome, Italy, showcasing its ancient Roman architecture with arched windows and stonework, standing proudly against the grey sky.

Best description for Bird.jpg: A brightly colored bird with a red head and yellow body perched on a wooden branch, looking towards the camera with its sharp beak slightly open.

Best description for Butterfly.jpg: A beautiful butterfly perched delicately on a pink flower, its wings open wide, displaying a bright pattern of orange and black with white spots.

Best description for Car.jpg: A silver car captured in motion on a street, with motion blur emphasizing its speed as it moves past the camera.

Best description for Cat.jpg: A white cat standing on a wooden ledge, staring curiously ahead with its tail curved, giv

In [24]:
import os, glob, torch, pandas as pd
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# Choose the compute device, then load the CLIP processor and model for one checkpoint
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

clip_name = "openai/clip-vit-base-patch16"
processor = CLIPProcessor.from_pretrained(clip_name)
model = CLIPModel.from_pretrained(clip_name).to(device)

folder = "/content/Images"   # adjust if needed

# Collect .jpg/.png files with one case-insensitive extension check. Separate
# "*.jpg" and "*.JPG" globs list every file twice on case-insensitive
# filesystems and miss mixed-case names such as "photo.Jpg".
image_extensions = {".jpg", ".png"}
image_glob = sorted(
    path
    for path in glob.glob(os.path.join(folder, "*"))
    if os.path.splitext(path)[1].lower() in image_extensions
)

# One result record per scored image; filled in by the loop below
rows = []

print("Found images:", image_glob)

# For every image, score the candidate lines of its same-named .txt file with
# CLIP and record the best-matching line.
for img_path in image_glob:
    base = os.path.splitext(os.path.basename(img_path))[0]
    txt_path = os.path.join(folder, f"{base}.txt")

    if not os.path.exists(txt_path):
        print(f"⚠️ No txt file for {base} → skipping")
        continue

    # Each non-blank line of the text file is one candidate description
    with open(txt_path, "r", encoding="utf-8") as f:
        lines = [ln.strip() for ln in f if ln.strip()]
    if not lines:
        print(f"⚠️ Empty txt file for {base} → skipping")
        continue

    # Open inside a context manager so the file handle is released;
    # convert() returns a loaded copy that stays valid after the file closes.
    with Image.open(img_path) as img:
        image = img.convert("RGB")

    # truncation=True clips candidate lines that exceed the tokenizer's
    # maximum length instead of failing inside the model.
    inputs = processor(
        text=lines,
        images=image,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        out = model(**inputs)
        logits = out.logits_per_image
        # Softmax across the candidate lines. NOTE: with a single candidate
        # this is trivially 1.0 and says nothing about match quality.
        probs = logits.softmax(dim=1).squeeze(0).cpu().tolist()

    best_idx = int(torch.argmax(logits, dim=1).item())
    best_text = lines[best_idx]
    best_prob = float(probs[best_idx])

    rows.append({
        "image": os.path.basename(img_path),
        "num_candidates": len(lines),
        "best_text": best_text,
        "best_prob": best_prob
    })

# Build, save and show the results table only when at least one image was scored
if not rows:
    print("\n⚠️ No results — check file paths and names!")
else:
    csv_path = os.path.join(folder, "clip_results.csv")
    results_table = pd.DataFrame(rows)
    df = results_table.sort_values("image")
    df.to_csv(csv_path, index=False)
    print(f"\n✅ Saved results → {csv_path}")
    display(df)

Found images: ['/content/Images/Airplane.jpg', '/content/Images/Architecture.jpg', '/content/Images/Bird.jpg', '/content/Images/Butterfly.jpg', '/content/Images/Car.jpg', '/content/Images/Cat.jpg', '/content/Images/Child.jpg', '/content/Images/Eagle.jpg', '/content/Images/Flag.jpg', '/content/Images/Ship.jpg']

✅ Saved results → /content/Images/clip_results.csv


Unnamed: 0,image,num_candidates,best_text,best_prob
0,Airplane.jpg,1,"A large commercial airplane on a runway, with ...",1.0
1,Architecture.jpg,1,"A view of the Colosseum in Rome, Italy, showca...",1.0
2,Bird.jpg,1,A brightly colored bird with a red head and ye...,1.0
3,Butterfly.jpg,1,A beautiful butterfly perched delicately on a ...,1.0
4,Car.jpg,1,"A silver car captured in motion on a street, w...",1.0
5,Cat.jpg,1,"A white cat standing on a wooden ledge, starin...",1.0
6,Child.jpg,1,A young child in a blue shirt and brown shorts...,1.0
7,Eagle.jpg,1,"A close-up of a golden eagle, showcasing its s...",1.0
8,Flag.jpg,1,"A vibrant red flag of China, fluttering in the...",1.0
9,Ship.jpg,1,"A close-up of a large, illuminated ship docked...",1.0
