In [21]:
import torch
import clip
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
from PIL import Image
import os
import json

In [2]:
import cv2
import torch
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)


In [26]:
import os
print("Current Working Directory:", os.getcwd())


Current Working Directory: d:\Projects\Visual AI


In [19]:
class CustomDataset(Dataset):
    def __init__(self, image_dir, annotation_file, preprocess):
        """
        Args:
            image_dir (str): Path to the directory containing images.
            annotation_file (str): Path to the JSON file with image-caption mappings.
            preprocess (callable): Preprocessing function for images.
        """
        self.image_dir = image_dir
        self.annotations = json.load(open(annotation_file, 'r'))
        self.preprocess = preprocess

    def __len__(self):
        return len(self.annotations['images'])

    def __getitem__(self, idx):
        img_info = self.annotations['images'][idx]
        img_path = os.path.join(self.image_dir, img_info['file_name'])
        caption = img_info['caption']

        # Load and preprocess the image
        image = self.preprocess(Image.open(img_path).convert("RGB"))

        # Tokenize the caption using CLIP's tokenizer
        text = clip.tokenize([caption])[0]

        return image, text

In [40]:
image_dir = "d:/Projects/Visual AI/Train/images"
if not os.path.exists(image_dir):
    print(f"File not found: {image_dir}")

annotation_file = "d:/Projects/Visual AI/Train/captions.json" 

In [41]:
dataset = CustomDataset(image_dir=image_dir, annotation_file=annotation_file, preprocess=preprocess)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

In [42]:
for images, texts in dataloader:
    print("Image batch shape:", images.shape)
    print("Text batch shape:", texts.shape)
    break  # Just to show one batch

Image batch shape: torch.Size([2, 3, 224, 224])
Text batch shape: torch.Size([2, 77])


In [43]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

In [46]:
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for images, texts in dataloader:
        images, texts = images.to(device), texts.to(device)

        # Forward pass
        logits_per_image, logits_per_text = model(images, texts)
        ground_truth = torch.arange(len(images), dtype=torch.long, device=device)
        
        # Compute loss
        loss = (loss_fn(logits_per_image, ground_truth) + loss_fn(logits_per_text, ground_truth)) / 2

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(dataloader):.4f}")

# Save the fine-tuned model
torch.save(model.state_dict(), "fine_tuned_clip_vit_b32.pth")

Epoch [1/10], Loss: 1.3367
Epoch [2/10], Loss: 0.7306
Epoch [3/10], Loss: 0.9484
Epoch [4/10], Loss: 0.6948
Epoch [5/10], Loss: 0.6948
Epoch [6/10], Loss: 0.6936
Epoch [7/10], Loss: 0.6998
Epoch [8/10], Loss: 0.6942
Epoch [9/10], Loss: 0.6957
Epoch [10/10], Loss: 0.6954


In [45]:
model.load_state_dict(torch.load("fine_tuned_clip_vit_b32.pth"))
model.eval()

text_prompts = ["a person hitting", "a person standing"]
text_tokens = clip.tokenize(text_prompts).to(device)

# Start webcam capture
cap = cv2.VideoCapture(0)

while True:
    ret, frame = cap.read()
    if not ret:
        break

    # Preprocess the frame for CLIP model input
    pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    image_input = preprocess(pil_image).unsqueeze(0).to(device)

    with torch.no_grad():
        # Encode image and text features
        image_features = model.encode_image(image_input)
        text_features = model.encode_text(text_tokens)

        # Compute similarity scores and probabilities
        logits_per_image = (image_features @ text_features.T).softmax(dim=-1)
    
    probs = logits_per_image.cpu().numpy()[0]

    # Display predictions on the video feed
    for i, prompt in enumerate(text_prompts):
        cv2.putText(frame, f"{prompt}: {probs[i]:.2f}", (10, 30 + i * 30), 
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
    
    cv2.imshow("CLIP Webcam", frame)
    
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()
