In [None]:
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch
from PIL import Image
from gtts import gTTS
import cv2  
import os

# Directory holding the locally saved captioning checkpoint. The default is the
# original hard-coded location; set the SNAPSENSE_MODEL_DIR environment
# variable to point at a different copy without editing this file.
MODEL_DIR = os.environ.get(
    'SNAPSENSE_MODEL_DIR',
    'C:\\Users\\saisu\\Documents\\SNAPSENSE\\vit-gpt2-image-captioning',
)

# The model, the image preprocessor and the tokenizer are all loaded from the
# same directory.
model = VisionEncoderDecoderModel.from_pretrained(MODEL_DIR)
feature_extractor = ViTImageProcessor.from_pretrained(MODEL_DIR)
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Keyword arguments forwarded to model.generate() when captioning.
max_length = 32
num_beams = 8
gen_kwargs = {'max_length': max_length, 'num_beams': num_beams}

def capture_image(filename='captured_image.jpg'):
    """Show a live camera preview and save a single frame on request.

    Frames from camera index 0 are displayed in a window until the user
    presses 's' (save the current frame to ``filename``) or 'q' (quit
    without saving).

    Args:
        filename: Path the captured frame is written to.

    Returns:
        ``filename`` if a frame was written successfully, otherwise ``None``
        (camera unavailable, frame read failed, write failed, or the user
        quit).
    """
    # Open the default camera (camera index 0)
    cap = cv2.VideoCapture(0)

    if not cap.isOpened():
        print("Error: Could not access the camera.")
        return None

    # Only set once a frame has actually been written, so a failed read or a
    # failed write is never reported to the caller as a usable image path.
    saved_path = None
    try:
        print("Press 's' to capture an image or 'q' to quit.")

        while True:
            ret, frame = cap.read()
            if not ret:
                print("Error: Failed to capture an image.")
                break

            cv2.imshow("Camera Feed", frame)

            key = cv2.waitKey(1) & 0xFF
            if key == ord('s'):
                # imwrite reports failure through its return value.
                if cv2.imwrite(filename, frame):
                    print(f"Image saved as {filename}")
                    saved_path = filename
                else:
                    print(f"Error: Could not write image to {filename}.")
                break
            if key == ord('q'):
                print("Exiting without capturing.")
                break
    finally:
        # Cleanup only: returning from a finally block would discard any
        # in-flight exception, so the return lives after the try statement.
        cap.release()
        cv2.destroyAllWindows()

    return saved_path


def predict_step(image):
    """Return the generated caption for a single image.

    The image is preprocessed with the module-level ``feature_extractor``,
    moved to ``device``, run through ``model.generate`` with ``gen_kwargs``,
    and decoded with ``tokenizer``.

    Args:
        image: One image in a form ``feature_extractor`` accepts (the main
            loop passes a PIL image).

    Returns:
        The first decoded caption with surrounding whitespace stripped.
    """
    encoded = feature_extractor(images=[image], return_tensors='pt')
    batch = encoded.pixel_values.to(device)
    generated_ids = model.generate(batch, **gen_kwargs)
    captions = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    # A single image goes in, so only the first decoded sequence is used.
    return captions[0].strip()

def play_audio(audio_path):
    """Open an audio file with the platform's default application.

    The path is handed to the launcher as a single argument rather than being
    interpolated into a shell command, so paths containing spaces or shell
    metacharacters are handled correctly.

    Args:
        audio_path: Path of the audio file to play.

    Returns:
        None. Launch failures are reported on stdout instead of raised, so a
        playback problem does not abort the caller.
    """
    import subprocess
    import sys

    try:
        if os.name == 'nt':
            # Same effect as the shell's ``start`` command, without a shell.
            os.startfile(audio_path)
        elif sys.platform == 'darwin':
            subprocess.run(['open', audio_path], check=False)
        else:
            subprocess.run(['xdg-open', audio_path], check=False)
    except OSError as e:
        # Covers a missing launcher binary as well as a missing file
        # association on Windows.
        print(f"Error playing audio: {e}")


print("Press Enter to capture an image and predict the caption. Type 'exit' to stop.")

# Interactive loop: capture a frame, caption it, speak the caption, repeat.
# Any input other than 'exit' (including just pressing Enter) starts a capture.
while True:
    user_input = input("Type 'Enter' to capture an image or 'exit' to quit: ").strip()

    if user_input.lower() == 'exit':
        print("Exiting...")
        break

    image_path = capture_image()
    if not image_path:
        continue

    try:
        # Image.open is lazy; convert() forces the decode to happen here, so
        # a corrupt file is reported by this handler instead of surfacing
        # later inside predict_step. The context manager closes the file.
        with Image.open(image_path) as image_file:
            image = image_file.convert('RGB')
    except Exception as e:
        print(f"Error loading image: {e}")
        continue

    caption = predict_step(image=image)
    print(f"Predicted Caption: {caption}")

    # gTTS refuses empty text, so skip speech when nothing was generated.
    if not caption:
        print("No caption generated; skipping audio.")
        continue

    audio_path = 'caption_audio.mp3'
    try:
        tts = gTTS(text=caption, lang='en')
        tts.save(audio_path)
    except Exception as e:
        # Keep the session alive if speech synthesis or the file write fails.
        print(f"Error generating audio: {e}")
        continue

    play_audio(audio_path)

print("Program terminated.")
