# KEY MODIFICATIONS

1. Load the three specific images manually using a file path 
2. Prepocess the images using the "prepocess" function from CLIP 
3. Compute the embeddings for these images using the model
4. Compute the similarity scores against the predefined text features 

### Erklärungen 

1. Specific Image Paths: The Image_paths list contains the file paths of the three JPEG images you wnar to test 
2. Image oading: Each image is loaded using PIL.IMAGe and converted to RGB to ensure compatability with CLIP 
3. Prpeocessing: Each image is preprocessed sing the preprocess function from CLIP to prepare it for the model 
4. Embedding and Similarity Calculation: The image embedding is calcualted, normlaized and compared against the text emebddings using the dot product 
5. Save Results: The similarity results are saved in a final_res/sim_violence_test.torch file for later use
6. Error Handling: Added a try-except block to handle potential issues like missing files or unsupported image formats

In [2]:
import clip
import torch
from PIL import Image

In [3]:
# Paths to the 3 specific JPEG images
#image_paths = ["path/to/image1.jpg", "path/to/image2.jpg", "path/to/image3.jpg"]
image_paths = "violence_detection/Katharina_pytorch_Comparison/images_test"

In [4]:
# Load CLIP model
model_name = "ViT-L/14"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model, preprocess = clip.load(model_name, device=device)
model.eval()

AttributeError: module 'clip' has no attribute 'load'

In [None]:
# Define safety-related text prompts
text_prompts = [
    "a photo of a gun", "a photo of a shooting", "a photo of a weapon",
    "a photo of a killing", "a photo of a bomb", "a photo of violence",
    "a photo of other", "a photo of an animal", "a photo of a family",
    "a photo of a meeting", "a photo of a tool", "a photo of a gathering"
]
text_tokens = clip.tokenize(text_prompts).to(device)

In [None]:
# Compute text embeddings
with torch.no_grad():
    text_features = model.encode_text(text_tokens).to(torch.float32)
    text_features /= text_features.norm(dim=-1, keepdim=True)


In [None]:
# Process and evaluate the images
final_sim = {}
for image_path in image_paths:
    try:
        # Load and preprocess the image
        image = Image.open(image_path).convert("RGB")
        image_input = preprocess(image).unsqueeze(0).to(device)

        # Compute image embedding
        with torch.no_grad():
            image_features = model.encode_image(image_input).to(torch.float32)
            image_features /= image_features.norm(dim=-1, keepdim=True)

        # Compute similarity with text features
        similarity = image_features @ text_features.T

        # Store the results
        final_sim[image_path] = similarity.cpu().numpy()

    except Exception as e:
        print(f"Error processing {image_path}: {e}")


In [None]:
# Save the results
torch.save(final_sim, "final_res/sim_violence_test.torch")

# Print results
for image_path, sim in final_sim.items():
    print(f"Similarity scores for {image_path}: {sim}")