In [1]:
import torch
import clip
from PIL import Image
import requests

In [2]:
# 加载预训练的 CLIP 模型
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

In [3]:
# 定义一个简单的任务：比较文本和图片
text = clip.tokenize(["a photo of a dog", "a photo of a cat"]).to(device)

# 假设我们有两张图片：一张猫的，一张狗的
image_urls = [
    "https://www.alleycat.org/wp-content/uploads/2019/03/FELV-cat.jpg",  #
    "https://kb.rspca.org.au/wp-content/uploads/2018/11/golder-retriever-puppy.jpeg"   # 
]
images = [preprocess(Image.open(requests.get(url, stream=True).raw)).unsqueeze(0).to(device) for url in image_urls]

In [4]:
# 使用 CLIP 模型
with torch.no_grad():
    image_features = torch.cat([model.encode_image(image) for image in images])
    text_features = model.encode_text(text)

    # 计算图片和文本之间的相似度
    logits_per_image = (image_features @ text_features.T).softmax(dim=-1)
    probs = logits_per_image.cpu().numpy()

print("Text to image probabilities:")
for i, url in enumerate(image_urls):
    print(f"{url}: Cat: {probs[i][0]:.4f}, Dog: {probs[i][1]:.4f}")

Text to image probabilities:
https://www.alleycat.org/wp-content/uploads/2019/03/FELV-cat.jpg: Cat: 0.0024, Dog: 0.9976
https://kb.rspca.org.au/wp-content/uploads/2018/11/golder-retriever-puppy.jpeg: Cat: 1.0000, Dog: 0.0002


In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
# 定义一个简单的任务：比较文本和图片
text = clip.tokenize(["a photo without a dog", "a photo of a cat"]).to(device)
# 假设我们有两张图片：一张猫的，一张狗的
image_urls = [
    "https://www.alleycat.org/wp-content/uploads/2019/03/FELV-cat.jpg",  #  
    "https://kb.rspca.org.au/wp-content/uploads/2018/11/golder-retriever-puppy.jpeg"   #  
]
images = [preprocess(Image.open(requests.get(url, stream=True).raw)).unsqueeze(0).to(device) for url in image_urls]

with torch.no_grad():
    image_features = torch.cat([model.encode_image(image) for image in images])
    text_features = model.encode_text(text)

    # 计算图片和文本之间的相似度
    logits_per_image = (image_features @ text_features.T).softmax(dim=-1)
    probs = logits_per_image.cpu().numpy()

print("Text to image probabilities:")
for i, url in enumerate(image_urls):
    print(f"{url}: Cat: {probs[i][0]:.4f}, Dog: {probs[i][1]:.4f}")

Text to image probabilities:
https://www.alleycat.org/wp-content/uploads/2019/03/FELV-cat.jpg: Cat: 0.0002, Dog: 1.0000
https://kb.rspca.org.au/wp-content/uploads/2018/11/golder-retriever-puppy.jpeg: Cat: 0.9844, Dog: 0.0154


In [6]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
# 定义一个简单的任务：比较文本和图片
text = clip.tokenize(["a photo of no dog", "a photo of a cat"]).to(device)
# 假设我们有两张图片：一张猫的，一张狗的
image_urls = [
    "https://www.alleycat.org/wp-content/uploads/2019/03/FELV-cat.jpg",  #  
    "https://kb.rspca.org.au/wp-content/uploads/2018/11/golder-retriever-puppy.jpeg"   #  
]
images = [preprocess(Image.open(requests.get(url, stream=True).raw)).unsqueeze(0).to(device) for url in image_urls]

with torch.no_grad():
    image_features = torch.cat([model.encode_image(image) for image in images])
    text_features = model.encode_text(text)

    # 计算图片和文本之间的相似度
    logits_per_image = (image_features @ text_features.T).softmax(dim=-1)
    probs = logits_per_image.cpu().numpy()

print("Text to image probabilities:")
for i, url in enumerate(image_urls):
    print(f"{url}: Cat: {probs[i][0]:.4f}, Dog: {probs[i][1]:.4f}")

Text to image probabilities:
https://www.alleycat.org/wp-content/uploads/2019/03/FELV-cat.jpg: Cat: 0.0013, Dog: 0.9985
https://kb.rspca.org.au/wp-content/uploads/2018/11/golder-retriever-puppy.jpeg: Cat: 0.9990, Dog: 0.0010


In [7]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
# 定义一个简单的任务：比较文本和图片
text = clip.tokenize(["dog a of photo a", "a photo of a cat"]).to(device)
# 假设我们有两张图片：一张猫的，一张狗的
image_urls = [
    "https://www.alleycat.org/wp-content/uploads/2019/03/FELV-cat.jpg",  #  
    "https://kb.rspca.org.au/wp-content/uploads/2018/11/golder-retriever-puppy.jpeg"   #  
]
images = [preprocess(Image.open(requests.get(url, stream=True).raw)).unsqueeze(0).to(device) for url in image_urls]

with torch.no_grad():
    image_features = torch.cat([model.encode_image(image) for image in images])
    text_features = model.encode_text(text)

    # 计算图片和文本之间的相似度
    logits_per_image = (image_features @ text_features.T).softmax(dim=-1)
    probs = logits_per_image.cpu().numpy()

print("Text to image probabilities:")
for i, url in enumerate(image_urls):
    print(f"{url}: Cat: {probs[i][0]:.4f}, Dog: {probs[i][1]:.4f}")

Text to image probabilities:
https://www.alleycat.org/wp-content/uploads/2019/03/FELV-cat.jpg: Cat: 0.0016, Dog: 0.9985
https://kb.rspca.org.au/wp-content/uploads/2018/11/golder-retriever-puppy.jpeg: Cat: 1.0000, Dog: 0.0001


In [8]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
# 定义一个简单的任务：比较文本和图片
text = clip.tokenize(["a photo of a dog, but no cat", "a photo of a cat"]).to(device)
# 假设我们有两张图片：一张猫的，一张狗的
image_urls = [
    "https://www.alleycat.org/wp-content/uploads/2019/03/FELV-cat.jpg",  #  
    "https://kb.rspca.org.au/wp-content/uploads/2018/11/golder-retriever-puppy.jpeg"   #  
]
images = [preprocess(Image.open(requests.get(url, stream=True).raw)).unsqueeze(0).to(device) for url in image_urls]

with torch.no_grad():
    image_features = torch.cat([model.encode_image(image) for image in images])
    text_features = model.encode_text(text)

    # 计算图片和文本之间的相似度
    logits_per_image = (image_features @ text_features.T).softmax(dim=-1)
    probs = logits_per_image.cpu().numpy()

print("Text to image probabilities:")
for i, url in enumerate(image_urls):
    print(f"{url}: Cat: {probs[i][0]:.4f}, Dog: {probs[i][1]:.4f}")

Text to image probabilities:
https://www.alleycat.org/wp-content/uploads/2019/03/FELV-cat.jpg: Cat: 0.1348, Dog: 0.8652
https://kb.rspca.org.au/wp-content/uploads/2018/11/golder-retriever-puppy.jpeg: Cat: 0.9995, Dog: 0.0004


In [9]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
# 定义一个简单的任务：比较文本和图片
text = clip.tokenize(["a photo of a dog, but no cats", "a photo of a cat"]).to(device)
# 假设我们有两张图片：一张猫的，一张狗的
image_urls = [
    "https://www.alleycat.org/wp-content/uploads/2019/03/FELV-cat.jpg",  #  
    "https://kb.rspca.org.au/wp-content/uploads/2018/11/golder-retriever-puppy.jpeg"   #  
]
images = [preprocess(Image.open(requests.get(url, stream=True).raw)).unsqueeze(0).to(device) for url in image_urls]

with torch.no_grad():
    image_features = torch.cat([model.encode_image(image) for image in images])
    text_features = model.encode_text(text)

    # 计算图片和文本之间的相似度
    logits_per_image = (image_features @ text_features.T).softmax(dim=-1)
    probs = logits_per_image.cpu().numpy()

print("Text to image probabilities:")
for i, url in enumerate(image_urls):
    print(f"{url}: Cat: {probs[i][0]:.4f}, Dog: {probs[i][1]:.4f}")

Text to image probabilities:
https://www.alleycat.org/wp-content/uploads/2019/03/FELV-cat.jpg: Cat: 0.0706, Dog: 0.9292
https://kb.rspca.org.au/wp-content/uploads/2018/11/golder-retriever-puppy.jpeg: Cat: 0.9995, Dog: 0.0007


In [10]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
# 定义一个简单的任务：比较文本和图片
text = clip.tokenize(["a photo of dogs", "a photo of a cat"]).to(device)
# 假设我们有两张图片：一张猫的，一张狗的
image_urls = [
    "https://www.alleycat.org/wp-content/uploads/2019/03/FELV-cat.jpg",  #  
    "https://kb.rspca.org.au/wp-content/uploads/2018/11/golder-retriever-puppy.jpeg"   #  
]
images = [preprocess(Image.open(requests.get(url, stream=True).raw)).unsqueeze(0).to(device) for url in image_urls]

with torch.no_grad():
    image_features = torch.cat([model.encode_image(image) for image in images])
    text_features = model.encode_text(text)

    # 计算图片和文本之间的相似度
    logits_per_image = (image_features @ text_features.T).softmax(dim=-1)
    probs = logits_per_image.cpu().numpy()

print("Text to image probabilities:")
for i, url in enumerate(image_urls):
    print(f"{url}: Cat: {probs[i][0]:.4f}, Dog: {probs[i][1]:.4f}")

Text to image probabilities:
https://www.alleycat.org/wp-content/uploads/2019/03/FELV-cat.jpg: Cat: 0.0001, Dog: 1.0000
https://kb.rspca.org.au/wp-content/uploads/2018/11/golder-retriever-puppy.jpeg: Cat: 0.9946, Dog: 0.0053


In [11]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
# 定义一个简单的任务：比较文本和图片
text = clip.tokenize(["a photo of a dog, but no cats, no horses, no wolves", "a photo of a cat"]).to(device)
# 假设我们有两张图片：一张猫的，一张狗的
image_urls = [
    "https://www.alleycat.org/wp-content/uploads/2019/03/FELV-cat.jpg",  #  
    "https://kb.rspca.org.au/wp-content/uploads/2018/11/golder-retriever-puppy.jpeg"   #  
]
images = [preprocess(Image.open(requests.get(url, stream=True).raw)).unsqueeze(0).to(device) for url in image_urls]

with torch.no_grad():
    image_features = torch.cat([model.encode_image(image) for image in images])
    text_features = model.encode_text(text)

    # 计算图片和文本之间的相似度
    logits_per_image = (image_features @ text_features.T).softmax(dim=-1)
    probs = logits_per_image.cpu().numpy()

print("Text to image probabilities:")
for i, url in enumerate(image_urls):
    print(f"{url}: Cat: {probs[i][0]:.4f}, Dog: {probs[i][1]:.4f}")

Text to image probabilities:
https://www.alleycat.org/wp-content/uploads/2019/03/FELV-cat.jpg: Cat: 0.0331, Dog: 0.9668
https://kb.rspca.org.au/wp-content/uploads/2018/11/golder-retriever-puppy.jpeg: Cat: 0.9990, Dog: 0.0011


In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
# 定义一个简单的任务：比较文本和图片
text = clip.tokenize(["a photo of a dog, no cat, no horses, no wolves", "a photo of a cat"]).to(device)
# 假设我们有两张图片：一张猫的，一张狗的
image_urls = [
    "https://www.alleycat.org/wp-content/uploads/2019/03/FELV-cat.jpg",  #  
    "https://kb.rspca.org.au/wp-content/uploads/2018/11/golder-retriever-puppy.jpeg"   #  
]
images = [preprocess(Image.open(requests.get(url, stream=True).raw)).unsqueeze(0).to(device) for url in image_urls]

with torch.no_grad():
    image_features = torch.cat([model.encode_image(image) for image in images])
    text_features = model.encode_text(text)

    # 计算图片和文本之间的相似度
    logits_per_image = (image_features @ text_features.T).softmax(dim=-1)
    probs = logits_per_image.cpu().numpy()

print("Text to image probabilities:")
for i, url in enumerate(image_urls):
    print(f"{url}: Cat: {probs[i][0]:.4f}, Dog: {probs[i][1]:.4f}")

Text to image probabilities:
https://www.alleycat.org/wp-content/uploads/2019/03/FELV-cat.jpg: Cat: 0.0057, Dog: 0.9941
https://kb.rspca.org.au/wp-content/uploads/2018/11/golder-retriever-puppy.jpeg: Cat: 0.9980, Dog: 0.0021


In [13]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
# 定义一个简单的任务：比较文本和图片
text = clip.tokenize(["a photo of a dog, but no taixs, no trucks, no airplanes", "a photo of a cat"]).to(device)
# 假设我们有两张图片：一张猫的，一张狗的
image_urls = [
    "https://www.alleycat.org/wp-content/uploads/2019/03/FELV-cat.jpg",  #  
    "https://kb.rspca.org.au/wp-content/uploads/2018/11/golder-retriever-puppy.jpeg"   #  
]
images = [preprocess(Image.open(requests.get(url, stream=True).raw)).unsqueeze(0).to(device) for url in image_urls]

with torch.no_grad():
    image_features = torch.cat([model.encode_image(image) for image in images])
    text_features = model.encode_text(text)

    # 计算图片和文本之间的相似度
    logits_per_image = (image_features @ text_features.T).softmax(dim=-1)
    probs = logits_per_image.cpu().numpy()

print("Text to image probabilities:")
for i, url in enumerate(image_urls):
    print(f"{url}: Cat: {probs[i][0]:.4f}, Dog: {probs[i][1]:.4f}")

Text to image probabilities:
https://www.alleycat.org/wp-content/uploads/2019/03/FELV-cat.jpg: Cat: 0.0002, Dog: 1.0000
https://kb.rspca.org.au/wp-content/uploads/2018/11/golder-retriever-puppy.jpeg: Cat: 0.9966, Dog: 0.0034


In [14]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
# 定义一个简单的任务：比较文本和图片
text = clip.tokenize(["a photo of a dog, but no cars, no trucks, no airplanes", "a photo of a cat"]).to(device)
# 假设我们有两张图片：一张猫的，一张狗的
image_urls = [
    "https://www.alleycat.org/wp-content/uploads/2019/03/FELV-cat.jpg",  #  
    "https://kb.rspca.org.au/wp-content/uploads/2018/11/golder-retriever-puppy.jpeg"   #  
]
images = [preprocess(Image.open(requests.get(url, stream=True).raw)).unsqueeze(0).to(device) for url in image_urls]

with torch.no_grad():
    image_features = torch.cat([model.encode_image(image) for image in images])
    text_features = model.encode_text(text)

    # 计算图片和文本之间的相似度
    logits_per_image = (image_features @ text_features.T).softmax(dim=-1)
    probs = logits_per_image.cpu().numpy()

print("Text to image probabilities:")
for i, url in enumerate(image_urls):
    print(f"{url}: Cat: {probs[i][0]:.4f}, Dog: {probs[i][1]:.4f}")

Text to image probabilities:
https://www.alleycat.org/wp-content/uploads/2019/03/FELV-cat.jpg: Cat: 0.0000, Dog: 1.0000
https://kb.rspca.org.au/wp-content/uploads/2018/11/golder-retriever-puppy.jpeg: Cat: 0.8599, Dog: 0.1404


In [15]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
# 定义一个简单的任务：比较文本和图片
text = clip.tokenize(["a photo of a dog, no cats, no cats, no cats, no cats", "a photo of a cat"]).to(device)
# 假设我们有两张图片：一张猫的，一张狗的
image_urls = [
    "https://www.alleycat.org/wp-content/uploads/2019/03/FELV-cat.jpg",  #  
    "https://kb.rspca.org.au/wp-content/uploads/2018/11/golder-retriever-puppy.jpeg"   #  
]
images = [preprocess(Image.open(requests.get(url, stream=True).raw)).unsqueeze(0).to(device) for url in image_urls]

with torch.no_grad():
    image_features = torch.cat([model.encode_image(image) for image in images])
    text_features = model.encode_text(text)

    # 计算图片和文本之间的相似度
    logits_per_image = (image_features @ text_features.T).softmax(dim=-1)
    probs = logits_per_image.cpu().numpy()

print("Text to image probabilities:")
for i, url in enumerate(image_urls):
    print(f"{url}: Cat: {probs[i][0]:.4f}, Dog: {probs[i][1]:.4f}")

Text to image probabilities:
https://www.alleycat.org/wp-content/uploads/2019/03/FELV-cat.jpg: Cat: 0.0415, Dog: 0.9585
https://kb.rspca.org.au/wp-content/uploads/2018/11/golder-retriever-puppy.jpeg: Cat: 1.0000, Dog: 0.0001


In [16]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
# 定义一个简单的任务：比较文本和图片
text = clip.tokenize(["a photo of a dog, dog! dog! dog! But no cats, no cats, no cats, no cats", "a photo of a cat"]).to(device)
# 假设我们有两张图片：一张猫的，一张狗的
image_urls = [
    "https://www.alleycat.org/wp-content/uploads/2019/03/FELV-cat.jpg",  #  
    "https://kb.rspca.org.au/wp-content/uploads/2018/11/golder-retriever-puppy.jpeg"   #  
]
images = [preprocess(Image.open(requests.get(url, stream=True).raw)).unsqueeze(0).to(device) for url in image_urls]

with torch.no_grad():
    image_features = torch.cat([model.encode_image(image) for image in images])
    text_features = model.encode_text(text)

    # 计算图片和文本之间的相似度
    logits_per_image = (image_features @ text_features.T).softmax(dim=-1)
    probs = logits_per_image.cpu().numpy()

print("Text to image probabilities:")
for i, url in enumerate(image_urls):
    print(f"{url}: Cat: {probs[i][0]:.4f}, Dog: {probs[i][1]:.4f}")

Text to image probabilities:
https://www.alleycat.org/wp-content/uploads/2019/03/FELV-cat.jpg: Cat: 0.0210, Dog: 0.9790
https://kb.rspca.org.au/wp-content/uploads/2018/11/golder-retriever-puppy.jpeg: Cat: 1.0000, Dog: 0.0001
