In [1]:
!pip install torch torchvision
!pip install git+https://github.com/openai/CLIP.git


Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-ypii0hsy
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-ypii0hsy
  Resolved https://github.com/openai/CLIP.git to commit a1d071733d7111c9c014f024669f959182114e33
  Preparing metadata (setup.py) ... [?25ldone


In [10]:
import os
import csv
import clip
from PIL import Image
import torch

def load_clip_model():
    # Load the CLIP model from OpenAI
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, preprocess = clip.load("ViT-B/32", device=device)
    return model, preprocess, device

def predict_choice(model, preprocess, device, image_path, text_descriptions):
    # Preprocess the image and tokenize the text descriptions
    image = preprocess(Image.open(image_path)).unsqueeze(0).to(device)
    text_tokens = clip.tokenize(text_descriptions).to(device)

    with torch.no_grad():
        # Encode image and text with the CLIP model
        image_features = model.encode_image(image)
        text_features = model.encode_text(text_tokens)

        # Calculate the similarity scores and convert to probabilities
        logits_per_image = (image_features @ text_features.T).softmax(dim=-1)
        probs = logits_per_image.cpu().numpy()

    return probs[0]




def main():
    model, preprocess, device = load_clip_model()
    base_dir = "past_data/clip_dataset"
    images_dir = os.path.join(base_dir, "images")
    csv_file_path = os.path.join(base_dir, "dataset.csv")

    with open(csv_file_path, 'r', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        # 結果を保存するファイルを開く
    results_file_path = os.path.join(base_dir, "clip_results.txt")
    with open(results_file_path, 'w', newline='', encoding='utf-8') as results_file:    
        for row in reader:
            image_path = os.path.join(images_dir, row["image_filename"])
            text_description = row["text_description"]
            choices = row["button_texts"].split('|')

            # 'many' が text_description に含まれているかをチェック
            if "many" in text_description:
                parts = text_description.split(" ")
                try:
                    items_index = parts.index("many") + 1
                    items_phrase = " ".join(parts[items_index:-2]) 
                except ValueError:
                    # 'many' の後に 'are there' が見つからない場合はスキップ
                    print(f"Error in text description for image: {row['image_filename']}")
                    continue

                # 各選択肢に対する説明文を生成
                text_descriptions = [f"There are {choice} {items_phrase}." for choice in choices]
            else:
                # 'many' がなければ、デフォルトのフレーズを使用
                text_descriptions = [f"There are {choice} items." for choice in choices]

            # CLIPモデルによる予測
            probs = predict_choice(model, preprocess, device, image_path, text_descriptions)
            best_choice_index = probs.argmax()
            selected_choice = choices[best_choice_index] if choices[best_choice_index] != ' ' else 'No valid choice'
            print(f"Image: {row['image_filename']}, Best choice: {selected_choice}, Probability: {probs[best_choice_index]:.4f}")
            

            results_file.write(f"{selected_choice}\n")

            
if __name__ == "__main__":
    main()


ValueError: I/O operation on closed file.

In [13]:
import os
import csv
import clip
from PIL import Image
import torch

def load_clip_model():
    # Load the CLIP model from OpenAI
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, preprocess = clip.load("ViT-B/32", device=device)
    return model, preprocess, device

def predict_choice(model, preprocess, device, image_path, text_descriptions):
    # Preprocess the image and tokenize the text descriptions
    image = preprocess(Image.open(image_path)).unsqueeze(0).to(device)
    text_tokens = clip.tokenize(text_descriptions).to(device)

    with torch.no_grad():
        # Encode image and text with the CLIP model
        image_features = model.encode_image(image)
        text_features = model.encode_text(text_tokens)

        # Calculate the similarity scores and convert to probabilities
        logits_per_image = (image_features @ text_features.T).softmax(dim=-1)
        probs = logits_per_image.cpu().numpy()

    return probs[0]


def main():
    model, preprocess, device = load_clip_model()
    base_dir = "past_data/clip_dataset"
    images_dir = os.path.join(base_dir, "images")
    csv_file_path = os.path.join(base_dir, "dataset.csv")
    results_file_path = os.path.join(base_dir, "clip_results.txt")

    with open(csv_file_path, 'r', newline='', encoding='utf-8') as file, open(results_file_path, 'w', newline='', encoding='utf-8') as results_file:
        reader = csv.DictReader(file)
        for row in reader:
            image_path = os.path.join(images_dir, row["image_filename"])
            text_description = row["text_description"]
            choices = row["button_texts"].split('|')

            # 'many' が text_description に含まれているかをチェック
            if "many" in text_description:
                parts = text_description.split(" ")
                try:
                    items_index = parts.index("many") + 1
                    items_phrase = " ".join(parts[items_index:-2]) 
                except ValueError:
                    # 'many' の後に 'are there' が見つからない場合はスキップ
                    print(f"Error in text description for image: {row['image_filename']}")
                    continue

                # 各選択肢に対する説明文を生成
                text_descriptions = [f"There are {choice} {items_phrase}." for choice in choices]
            else:
                # 'many' がなければ、デフォルトのフレーズを使用
                text_descriptions = [f"There are {choice} items." for choice in choices]
                
            # CLIPモデルによる予測と結果の書き出し
           
            probs = predict_choice(model, preprocess, device, image_path, text_descriptions)
            best_choice_index = probs.argmax()
            selected_choice = choices[best_choice_index] if choices[best_choice_index] != ' ' else 'No valid choice'
            print(f"Image: {row['image_filename']}, Best choice: {selected_choice}, Probability: {probs[best_choice_index]:.4f}")
            
            results_file.write(f"{selected_choice}\n")

if __name__ == "__main__":
    main()


Image: image_0001.png, Best choice: 3, Probability: 0.3972
Image: image_0002.png, Best choice: 0, Probability: 0.2296
Image: image_0003.png, Best choice: 0, Probability: 0.2129
Image: image_0004.png, Best choice: No valid choice, Probability: 0.2367
Image: image_0005.png, Best choice: 3, Probability: 0.2705
Image: image_0006.png, Best choice: 2, Probability: 0.4575
Image: image_0007.png, Best choice: 2, Probability: 0.3918
Image: image_0008.png, Best choice: 2, Probability: 0.4099
Image: image_0009.png, Best choice: 1, Probability: 0.2292
Image: image_0010.png, Best choice: 2, Probability: 0.7500
Image: image_0011.png, Best choice: 1, Probability: 0.2500
Image: image_0012.png, Best choice: 3, Probability: 0.3281
Image: image_0013.png, Best choice: 0, Probability: 0.1383
Image: image_0014.png, Best choice: 3, Probability: 0.3506
Image: image_0015.png, Best choice: 3, Probability: 0.3540
Image: image_0016.png, Best choice: No valid choice, Probability: 0.2064
Image: image_0017.png, Best 