In [None]:
# import libraries
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image
import numpy as np
import os
import random
import pandas as pd

# Flickr30k entities utils
The code below is adapted from https://github.com/BryanPlummer/flickr30k_entities/blob/master/flickr30k_entities_utils.py

In [None]:
def get_sentence_data(fn):
    """
    Read a Flickr30K Entities sentence file and split every caption into
    its plain words and its annotated phrases.

    An annotated phrase is written as "[/EN#<id>/<type>[/<type>...] w1 w2]":
    the opening token carries the id and the coarse types, the following
    tokens up to the one ending in ']' are the words of the phrase.

    input:
      fn - full file path to the sentence file to parse

    output:
      a list with one dictionary per non-empty line of the file:
          sentence - the caption with the bracket markup removed
          phrases - a list with one dictionary per closed phrase:
                      first_word_index - position of the first word of the
                                         phrase in the cleaned sentence
                      phrase - the text of the annotated phrase
                      phrase_id - the id that follows "EN#" (a string)
                      phrase_type - list of the coarse categories of the
                                    phrase
    """
    with open(fn, 'r') as handle:
        raw_lines = handle.read().split('\n')

    parsed = []
    for line in raw_lines:
        if not line:
            continue

        plain_words = []
        phrase_records = []
        # (first_word_index, phrase_id, phrase_type) of the phrase that is
        # currently open, or None while outside any bracket
        pending = None
        phrase_tokens = []

        for token in line.split():
            if pending is None:
                if token[0] != '[':
                    plain_words.append(token)
                    continue

                # opening token: it only carries metadata, never a word
                start = len(plain_words)
                parts = token.split('/')
                pending = (start, parts[1][3:], parts[2:])
                continue

            closes_phrase = token[-1] == ']'
            if closes_phrase:
                token = token[:-1]

            phrase_tokens.append(token)
            plain_words.append(token)

            if closes_phrase:
                index, p_id, p_type = pending
                phrase_records.append({'first_word_index' : index,
                                       'phrase' : ' '.join(phrase_tokens),
                                       'phrase_id' : p_id,
                                       'phrase_type' : p_type})
                pending = None
                phrase_tokens = []

        # a phrase that is still open at the end of the line is dropped,
        # although its words stay part of the sentence
        parsed.append({'sentence' : ' '.join(plain_words),
                       'phrases' : phrase_records})

    return parsed

def get_annotations(fn):
    """
    Read one Flickr30K Entities xml annotation file.

    input:
      fn - full file path to the annotations file to parse

    output:
      dictionary with the following fields:
          boxes - maps an identifier to the list of its boxes, each in
                  [xmin ymin xmax ymax] format with 1 subtracted from
                  every coordinate found in the file
          scene - identifiers of box-less objects whose <scene> value
                  is positive
          nobox - identifiers of box-less objects whose <nobndbox> value
                  is positive
          plus one integer field per child of the <size> element, keyed
          by the child's tag name
    """
    root = ET.parse(fn).getroot()
    size_node = root.findall('size')[0]

    anno_info = {'boxes' : {}, 'scene' : [], 'nobox' : []}
    for dimension in size_node:
        anno_info[dimension.tag] = int(dimension.text)

    def read_int(node, tag):
        # integer value of the first <tag> child of node
        return int(node.findall(tag)[0].text)

    for obj in root.findall('object'):
        bndboxes = obj.findall('bndbox')
        # one object may list several identifiers; they all share its box
        for name_node in obj.findall('name'):
            box_id = name_node.text
            if bndboxes:
                box = [read_int(bndboxes[0], corner) - 1
                       for corner in ('xmin', 'ymin', 'xmax', 'ymax')]
                anno_info['boxes'].setdefault(box_id, []).append(box)
                continue

            if read_int(obj, 'nobndbox') > 0:
                anno_info['nobox'].append(box_id)
            if read_int(obj, 'scene') > 0:
                anno_info['scene'].append(box_id)

    return anno_info



In [None]:
# Parse the sentence file of one sample image to inspect the parser's output.
dictionary=get_sentence_data('test/4926723.txt')

In [None]:
dictionary

[{'sentence': 'A man whose face is only partially seen is sitting down and writing something with a pen on paper with a cup in front of him with Disney characters .',
  'phrases': [{'first_word_index': 0,
    'phrase': 'A man',
    'phrase_id': '219060',
    'phrase_type': ['people']},
   {'first_word_index': 2,
    'phrase': 'whose face',
    'phrase_id': '219066',
    'phrase_type': ['bodyparts']},
   {'first_word_index': 13,
    'phrase': 'something',
    'phrase_id': '219065',
    'phrase_type': ['other']},
   {'first_word_index': 15,
    'phrase': 'a pen',
    'phrase_id': '219064',
    'phrase_type': ['other']},
   {'first_word_index': 18,
    'phrase': 'paper',
    'phrase_id': '219062',
    'phrase_type': ['other']},
   {'first_word_index': 20,
    'phrase': 'a cup',
    'phrase_id': '219059',
    'phrase_type': ['other']},
   {'first_word_index': 25,
    'phrase': 'him',
    'phrase_id': '0',
    'phrase_type': ['notvisual']},
   {'first_word_index': 27,
    'phrase': 'Disney 

In [None]:
# Caption of the first parsed sentence, with the bracket markup already removed.
first_sentence = dictionary[0]['sentence']

In [None]:
first_sentence

'A man whose face is only partially seen is sitting down and writing something with a pen on paper with a cup in front of him with Disney characters .'

# Show visualization

In [None]:
def visualize_data_1(sentences, annotations, image):
    '''
    Display an image with the annotated boxes of its first sentence.

    input:
      sentences - list of sentence dictionaries as returned by
                  get_sentence_data; only the first entry is drawn, and an
                  empty list shows the image without any box
      annotations - dictionary as returned by get_annotations; only its
                    'boxes' field is read
      image - path of the image file to display

    Phrases whose phrase_id has no entry in annotations['boxes'] are
    skipped. Each box is labelled with its phrase; colors cycle per box,
    so a phrase with several boxes gets several colors.
    '''
    with Image.open(image) as source:
        image_array = np.array(source.convert('RGB')).astype(float) / 255.0

    # Define a list of colors for bounding boxes
    bbox_colors = ['red', 'blue', 'green', 'orange', 'purple','yellow','black','white']

    # Use the first sentence (none at all when the sentence file was empty)
    phrases = sentences[0]['phrases'] if sentences else []

    # Create a figure and axis
    fig, ax = plt.subplots(1)
    ax.imshow(image_array)

    drawn_boxes = 0
    for phrase_data in phrases:
        phrase = phrase_data['phrase']

        # A phrase may have zero, one or several boxes
        for xmin, ymin, xmax, ymax in annotations['boxes'].get(phrase_data['phrase_id'], []):
            bbox_color = bbox_colors[drawn_boxes % len(bbox_colors)]
            drawn_boxes += 1

            ax.add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
                                           linewidth=1, edgecolor=bbox_color, facecolor='none'))

            # Label the box with its phrase, in the same color as the box
            ax.text(xmin, ymin, phrase, fontsize=8, color=bbox_color, verticalalignment='top')

    # Show the plot
    ax.axis("off")
    plt.show()

In [None]:
def visualize_data_2(sentences, annotations, image, output_folder):
    '''
    Save an image with the annotated boxes of its first sentence.

    input:
      sentences - list of sentence dictionaries as returned by
                  get_sentence_data; only the first entry is drawn, and an
                  empty list saves the image without any box
      annotations - dictionary as returned by get_annotations; only its
                    'boxes' field is read
      image - path of the image file to annotate
      output_folder - folder that receives the result; created if missing

    The result is written to <output_folder>/<image base name>.jpg. The
    name is derived from the image argument, not from a module-level
    variable, so the function can be called outside the folder loop.
    '''
    image_path = image
    with Image.open(image_path) as source:
        image_array = np.array(source.convert('RGB')).astype(float) / 255.0

    # Define a list of colors for bounding boxes
    bbox_colors = ['red', 'blue', 'green', 'orange', 'purple', 'yellow', 'black', 'white']

    # Use the first sentence (none at all when the sentence file was empty)
    phrases = sentences[0]['phrases'] if sentences else []

    # Create a figure and axis
    fig, ax = plt.subplots(1)
    try:
        ax.imshow(image_array)

        drawn_boxes = 0
        for phrase_data in phrases:
            phrase = phrase_data['phrase']

            # A phrase may have zero, one or several boxes
            for xmin, ymin, xmax, ymax in annotations['boxes'].get(phrase_data['phrase_id'], []):
                bbox_color = bbox_colors[drawn_boxes % len(bbox_colors)]
                drawn_boxes += 1

                ax.add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
                                               linewidth=1, edgecolor=bbox_color, facecolor='none'))

                # Label the box with its phrase, in the same color as the box
                ax.text(xmin, ymin, phrase, fontsize=8, color=bbox_color, verticalalignment='top')

        ax.axis("off")

        # savefig does not create missing directories
        os.makedirs(output_folder, exist_ok=True)
        stem = os.path.splitext(os.path.basename(image_path))[0]
        fig.savefig(os.path.join(output_folder, stem + '.jpg'), dpi=300)
    finally:
        # always release the figure, otherwise a batch run accumulates them
        plt.close(fig)


In [None]:
# sentences = get_sentence_data('test/4926723.txt')
# annotations = get_annotations('test/4926723.xml')
# image = 'test/4926723.jpg'

In [None]:
# visualize_data_1(sentences, annotations, image)

In [None]:
# visualize_data_2(sentences, annotations, image, "output")

# Visualize the entire folder - setting up the ground truth

In [None]:
# Render the ground-truth boxes of every annotated image in the folder
folder_path = "test"
output_folder = "output"

# create the output folder for the ground truth (savefig will not do it)
os.makedirs(output_folder, exist_ok=True)

# sorted() makes the processing order reproducible across file systems
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.txt'):
        continue

    # the sentence, annotation and image files share one base name
    stem = os.path.splitext(filename)[0]
    sentences = get_sentence_data(os.path.join(folder_path, filename))
    annotations = get_annotations(os.path.join(folder_path, stem + '.xml'))
    image_file = os.path.join(folder_path, stem + '.jpg')

    visualize_data_2(sentences, annotations, image_file, output_folder)


In [None]:
#!rm -rf "output"

In [None]:
# download the folder

# Method 1
Apply Method 1 to naturalistic images

## Object detection
Given an image, return the possible IEs (the class names of the detected objects).

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [None]:
!pip install timm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting timm
  Downloading timm-0.9.2-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
Collecting safetensors (from timm)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: safetensors, timm
Successfully installed safetensors-0.3.1 timm-0.9.2


In [None]:
from transformers import DetrImageProcessor, DetrForObjectDetection
import torch

# DETR 
_DETR_CHECKPOINT = "facebook/detr-resnet-50"
_DETR_CACHE = {}


def _load_detr():
    """Return the DETR (processor, model) pair, loading it on first use only."""
    if "pair" not in _DETR_CACHE:
        processor = DetrImageProcessor.from_pretrained(_DETR_CHECKPOINT)
        model = DetrForObjectDetection.from_pretrained(_DETR_CHECKPOINT)
        # stored in one step so a failed load never leaves half a cache
        _DETR_CACHE["pair"] = (processor, model)
    return _DETR_CACHE["pair"]


def object_detection(image_file, threshold=0.8):
    """
    Detect objects in an image with DETR (facebook/detr-resnet-50).

    input:
      image_file - path of the image to analyse
      threshold - minimum detection score to keep (default 0.8)

    output:
      a list with one dictionary per kept detection:
          class_name - label taken from the model's id2label mapping
          bbox - the box returned by the post-processing step, each
                 coordinate rounded to 2 decimals

    The checkpoint is loaded once and reused, so calling this function
    for every image of a folder no longer reloads the weights each time.
    """
    processor, model = _load_detr()

    with Image.open(image_file) as source:
        image = source.convert('RGB')
    inputs = processor(images=image, return_tensors="pt")

    # inference only: no need to record the autograd graph
    with torch.no_grad():
        outputs = model(**inputs)

    # PIL reports (width, height); the post-processing expects (height, width)
    target_sizes = torch.tensor([image.size[::-1]])
    results = processor.post_process_object_detection(
        outputs, target_sizes=target_sizes, threshold=threshold)[0]

    return [
        {
            "class_name": model.config.id2label[label.item()],
            "bbox": [round(value, 2) for value in box.tolist()],
        }
        for label, box in zip(results["labels"], results["boxes"])
    ]


In [None]:
from transformers import YolosFeatureExtractor, YolosForObjectDetection

# YOLO
_YOLOS_CHECKPOINT = 'hustvl/yolos-tiny'
_YOLOS_CACHE = {}


def _load_yolos():
    """Return the YOLOS (processor, model) pair, loading it on first use only."""
    if "pair" not in _YOLOS_CACHE:
        # NOTE(review): newer transformers releases appear to deprecate
        # YolosFeatureExtractor in favour of YolosImageProcessor - confirm
        # and switch the import when upgrading.
        processor = YolosFeatureExtractor.from_pretrained(_YOLOS_CHECKPOINT)
        model = YolosForObjectDetection.from_pretrained(_YOLOS_CHECKPOINT)
        # stored in one step so a failed load never leaves half a cache
        _YOLOS_CACHE["pair"] = (processor, model)
    return _YOLOS_CACHE["pair"]


def object_detection_YOLO(image_file, threshold=0.8):
    """
    Detect objects in an image with YOLOS (hustvl/yolos-tiny).

    input:
      image_file - path of the image to analyse
      threshold - minimum detection score to keep (default 0.8)

    output:
      a list with one dictionary per kept detection:
          class_name - label taken from the model's id2label mapping
          bbox - the box returned by the post-processing step, each
                 coordinate rounded to 2 decimals

    The checkpoint is loaded once and reused across calls.
    """
    processor, model = _load_yolos()

    with Image.open(image_file) as source:
        image = source.convert('RGB')
    inputs = processor(images=image, return_tensors="pt")

    # inference only: no need to record the autograd graph
    with torch.no_grad():
        outputs = model(**inputs)

    # PIL reports (width, height); the post-processing expects (height, width)
    target_sizes = torch.tensor([image.size[::-1]])
    results = processor.post_process_object_detection(
        outputs, target_sizes=target_sizes, threshold=threshold)[0]

    return [
        {
            "class_name": model.config.id2label[label.item()],
            "bbox": [round(value, 2) for value in box.tolist()],
        }
        for label, box in zip(results["labels"], results["boxes"])
    ]

## TE retrieval 

1. retrieve nouns (TE1)
2. named entity recognition (TE2) 

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Download the NLTK resources this notebook relies on
nltk.download('punkt')                        # tokenizer models used by word_tokenize
nltk.download('stopwords')                    # word lists behind stopwords.words(...)
nltk.download('wordnet')                      # data for the imported WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')   # tagger used by nltk.pos_tag

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
def TE1(sentence):
  """
  Extract the candidate text entities of a sentence: its nouns.

  input:
    sentence - the caption to analyse

  output:
    list of lower-cased tokens tagged as nouns (tag starting with "NN"),
    in sentence order; stop words are removed before tagging
  """
  # Replace punctuation with spaces (instead of deleting it) so that words
  # glued together by punctuation are still split into separate tokens
  translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
  text = sentence.translate(translator)

  # Tokenize the text into individual lower-cased words
  word_list = [word.lower() for word in word_tokenize(text)]

  # Remove stop words such as "is," "an," and "which". The stop-word list
  # is read once and kept as a set rather than rebuilt for every token.
  stop_words = set(stopwords.words("english"))
  filtered_words = [word for word in word_list if word not in stop_words]

  # Extract only the nouns from the text
  return [word for (word, pos) in nltk.pos_tag(filtered_words) if pos[:2] == "NN"]

In [None]:
# Smoke test: extract the candidate nouns of a sample caption.
TE1('A girl is on rollerskates talking on her cellphone standing in a parking lot')

['girl', 'cellphone', 'parking', 'lot']

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Shared NER pipeline, built once when this cell runs and reused by TE2 below.
ner_pipeline = pipeline("ner", model="dbmdz/electra-large-discriminator-finetuned-conll03-english", tokenizer="dbmdz/electra-large-discriminator-finetuned-conll03-english")

def TE2(sentence, min_score=0.8):
    """
    Extract the candidate text entities of a sentence: its named entities.

    input:
      sentence - the text to analyse
      min_score - a token must score strictly above this value to start
                  an entity (default 0.8)

    output:
      list of entity strings in order of appearance

    A token scoring above min_score starts an entity; every following
    token whose 'index' is exactly one more than the previous token's is
    appended to it, whatever its own score. Word pieces (tokens starting
    with '##') are glued to the preceding piece, so 'elm' + '##o' gives
    'elmo' rather than 'elm o'.
    """
    named_ents = ner_pipeline(sentence)

    filtered_list = []
    i = 0
    while i < len(named_ents):
        if not named_ents[i]['score'] > min_score:
            i += 1
            continue

        pieces = [named_ents[i]['word']]
        i += 1
        # extend the entity over the run of directly adjacent tokens
        while i < len(named_ents) and named_ents[i]['index'] == named_ents[i-1]['index'] + 1:
            pieces.append(named_ents[i]['word'])
            i += 1

        # drop the separating space in front of a continuation piece first,
        # then any '##' marker left over (e.g. on a leading piece)
        word = ' '.join(pieces).replace(' ##', '').replace('##', '')
        filtered_list.append(word)

    return filtered_list

Downloading (…)lve/main/config.json:   0%|          | 0.00/828 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [None]:
# Smoke test: run the named-entity extraction on an all-caps sample text.
TE2('HOSTILE AIRCRAFT 57 WAS NO ORDINARY GERMAN PLANE. IN THE COCKPIT SAT FLIEGER LEUTNANT FRANZ VON STEIN - THE BLACK ACE - PROUD, CONCEITED - HERO OF THE LUFTWAFFE. ON THE BLACK FUSELAGE ON HIS FW. 190 WAS EMBLAZONED AN ACE OF SPADES - THE CARD OF DEATH')

['german', 'franz von stein', 'luftwaffe', 'f w . 190']

## Similarity measure

In [None]:
!pip install spacy
!python -m spacy download en_core_web_md
import spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-md==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.5.0/en_core_web_md-3.5.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [None]:
# Load spaCy's en_core_web_md model once; the similarity functions below
# read their word vectors from it through nlp(...).vector
nlp = spacy.load('en_core_web_md')

# compute similarity 
def similarity_measure (target_word, words_to_compare):
  """
  Return the candidate whose word vector is closest to the target's.

  input:
    target_word - the word to match
    words_to_compare - candidate words

  output:
    the candidate with the highest cosine similarity to target_word (the
    first one on ties), or None when words_to_compare is empty

  A word without a usable vector (zero norm) has an undefined cosine
  similarity. It is ranked below every other candidate instead of
  producing NaN, because np.argmax returns the position of the first NaN
  and would otherwise pick exactly that word.
  """
  if len(words_to_compare) == 0:
    return None

  # Compute the vector representation of the target word
  target_vector = nlp(target_word).vector
  target_norm = np.linalg.norm(target_vector)

  # Cosine similarity between the target and every candidate
  similarities = []
  for word in words_to_compare:
    vector = nlp(word).vector
    denominator = target_norm * np.linalg.norm(vector)
    if denominator > 0:
      similarities.append(float(np.dot(target_vector, vector) / denominator))
    else:
      similarities.append(float("-inf"))

  # Candidate with the highest similarity score
  return words_to_compare[int(np.argmax(similarities))]

In [None]:
def similarity_measure_with_threshold(target_word, words_to_compare, threshold):
    """
    Return the candidate closest to the target if it is similar enough.

    input:
      target_word - the word to match
      words_to_compare - candidate words
      threshold - minimum cosine similarity required to accept a match

    output:
      the candidate with the highest cosine similarity to target_word
      (the first one on ties) when that similarity is >= threshold,
      otherwise None; also None when words_to_compare is empty

    A word without a usable vector (zero norm) has an undefined cosine
    similarity. It is ranked below every other candidate instead of
    producing NaN, which would otherwise poison max() and make the
    function return None even when a later candidate is a good match.
    """
    if len(words_to_compare) == 0:
        return None

    # Compute the vector representation of the target word
    target_vector = nlp(target_word).vector
    target_norm = np.linalg.norm(target_vector)

    # Cosine similarity between the target and every candidate
    similarities = []
    for word in words_to_compare:
        vector = nlp(word).vector
        denominator = target_norm * np.linalg.norm(vector)
        if denominator > 0:
            similarities.append(float(np.dot(target_vector, vector) / denominator))
        else:
            similarities.append(float("-inf"))

    # Find the word with the maximum similarity score
    max_similarity = max(similarities)
    most_similar_word = words_to_compare[similarities.index(max_similarity)]

    # Only report the match when it clears the threshold
    if max_similarity >= threshold:
        return most_similar_word
    return None


In [None]:
# Smoke test: pick the candidate that is closest to 'airplane'.
similarity_measure ('airplane', ['FW.190','LUFTWAFFE','BOMBER'])

  similarities = [np.dot(target_vector, vector) / (np.linalg.norm(target_vector) * np.linalg.norm(vector)) for vector in vectors_to_compare]


'FW.190'

## Visualization for Method 1

In [None]:
def entity_pair_matching_naturalistic_with_visualization(panel,sentence,output_folder):
    """
    Match every object detected in an image to the closest text entity of
    its sentence, and save the image with the labelled boxes.

    input:
      panel - path of the image file
      sentence - the caption that goes with the image
      output_folder - folder that receives the result; created if missing

    output:
      the list of detected objects, each a dictionary with class_name,
      bbox and most_similar_word (the text entity chosen for it), or the
      string 'there is no possible TE' when the sentence yields no text
      entity (nothing is saved in that case)

    The figure is written to <output_folder>/<image base name>.jpg. The
    name is derived from the panel argument, not from a module-level
    variable, so the function can be called outside the folder loop.
    """
    TE = TE1(sentence) # alternative TE2
    if not TE:
        return('there is no possible TE')

    # Pair each detected object (its class name is the IE) with the most
    # similar text entity
    results = []
    for obj in object_detection(panel):
        obj["most_similar_word"] = similarity_measure(obj["class_name"], TE)
        results.append(obj)

    fig, ax = plt.subplots()
    try:
        # One random color per matched word, shared by all of its boxes
        color_map = {}
        for result in results:
            most_similar_word = result["most_similar_word"]
            if most_similar_word not in color_map:
                color_map[most_similar_word] = (random.random(), random.random(), random.random())
            color = color_map[most_similar_word]

            # Draw the bounding box and add the text label just above it
            bbox = result["bbox"]
            ax.add_patch(plt.Rectangle((bbox[0], bbox[1]), bbox[2]-bbox[0], bbox[3]-bbox[1], fill=False, edgecolor=color, linewidth=2))
            ax.text(bbox[0], bbox[1]-10, most_similar_word, fontsize=8, color=color)

        with Image.open(panel) as img:
            ax.imshow(img)
        ax.axis("off")

        # savefig does not create missing directories
        os.makedirs(output_folder, exist_ok=True)
        stem = os.path.splitext(os.path.basename(panel))[0]
        fig.savefig(os.path.join(output_folder, stem + '.jpg'), dpi=300)
    finally:
        # always release the figure, otherwise a batch run accumulates them
        plt.close(fig)

    return results


In [None]:
# entity_pair_matching_naturalistic_with_visualization('test/4005756399.jpg','the man with pierrced ears is wearing glasses and an orange hat','output')

In [None]:
# entity_pair_matching_naturalistic_with_visualization('36979.jpg','A group of friends playing cards and trying to bluff each other into making a terrible mistake.')

In [None]:
# entity_pair_matching_naturalistic_with_visualization('76466808.jpg','Two pilots are standing and talking in front of a British Airways airplane')

In [None]:
# visualize the entire image folder (the jpg and txt files are sufficient for this task)
folder_path = "test"
output_folder = "output_M1"

# savefig does not create missing directories
os.makedirs(output_folder, exist_ok=True)

# sorted() makes the processing order reproducible across file systems
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.txt'):
        continue

    sentences = get_sentence_data(os.path.join(folder_path, filename))
    if not sentences:
        # an empty sentence file has no caption to match against
        continue
    caption = sentences[0]['sentence']

    image_file = os.path.join(folder_path, filename.replace('.txt', '.jpg'))

    entity_pair_matching_naturalistic_with_visualization(image_file, caption, output_folder)

# Method 2-CLIP
Apply the CLIP model to naturalistic images

In [None]:
from transformers import CLIPProcessor, CLIPModel

In [None]:
_CLIP_CHECKPOINT = "openai/clip-vit-large-patch14"
_CLIP_CACHE = {}


def _load_clip():
    """Return the CLIP (processor, model) pair, loading it on first use only."""
    if "pair" not in _CLIP_CACHE:
        model = CLIPModel.from_pretrained(_CLIP_CHECKPOINT)
        processor = CLIPProcessor.from_pretrained(_CLIP_CHECKPOINT)
        # stored in one step so a failed load never leaves half a cache
        _CLIP_CACHE["pair"] = (processor, model)
    return _CLIP_CACHE["pair"]


def clip_based(image_path, sentence):
    """
    Score the text entities of a sentence against an image with CLIP.

    input:
      image_path - path of the image file
      sentence - the caption that goes with the image

    output:
      a dictionary mapping each text entity to its probability (softmax
      over the entities of this sentence), sorted from most to least
      likely, or an empty list when the sentence yields no text entity

    The checkpoint is loaded once and reused, and only after the text
    entities are known, so sentences without entities cost no model work.
    """
    TE = TE2(sentence) # alternative TE1
    if not TE:
      return []

    processor, model = _load_clip()

    # Load the image
    with Image.open(image_path) as source:
        image = source.convert('RGB')

    # Preprocess the inputs
    inputs = processor(text=TE, images=image, return_tensors="pt", padding=True)

    # inference only: no need to record the autograd graph
    with torch.no_grad():
        outputs = model(**inputs)
    probs = outputs.logits_per_image.softmax(dim=1)[0]

    # Map every text entity to its probability (a repeated entity keeps
    # the value of its last occurrence)
    label_probs_dict = {label: float(prob) for label, prob in zip(TE, probs)}

    # Sort from the most to the least likely entity
    return dict(sorted(label_probs_dict.items(), key=lambda x: x[1], reverse=True))

In [None]:
# Smoke test: score the named entities of one caption against its image.
clip_based('test/76466808.jpg','Two pilots are standing and talking in front of a British Airways airplane')

Downloading (…)lve/main/config.json:   0%|          | 0.00/4.52k [00:00<?, ?B/s]

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


Downloading pytorch_model.bin:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

{'british airways': 1.0}

In [None]:
def process_nat_images_in_folder(folder_path, output_file):
    """
    Run the CLIP matching on every caption/image pair of a folder.

    input:
      folder_path - folder holding <name>.txt sentence files next to
                    their <name>.jpg images
      output_file - path of the CSV file to write

    output:
      a DataFrame with the columns "filename" (the sentence file) and
      "result" (the text entity with the highest probability), sorted by
      filename; the same table is written to output_file without index

    Files whose sentence file is empty, or whose first sentence yields
    no text entity, are left out instead of aborting the whole run.
    """
    results_list = []

    # Loop through all the sentence files in the directory
    for filename in os.listdir(folder_path):
        if not filename.endswith('.txt'):
            continue

        sentences = get_sentence_data(os.path.join(folder_path, filename))
        if not sentences:
            continue
        caption = sentences[0]['sentence']

        image_file = os.path.join(folder_path, filename.replace('.txt', '.jpg'))
        label_probs_dict = clip_based(image_file, caption)

        # The dictionary is sorted by probability, so its first key is the
        # best match
        if label_probs_dict:
            results_list.append((filename, next(iter(label_probs_dict))))

    # Create a Pandas dataframe from the list
    df = pd.DataFrame(results_list, columns=["filename", "result"]).sort_values("filename")

    # Save the DataFrame as a CSV file
    df.to_csv(output_file, index=False)

    return df

In [None]:
# Run the CLIP matching over the "test" folder and save the best match per file to output.csv.
process_nat_images_in_folder("test", "output.csv")

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will

Unnamed: 0,filename,result
16,101262930.txt,asian
5,144571886.txt,chevrolet
4,2113996953.txt,asian
0,2657844508.txt,il porto
3,3179223972.txt,elm o
14,3381788544.txt,asian
10,3422458549.txt,african american
15,4507048434.txt,fl an
11,4550700462.txt,mexican
8,4764087782.txt,african american
