# BLIP + GPT-4o-mini

## Process images

In [49]:
import json
import os
from PIL import Image
import matplotlib.pyplot as plt

In [None]:
# File name of the problems JSON. This variable is not referenced by the other visible
# cells; the evaluation cell opens "./problems.json" through its own json_path variable.
json_file="problems.json"

In [53]:
import os

# huggingface_hub evaluates HF_HUB_DISABLE_SYMLINKS_WARNING when it is first imported,
# so the flag has to be set BEFORE transformers (which imports huggingface_hub) is loaded;
# setting it afterwards does not silence the warning.
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image

# BLIP image-captioning checkpoint: the processor prepares model inputs and decodes
# generated token ids, the model generates the caption tokens.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Captions generated so far, keyed by the name of the directory that contains the image.
# get_image_description fills this dict and mirrors it to a JSON file.
image_descriptions = {}
def get_image_description(image_path, json_filename="image_descriptions.json"):
    """Generate a BLIP caption for an image and record it in a JSON file.

    Args:
        image_path: Path of the image to caption, or ``None``/empty for a
            question that has no image.
        json_filename: JSON file mirroring the module-level
            ``image_descriptions`` dict. Its current contents are merged into
            that dict on every call, and it is rewritten after each new caption.

    Returns:
        The generated caption; ``"Error generating description."`` when
        captioning fails; or ``"This is a text-based question without an
        image."`` when ``image_path`` is falsy or does not exist.
    """
    # Captions are keyed by the name of the directory that contains the image.
    key = os.path.basename(os.path.dirname(image_path)) if image_path else None
    if os.path.exists(json_filename):
        with open(json_filename, "r", encoding="utf-8") as f:
            try:
                image_descriptions.update(json.load(f))
            except json.JSONDecodeError:
                print("JSON decode error, starting with an empty dictionary.")

    if not (image_path and os.path.exists(image_path)):
        return "This is a text-based question without an image."

    try:
        # Image.open is lazy and keeps the file handle open: convert inside the
        # context manager so the pixel data is loaded and the handle is released.
        # Converting to RGB also normalises palette / RGBA / grayscale inputs.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        inputs = processor(images=image, return_tensors="pt")
        out = model.generate(**inputs, max_new_tokens=100, num_beams=1, do_sample=True, temperature=0.1)
        description = processor.decode(out[0], skip_special_tokens=True)
    except Exception as e:
        # Best effort: one unreadable image must not abort the whole evaluation run.
        print(f"Error processing image with BLIP: {e}")
        return "Error generating description."

    image_descriptions[key] = description
    try:
        with open(json_filename, "w", encoding="utf-8") as f:
            json.dump(image_descriptions, f, ensure_ascii=False, indent=4)
    except OSError as e:
        # A failed cache write must not throw away a caption that was generated fine.
        print(f"Could not save image descriptions to {json_filename}: {e}")
    return description


## Run GPT

In [54]:
import getpass

from langchain_openai import ChatOpenAI

# SECURITY: never hardcode the API key in the notebook. It is read from the
# OPENAI_API_KEY environment variable, with an interactive prompt as a fallback.
# Any key that was ever committed in this cell must be treated as leaked and revoked.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

# temperature=0 keeps answers as repeatable as the API allows; max_tokens bounds the reply size.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0,max_tokens=250)

In [None]:
def display_image(image_path):
    """Show the image stored at ``image_path`` with matplotlib, axes hidden.

    Args:
        image_path: Path of the image file to display.

    Returns:
        None. When the file does not exist a message is printed instead of
        raising ``FileNotFoundError``.
    """
    try:
        # Image.open keeps the underlying file open; the context manager closes
        # it as soon as the figure has been drawn.
        with Image.open(image_path) as image:
            plt.imshow(image)
            plt.axis('off')
            plt.show()
    except FileNotFoundError:
        print(f"Image not found at {image_path}")
import time


# Module-level accumulators filled in by answer_question_with_image:
# results maps each problem key to the predicted answer index (-1 when no valid
# response could be parsed); outputs maps each problem key to the model's
# explanation text (or an error message).
results = {}
outputs = {}

def _parse_model_response(response_content, key):
    """Extract ``(answer_index, explanation)`` for ``key`` from the model's reply.

    The outermost ``{...}`` span of the reply is parsed, so replies wrapped in
    Markdown code fences or surrounded by stray text are still accepted.

    Raises:
        ValueError: no JSON object is present, the JSON is invalid
            (``json.JSONDecodeError`` is a ``ValueError``), or the answer
            index is not an integer.
        KeyError: the ``"result"``/``"outputs"`` sections or ``key`` are missing.
        TypeError: the decoded JSON does not have the expected nesting.
    """
    start = response_content.find("{")
    end = response_content.rfind("}")
    if start == -1 or end < start:
        raise ValueError("no JSON object found in model response")
    response_data = json.loads(response_content[start:end + 1])
    # The model occasionally returns the index as a string ("1"); normalise it so
    # it compares equal to the integer ground-truth index.
    answer_index = int(response_data["result"][key])
    explanation = response_data["outputs"][key]
    return answer_index, explanation


def answer_question_with_image(data_entry, image_folder, max_retries=3):
    """Ask the LLM one multiple-choice question, using a BLIP caption as image context.

    Args:
        data_entry: ``(key, value)`` pair where ``value`` is a dict providing
            ``'question'`` and ``'choices'`` and optionally ``'image'`` and ``'hint'``.
        image_folder: Root folder; the image is looked up at
            ``image_folder/key/<image file name>``.
        max_retries: Maximum number of LLM calls before giving up.

    Returns:
        The model's explanation string, or ``"Answer not identified"`` when no
        reply could be parsed within ``max_retries`` attempts.

    Side effects:
        Stores the predicted index in ``results[key]`` (``-1`` on failure) and
        the explanation in ``outputs[key]``, then calls
        ``save_results_and_outputs()``.
    """
    key, value = data_entry
    image_file = value.get('image')
    image_path = os.path.join(image_folder, key, image_file) if image_file else None

    question = value['question']
    choices = ", ".join(value['choices'])
    # A missing hint should not abort the run; an empty hint is sent instead.
    hint = value.get('hint', '')

    retry_count = 0
    while retry_count < max_retries:
        image_description = get_image_description(image_path)
        combined_input = f"""Image description: {image_description}
        Question: {question}
        Choices: {choices}
        hint: {hint}
        Please respond in the following JSON format exactly without any additional text,only one answer:
        {{
            "result": {{"{key}": Answer index (0 for first choice, 1 for second, etc.)}},
            "outputs": {{
                "{key}": "The answer is A for answer index 0, B for 1, C for 2, D for 3, because: [concise explanation]"
            }}
        }}
        """

        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": combined_input}
        ]

        answer = llm.invoke(input=messages)
        response_content = answer.content
        print(f"model response content (Attempt {retry_count + 1}): {response_content}")

        try:
            answer_index, explanation = _parse_model_response(response_content, key)
        except (ValueError, KeyError, TypeError):
            # Malformed JSON, missing keys and wrongly shaped replies are all
            # retried instead of crashing the whole evaluation loop.
            print("Model response is not in JSON format. Retrying...")
            retry_count += 1
            time.sleep(1)
            continue

        results[key] = answer_index
        outputs[key] = explanation
        save_results_and_outputs()
        return explanation

    print("Max retries reached. Failed to get valid JSON response.")
    results[key] = -1
    outputs[key] = "Error: Model response was not in JSON format after multiple attempts."
    save_results_and_outputs()
    return "Answer not identified"

In [56]:
import time
def calculate_accuracy(data, image_folder):
    """Evaluate every entry whose ``"split"`` is ``"test"`` and report accuracy.

    Each test entry is answered through ``answer_question_with_image``; the
    predicted index is then read back from the module-level ``results`` dict
    and compared with ``int(entry['answer'])``.

    Args:
        data: Mapping of problem key to problem dict.
        image_folder: Root folder of the per-problem image directories.

    Returns:
        ``correct / total`` over the test entries, or ``0`` when there are none.
    """
    n_seen = 0
    n_right = 0
    for entry in data.items():
        problem_id, problem = entry
        if problem.get("split") != "test":
            continue

        # The answer itself is recorded in `results` as a side effect of this call.
        answer_question_with_image(entry, image_folder)
        real_answer_index = int(problem['answer'])
        predicted_index = results.get(problem_id, -1)
        print(f"Predicted Index: {predicted_index}, Real Index: {real_answer_index}")

        n_right += 1 if predicted_index == real_answer_index else 0
        n_seen += 1
        print(f"Progress: {n_seen} items processed, Correct: {n_right}")

    if n_seen > 0:
        accuracy = n_right / n_seen
    else:
        accuracy = 0
    print(f"Total: {n_seen}, Correct: {n_right}, Accuracy: {accuracy:.2%}")
    return accuracy
def save_results_and_outputs(filename="blip_gpt_output.json"):
    """Merge the in-memory ``results`` and ``outputs`` dicts into ``filename``.

    The existing file (if any) is loaded, its ``"results"`` and ``"outputs"``
    sections are updated with the module-level ``results`` / ``outputs``
    dicts, and the file is rewritten.

    Args:
        filename: Path of the JSON file to update.

    An unreadable (invalid JSON) file, a file whose top level is not an
    object, or a file missing either section is treated as empty for the
    affected part instead of raising.
    """
    final_output = None
    if os.path.exists(filename):
        try:
            with open(filename, "r", encoding="utf-8") as f:
                final_output = json.load(f)
        except json.JSONDecodeError:
            print("Error decoding JSON from file, starting with a new dictionary.")

    if not isinstance(final_output, dict):
        final_output = {}
    for section, new_entries in (("results", results), ("outputs", outputs)):
        # A file written by something else may lack a section or hold a non-dict there.
        if not isinstance(final_output.get(section), dict):
            final_output[section] = {}
        final_output[section].update(new_entries)

    # Write to a sibling temp file and swap it in, so an interrupted run cannot
    # leave a truncated (undecodable) JSON file behind.
    temp_filename = f"{filename}.tmp"
    with open(temp_filename, "w", encoding="utf-8") as f:
        json.dump(final_output, f, ensure_ascii=False, indent=4)
    os.replace(temp_filename, filename)
    print("Results and outputs appended to JSON file successfully.")

# Location of the problems JSON and of the root folder that holds one image
# sub-directory per problem key.
json_path="./problems.json"
image_folder="./test/test"

# Load every problem entry; calculate_accuracy only evaluates those whose "split" is "test".
with open(json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Run the evaluation; as the last expression of the cell, the returned accuracy is displayed.
calculate_accuracy(data, image_folder)


JSON decode error, starting with an empty dictionary.
model response content (Attempt 1): {
    "result": {"4": 1},
    "outputs": {
        "4": "The answer is B for answer index 1, because: The text addresses the goddess directly, which is an example of apostrophe."
    }
}
Error decoding JSON from file, starting with a new dictionary.
Predicted Index: 1, Real Index: 1
Progress: 1 items processed, Correct: 1
JSON decode error, starting with an empty dictionary.
model response content (Attempt 1): {
    "result": {"5": 1},
    "outputs": {
        "5": "The answer is B for answer index 1, because: Gordon's test could show how steady a parachute with a 1 m vent was at 200 km per hour, which is what he was specifically observing in the wind tunnel."
    }
}
Results and outputs appended to JSON file successfully.
Predicted Index: 1, Real Index: 1
Progress: 2 items processed, Correct: 2
model response content (Attempt 1): {
    "result": {"11": 1},
    "outputs": {
        "11": "The answ

0.8144305588304646