## Gemini

This is just some messing around with the `example_hists` files.

### Docs:

* In theory, there is documentation here: https://ai.google.dev/gemini-api/docs
* Here, though, I just asked Claude again :D

#### Key Points (cp-claude)

* API Key: Get your API key from the [Google Console](https://aistudio.google.com/app/apikey) -- note that you either have to create a new GCP project OR create the API key in an existing project
* The rest of the steps are then similar to those for Claude/ChatGPT


First, install the Google Generative AI (Gemini) Python package (also, see the .yml file for the environment for this project)

In [1]:
#!pip install google-generativeai pillow

Where are things stored/going to be stored?

In [1]:
# Output directory: one pickle of API results is written here per example-hists JSON
dir_api = '/Users/jnaiman/LLM_VQA_JCDL2025/example_hists/LLM_outputs/gemini_api/' #store API results for example-hists

# Plain-text file containing the Gemini API key (read and stripped in the setup cell)
key_file = '/Users/jnaiman/.gemini/key.txt'

# Inputs: each JSON in jsons_dir is paired with the image of the same base name in imgs_dir
jsons_dir = '/Users/jnaiman/LLM_VQA_JCDL2025/example_hists/jsons/' # directory where jsons created with figure are stored
imgs_dir = '/Users/jnaiman/LLM_VQA_JCDL2025/example_hists/imgs/' # where images are stored

# for saving temp images for reading in (passed to get_img_json_pair)
tmp_dir = '/Users/jnaiman/Downloads/tmp/'

# File extension appended to the JSON base name to build each image path
img_format = 'jpeg'

In [2]:
import base64
from PIL import Image
import numpy as np
import json
import re
import pickle
import os
from glob import glob
import google.generativeai as genai

# debug
from importlib import reload


from sys import path
path.append('../')
import utils.llm_utils
reload(utils.llm_utils)
from utils.llm_utils import parse_qa, load_image, get_img_json_pair, parse_for_errors

import time

In [3]:
# setup
# Read the Gemini API key from disk (kept unmodified in `api_key`)
with open(key_file, 'r') as key_handle:
    api_key = key_handle.read()

# Register the key with the SDK; surrounding whitespace/newlines are stripped here
genai.configure(api_key=api_key.strip())

In [None]:
# Name of the Gemini model to query; this string is also saved next to the
# results in each output pickle so the model used can be recovered later.
model='gemini-1.5-flash'
# Model handle that is passed to send_to_gemini for every question
model_gemini = genai.GenerativeModel(model)

In [4]:
# Collect every question/answer JSON to process.  os.path.join is used so the
# pattern is correct whether or not jsons_dir ends with a path separator
# (plain concatenation with '/*.json' produced a doubled '//' in every path).
jsons_to_parse = glob(os.path.join(jsons_dir, '*.json'))
jsons_to_parse[:3]

['/Users/jnaiman/LLM_VQA_JCDL2025/example_hists/jsons/nclust_3_trial9.json',
 '/Users/jnaiman/LLM_VQA_JCDL2025/example_hists/jsons/nclust_5_trial3.json',
 '/Users/jnaiman/LLM_VQA_JCDL2025/example_hists/jsons/nclust_2_trial0.json']

In [19]:
def _generate_gemini_response(prompt, image_path, model_gemini, large_image):
    """Send ``prompt`` plus the image at ``image_path`` to the model; return the raw response."""
    if not large_image:
        # Close the image file handle as soon as the request has been made.
        with Image.open(image_path) as img:
            return model_gemini.generate_content([prompt, img])

    import time

    # Upload through the file API and poll until processing has finished.
    uploaded_file = genai.upload_file(path=image_path)
    try:
        while uploaded_file.state.name == "PROCESSING":
            time.sleep(1)
            uploaded_file = genai.get_file(uploaded_file.name)
        return model_gemini.generate_content([prompt, uploaded_file])
    finally:
        # Always remove the remote copy, even when generation failed.
        genai.delete_file(uploaded_file.name)


def send_to_gemini(question_list, image_path, model_gemini,
                   test_run=True,
                   verbose=True,
                   system_prompt=None,
                   large_image=False):
    """
    Send an image plus one question to Gemini and record the answer.

    Parameters
    ----------
    question_list : dict
        A single question entry.  Reads the 'context', 'question' and 'format'
        keys (and 'persona' when ``system_prompt`` is None).  It is updated in
        place and also returned.
    image_path : str
        Path of the image to send.
    model_gemini : object
        Model handle exposing ``generate_content`` (e.g. ``genai.GenerativeModel``).
    test_run : bool
        If True nothing is sent; 'Response' and 'Response String' are set to
        'TEST RUN' and the returned prompt is an empty string.
    verbose : bool
        Print the question and the token usage.
    system_prompt : str or None
        Set to None to use ``question_list['persona']``.
        NOTE: this value is only returned for record keeping -- it is not
        passed to ``generate_content`` by this function.
    large_image : bool
        If True, upload the image through the Gemini file API (and delete it
        afterwards) instead of sending it inline.

    Returns
    -------
    (question_list, prompt, system_prompt)
        On success ``question_list`` gains 'raw answer', 'usage', 'Response'
        and 'Response String' (plus 'Error' when the answer is not valid
        JSON).  If the request fails, the error is printed and only
        'raw answer' is set, to 'ERROR'.
    """
    if system_prompt is None:
        system_prompt = question_list['persona']

    if test_run:
        question_list['Response'] = 'TEST RUN'
        question_list['Response String'] = 'TEST RUN'
        return question_list, '', system_prompt

    question = " ".join([question_list['context'],
                         question_list['question'],
                         question_list['format']]).lstrip()  # no leading whitespace
    # lowercase the first character, just in case (slicing is safe on an empty string)
    question = question[:1].lower() + question[1:]
    if verbose:
        print('   on question:', question)
    prompt = f"I am going to show you an image. Now, {question}"

    try:
        response = _generate_gemini_response(prompt, image_path, model_gemini,
                                             large_image)
        # Reading .text can itself raise (e.g. when no usable content was
        # returned), so it is treated as part of the request.
        answer = response.text
    except Exception as e:
        # Deliberately broad: a failed request must not abort a whole batch.
        print('[ERROR]:', str(e))
        question_list['raw answer'] = 'ERROR'
        return question_list, prompt, system_prompt

    question_list['raw answer'] = answer

    # also calculate usage
    usage = {
        'input_tokens': response.usage_metadata.prompt_token_count,
        'output_tokens': response.usage_metadata.candidates_token_count,
        'total_tokens': response.usage_metadata.total_token_count,
        'cached_content_tokens': getattr(response.usage_metadata,
                                         'cached_content_token_count', 0),
    }
    question_list['usage'] = usage
    if verbose:
        print(f"      - Input tokens: {usage['input_tokens']}")
        print(f"      - Output tokens: {usage['output_tokens']}")
        print(f"      - Total tokens: {usage['input_tokens'] + usage['output_tokens']}")

    # Keep only what sits inside a ```json ... ``` fence when there is one;
    # without a fence the whole answer is used.
    answer_format = answer.split('```json')[-1].split('```')[0].replace('\n', '')
    try:
        question_list['Response'] = json.loads(answer_format)
    except json.JSONDecodeError:
        question_list['Response'] = answer_format
        question_list['Error'] = 'JSON formatting'
    question_list['Response String'] = answer_format

    return question_list, prompt, system_prompt

In [28]:
def _extract_json_response(raw_answer):
    """
    Best-effort extraction of a JSON object from an LLM's raw text answer.

    Returns the parsed object on success and '' when nothing JSON-like was
    found.  If a fenced/braced candidate was found but neither it nor the
    regex fallback could be parsed, the unparsed candidate string is returned.
    """
    parsed = ''
    try:
        if '```json' in raw_answer:  # ideal: answer wrapped in a json code fence
            parsed = raw_answer.split('```json')[-1].split('```')[0].replace('\n', '')
            parsed = json.loads(parsed)
        elif '{"' in raw_answer:  # less ideal: bare JSON somewhere in the text
            parsed = '{"' + raw_answer.split('{"')[-1].replace('\n', '')
            parsed = json.loads(parsed)
    except json.JSONDecodeError:  # last ditch effort
        # Extract JSON if there's extra text around it
        json_match = re.search(r'\{.*\}', raw_answer, re.DOTALL)
        if json_match:
            try:
                parsed = json.loads(json_match.group())
            except json.JSONDecodeError:
                pass
    return parsed


def parse_for_errors_claude(qa_in, verbose=True, llm='claude'):
    """
    Clean up a list of question/answer dicts after they were sent to an LLM.

    For every entry a fresh dict is built holding a fixed set of keys, with
    'Response String' replaced by the full unfiltered 'raw answer' and
    'Response' replaced by the JSON parsed out of it.

    Parameters
    ----------
    qa_in : list of dict
        Question entries.  Keys from the fixed set that are missing (for
        example 'usage' on an entry whose request failed) are stored as None
        instead of raising KeyError.  Entries without a 'raw answer' key (for
        example test runs) keep their existing 'Response' and
        'Response String' values.
    verbose : bool
        Print the prompt, the real answer and the parsed LLM answer.
    llm : str
        Name used only to label the printed output.

    Returns
    -------
    list of dict
        One new dict per input entry; the input dicts are not modified.
    """
    # there is some "doubling up" of strings, so clean up a bit
    direct_copy_list = ['Q', 'A', 'Level', 'type', 'persona',
                        'context', 'question', 'format', 'plot number',
                        'usage', 'prompt', 'system prompt',
                        'Response', 'Response String']  # these last 2 will be overwritten
    qa_out = []
    for qa_pairs in qa_in:
        if verbose:
            print('Prompt:', qa_pairs.get('prompt'))
            print('  Real A:', qa_pairs.get('A'))

        has_raw = 'raw answer' in qa_pairs
        parsed = _extract_json_response(qa_pairs['raw answer']) if has_raw else ''

        if verbose:
            if parsed == '':
                print("ERROR IN " + llm.upper() + " PARSE")
            else:
                print(llm.capitalize() + ' A:', parsed)
            print('')

        # now clean up: copy the fixed key set, tolerating missing keys
        cleaned = {key: qa_pairs.get(key) for key in direct_copy_list}
        if has_raw:
            # overwrite the last two
            cleaned['Response String'] = qa_pairs['raw answer']  # full, unfiltered answer
            cleaned['Response'] = parsed
        qa_out.append(cleaned)
    return qa_out

In [43]:
# ---- run settings ----
iMax = 2           # only the first iMax JSON files are processed
verbose = False
test_run = False   # True: run w/o actually pinging the Gemini API
restart = False
# set system_prompt to None to default to what is in question list
system_prompt = """You are a helpful assistant that responds only in valid JSON format. Do not include any explanations, reasoning, or text outside of the JSON response."""
#system_prompt = """You must respond with only valid JSON. Start your response immediately with { and end with }. Do not write any text before or after the JSON."""
# temperature=0.1

for ijson, json_path in enumerate(jsons_to_parse):
    if ijson >= iMax:
        continue

    print('on', ijson, 'of', iMax)

    # base name shared by the JSON, its image and the output pickle
    file_stem = json_path.split('/')[-1].removesuffix('.json')
    pickle_path = dir_api + file_stem + '.pickle'

    # get image and base json
    img_path = imgs_dir + file_stem + '.' + img_format
    _, img_format_media, base_json, err = get_img_json_pair(
        img_path, json_path, dir_api,
        restart=restart, tmp_dir=tmp_dir, load_image=False,
    )
    if err:
        continue
    if verbose:
        print('Got image!')

    ###### create QA ########
    # figure-level questions first ...
    figure_questions = base_json['VQA']['Level 1']['Figure-level questions']
    qa = [
        {'Q': entry['Q'], 'A': entry['A'], 'Level': 'Level 1',
         'type': 'Figure-level questions', 'Response': ""}
        for entry in figure_questions.values()
    ]

    # ... then the plot-level questions of every level
    types = ['(words + list)', '(words)']  # what kinds?
    plot_level = 'Plot-level questions'
    for level_parse in ('Level 1', 'Level 2', 'Level 3'):
        qa = parse_qa(level_parse, plot_level, qa, base_json['VQA'], types)

    ###### query the model, one question at a time ########
    responses = []
    prompts = []
    system_prompts = []
    for question_list in qa:
        response, prompt, system_prompt_out = send_to_gemini(
            question_list, img_path, model_gemini,
            test_run=test_run,
            verbose=verbose,
            system_prompt=system_prompt,
        )
        responses.append(response)
        question_list['prompt'] = prompt
        question_list['system prompt'] = system_prompt_out

    # parse for errors
    qa = parse_for_errors_claude(qa, llm='claude')
    print('')
    print('**** Cleaned QA ****')
    qa = parse_for_errors(qa) # might need to do this again

    # dump to file: the pickle holds [qa, model]
    if test_run:
        print('Would store at:', pickle_path)
    else:
        with open(pickle_path, 'wb') as ff:
            pickle.dump([qa, model], ff)
        print('Just saved:', pickle_path)
    #import sys; sys.exit()

on 0 of 2
Prompt: I am going to show you an image. Now, how many bars are there in the specified figure panel? Please format the output as a json as {"nbars":""} for this figure panel, where the "nbars" value should be an integer.
  Real A: 50
Claude A: {'nbars': 40}

Prompt: I am going to show you an image. Now, what are the maximum data values in this figure panel?  Please format the output as a json as {"maximum x":""} for this figure panel, where the "maximum" value should be a float, calculated from the data values used to create the plot.
  Real A: 0.49755257997301794
Claude A: {'maximum x': 0.5}

Prompt: I am going to show you an image. Now, what are the mean data values in this figure panel?  Please format the output as a json as {"mean x":""} for this figure panel, where the "mean" value should be a float, calculated from the data values used to create the plot.
  Real A: 0.34264837991488106
Claude A: {'mean x': 0.34}

Prompt: I am going to show you an image. Now, what are the

## Look at data

Check out one, if you wanna:

In [44]:
# List every result pickle written to the Gemini output directory
pickles = glob(dir_api + '*.pickle')
# (alternative location, kept for reference: results of the Claude runs)
#pickles = glob('/Users/jnaiman/Downloads/tmp/JCDL2025/example_hists/claude_api/*pickle')
# show the first few paths
pickles[:5]

['/Users/jnaiman/LLM_VQA_JCDL2025/example_hists/LLM_outputs/gemini_api/nclust_5_trial3.pickle',
 '/Users/jnaiman/LLM_VQA_JCDL2025/example_hists/LLM_outputs/gemini_api/nclust_3_trial9.pickle']

In [45]:
# Index into `pickles` of the result file to inspect
ifile = 1
# Each pickle was saved as [qa, model]; element 0 is the list of QA dicts.
# NOTE: only unpickle files you produced yourself -- pickle can execute code on load.
with open(pickles[ifile], 'rb') as f:
    qa_in = pickle.load(f)[0]

In [46]:
qa_in[0]

{'Q': 'You are a helpful assistant that can analyze images.  How many bars are there in the specified figure panel? Please format the output as a json as {"nbars":""} for this figure panel, where the "nbars" value should be an integer.',
 'A': 50,
 'Level': 'Level 1',
 'type': 'Plot-level questions',
 'persona': 'You are a helpful assistant that can analyze images.',
 'context': '',
 'question': 'How many bars are there in the specified figure panel?',
 'format': 'Please format the output as a json as {"nbars":""} for this figure panel, where the "nbars" value should be an integer.',
 'plot number': 'plot0',
 'usage': {'input_tokens': 310,
  'output_tokens': 13,
  'total_tokens': 323,
  'cached_content_tokens': 0},
 'prompt': 'I am going to show you an image. Now, how many bars are there in the specified figure panel? Please format the output as a json as {"nbars":""} for this figure panel, where the "nbars" value should be an integer.',
 'system prompt': 'You are a helpful assistant t

In [47]:
#qa_in = parse_for_errors_claude(qa_in, llm='gemini')

The LLM can output reasoning text alongside the JSON, so we have to do a bit of cleaning of the responses:

In [48]:
# Show which file is being inspected, then every prompt with its true and parsed answer
print(pickles[ifile])
print('*********')
for entry in qa_in:
    prompt_text = entry['prompt']
    true_answer = entry['A']
    llm_answer = entry['Response']
    print('Prompt:', prompt_text)
    print('  Real A:', true_answer)
    print('Claude A:', llm_answer)
    print('')

/Users/jnaiman/LLM_VQA_JCDL2025/example_hists/LLM_outputs/gemini_api/nclust_3_trial9.pickle
*********
Prompt: I am going to show you an image. Now, how many bars are there in the specified figure panel? Please format the output as a json as {"nbars":""} for this figure panel, where the "nbars" value should be an integer.
  Real A: 50
Claude A: {'nbars': 40}

Prompt: I am going to show you an image. Now, what are the maximum data values in this figure panel?  Please format the output as a json as {"maximum x":""} for this figure panel, where the "maximum" value should be a float, calculated from the data values used to create the plot.
  Real A: 0.49755257997301794
Claude A: {'maximum x': 0.5}

Prompt: I am going to show you an image. Now, what are the mean data values in this figure panel?  Please format the output as a json as {"mean x":""} for this figure panel, where the "mean" value should be a float, calculated from the data values used to create the plot.
  Real A: 0.342648379914