In [1]:
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
%matplotlib inline

import torch
import numpy as np
from libzim.reader import Archive
from libzim.search import Query, Searcher
from libzim.suggestion import SuggestionSearcher

from pipeline.prompt_templates import generic_prompt
from llm_and_vision_node import Ros_llm_vision_talker

from llava_utils import instantiate_llava, LangChainLLavaWrapper

# Device string handed to both the LLaVA loader and the LangChain wrapper below.
default_llm_device = 'cuda:1'

# ROS link to the camera.
T = Ros_llm_vision_talker(
    camera_input_topic="/realsense/color/image_raw",
    spin=False,
)
# Load a stored image as a fallback in case the camera is not working.
T.load_image('img_db/0.jpg')

# Vision-language model (the LLM is part of it).
tokenizer, model, image_processor, context_len = instantiate_llava(
    device=default_llm_device,
)

# CLIP encoder from LLaVA (frozen, not re-trained).
image_embedding_model = model.get_vision_tower()

prompt_tmpl = generic_prompt(chat_history_field=True)

llm = LangChainLLavaWrapper(
    tokenizer=tokenizer,
    model=model,
    image_processor=image_processor,
    image_input=T,
    use_image_input=1,
    device=default_llm_device,
)
# When VQA is used as an LLM tool, the LLM only supplies the text request
# (e.g. 'caption img'), so the wrapper needs a default prompt template.
llm.default_prompt = prompt_tmpl


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

[2023-12-14 09:30:17,688] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[INFO] [1702546220.403876]: VLM and vision service initialized


Downloading config.json:   0%|          | 0.00/4.76k [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

# For each image in the directory, generate partially filled JSON training data (the user can fill in the questions and answers manually)

In [53]:
import os
import json
import copy
import uuid
import typing
from typing import List
# Prefix added to generated JSON file names so they can be distinguished from
# labelme segmentation label JSONs.
# NOTE(review): the calls at the end of this cell do not pass this variable; they
# rely on the function's own default prefix.
json_prefix = 'q_' # JSON name = json_prefix + img_name + .json.

# Image folder.
# NOTE(review): not read by the calls at the end of this cell, which pass their
# folder as string literals.
folder = 'img_db/testboard/'

def generate_image_json_training_data(folder: str,
                                      json_prefix: str = "q_",
                                      allowed_img_formats: List[str] = [".jpg"],
                                      human_question_templates: List[str] = ["<image>\n",
                                                                             "How to disassemble?"]):
    """Write one conversation-template JSON file next to every image in ``folder``.

    For an image ``<stem><ext>`` the file ``<json_prefix><stem>.json`` is created in
    the same folder. It holds a one-element list with a dict of the form
    ``{"id": <uuid4 string>, "image": <folder>/<image name>, "conversations": [...]}``.
    ``conversations`` contains, for every question template, a ``"human"`` turn with
    the question followed by a ``"gpt"`` turn with an empty answer to be filled in
    later. An existing JSON file with the same name is overwritten.

    Keyword arguments:
    folder -- the folder where images are located and where JSON files will be created
    json_prefix -- string prepended to the JSON file name (prevents collisions with
        labelme segmentation annotations, which are named e.g. "0.json")
    allowed_img_formats -- file-name suffixes (e.g. ".jpg"); JSONs are only created
        for regular files whose name ends with one of them (case-sensitive)
    human_question_templates -- human questions; each one produces one question/answer
        pair in the JSON. The image token "<image>\\n" is prepended to any question
        that does not already contain it. The passed list is never modified.

    Returns None.
    Raises whatever ``os.listdir``/``open`` raise, e.g. if ``folder`` does not exist.
    """
    image_token = "<image>\n"

    # Each question must contain the image token. Build a new list instead of
    # writing back into the argument, so neither the caller's list nor the shared
    # default list is mutated.
    questions = [question if image_token in question else image_token + question
                 for question in human_question_templates]

    # Keep only regular files whose name ENDS with an allowed format. A substring
    # match would also accept names such as "0.jpg.bak".
    image_files = []  # (image file name, name without the matched suffix)
    for file_name in sorted(os.listdir(folder)):
        if not os.path.isfile(os.path.join(folder, file_name)):
            continue
        for suffix in allowed_img_formats:
            if suffix and file_name.endswith(suffix):
                image_files.append((file_name, file_name[:len(file_name) - len(suffix)]))
                break  # one JSON per image, even if several formats match

    # Make and save a JSON for every image.
    for file_name, stem in image_files:
        conversations = []
        for question in questions:
            conversations.append({"from": "human", "value": question})
            # TODO use VLM to provide "initial guess" answer
            # A fresh dict per turn, so answers are never aliased to each other.
            conversations.append({"from": "gpt", "value": ""})

        record = {
            "id": str(uuid.uuid4()),
            # Plain '/' concatenation is kept so the stored path string is
            # unchanged from what earlier runs of this notebook produced.
            "image": folder + '/' + file_name,
            "conversations": conversations,
        }

        full_json_save_path = folder + '/' + json_prefix + stem + '.json'
        with open(full_json_save_path, 'w') as f:
            json.dump([record], f)

# Create the question/answer JSON templates for both image folders.
for image_dir in ('img_db/testboard', 'img_db'):
    generate_image_json_training_data(folder=image_dir)

# Use LLM to generate "human" questions

# Testing opening ZIM archives

In [None]:
# Smoke test of the libzim bindings: open an archive, read one entry, then run a
# full-text search and a suggestion search against it.
# NOTE(review): this cell has no execution count, so it appears not to have been
# run in the saved notebook; presumably "test.zim" must exist in the working
# directory -- confirm before relying on it.
zim = Archive("test.zim")
print(f"Main entry is at {zim.main_entry.get_item().path}")
# Look up a single entry by its path inside the archive and print its metadata.
entry = zim.get_entry_by_path("home/fr")
print(f"Entry {entry.title} at {entry.path} is {entry.get_item().size}b.")
# The item content is converted to bytes and decoded as UTF-8 text for display.
print(bytes(entry.get_item().content).decode("UTF-8"))

# searching using full-text index
search_string = "Welcome"
query = Query().set_query(search_string)
searcher = Searcher(zim)
search = searcher.search(query)
# The estimated match count is used as the number of results to request below.
search_count = search.getEstimatedMatches()
print(f"there are {search_count} matches for {search_string}")
print(list(search.getResults(0, search_count)))

# accessing suggestions
# Note: search_string is reused (rebound) for the suggestion query.
search_string = "kiwix"
suggestion_searcher = SuggestionSearcher(zim)
suggestion = suggestion_searcher.suggest(search_string)
suggestion_count = suggestion.getEstimatedMatches()
print(f"there are {suggestion_count} matches for {search_string}")
print(list(suggestion.getResults(0, suggestion_count)))