In [13]:
import torch
# Fetch one ChartQA validation chart and store it next to the notebook as
# chart_example_1.png, the file name the demo cells open later on.
torch.hub.download_url_to_file(
    url='https://raw.githubusercontent.com/vis-nlp/ChartQA/main/ChartQA%20Dataset/val/png/multi_col_1229.png',
    dst='chart_example_1.png',
)

100%|██████████| 34.2k/34.2k [00:00<00:00, 15.9MB/s]


# Chart Question Answering

## ChartInstruct

In [22]:
from transformers import AutoProcessor, LlavaForConditionalGeneration
from PIL import Image
import torch

# Run on the GPU when one is visible, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ChartInstruct checkpoint; the weights are loaded in float16.
processor = AutoProcessor.from_pretrained("ahmed-masry/ChartInstruct-LLama2")
model = LlavaForConditionalGeneration.from_pretrained("ahmed-masry/ChartInstruct-LLama2", torch_dtype=torch.float16)
model.to(device)

image_path = "chart_example_1.png"
input_text = "What is the share of respondants who prefer Whatsapp in the 30-59 age group?"

# The prompt pairs the <image> placeholder with the question text.
input_prompt = f"<image>\n Question: {input_text} Answer: "
image = Image.open(image_path).convert("RGB")

# Move every processor output to the model's device, then cast the pixel
# values to float16 so they match the dtype the weights were loaded in.
inputs = {
    name: tensor.to(device)
    for name, tensor in processor(text=input_prompt, images=image, return_tensors="pt").items()
}
inputs['pixel_values'] = inputs['pixel_values'].to(torch.float16)

# Beam search; afterwards the first `prompt_length` positions are sliced off
# so that only the tokens after the prompt are decoded.
generate_ids = model.generate(**inputs, num_beams=4, max_new_tokens=512)
prompt_length = inputs['input_ids'].shape[1]
output_text = processor.batch_decode(
    generate_ids[:, prompt_length:],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)[0]

print("Model Output:", output_text)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Expanding inputs for image tokens in LLaVa should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
Expanding inputs for image tokens in LLaVa should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.


Model Output: 25% 


# Chart-to-Table Conversion

## Chart-To-Table

In [23]:
from transformers import DonutProcessor, VisionEncoderDecoderModel
from PIL import Image
import torch

# Pick the device once instead of calling .cuda() on every tensor: the cell
# then also runs on CPU-only machines and each tensor is moved exactly once.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "khhuang/chart-to-table"
model = VisionEncoderDecoderModel.from_pretrained(model_name).to(device)
processor = DonutProcessor.from_pretrained(model_name)

image_path = "chart_example_1.png"

# Prompt fed to the decoder; everything generated after "<s_answer>" is
# taken as the table at the end of this cell.
input_prompt = "<data_table_generation> <s_answer>"

img = Image.open(image_path)
pixel_values = processor(img.convert("RGB"), random_padding=False, return_tensors="pt").pixel_values.to(device)

# truncation=True spells out the strategy the tokenizer otherwise falls back
# to (with a warning) whenever max_length is given on its own.
decoder_input_ids = processor.tokenizer(
    input_prompt,
    add_special_tokens=False,
    return_tensors="pt",
    max_length=510,
    truncation=True,
).input_ids.to(device)

outputs = model.generate(
    pixel_values,
    decoder_input_ids=decoder_input_ids,
    max_length=model.decoder.config.max_position_embeddings,
    early_stopping=True,
    pad_token_id=processor.tokenizer.pad_token_id,
    eos_token_id=processor.tokenizer.eos_token_id,
    use_cache=True,
    num_beams=4,
    bad_words_ids=[[processor.tokenizer.unk_token_id]],  # never emit <unk>
    return_dict_in_generate=True,
)

# Decode, drop the EOS/PAD markers and keep only the text after the prompt.
sequence = processor.batch_decode(outputs.sequences)[0]
sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
extracted_table = sequence.split("<s_answer>")[1].strip()
extracted_table

config.json:   0%|          | 0.00/4.97k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/809M [00:00<?, ?B/s]

Config of the encoder: <class 'transformers.models.donut.modeling_donut_swin.DonutSwinModel'> is overwritten by shared encoder config: DonutSwinConfig {
  "attention_probs_dropout_prob": 0.0,
  "depths": [
    2,
    2,
    14,
    2
  ],
  "drop_path_rate": 0.1,
  "embed_dim": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": [
    960,
    960
  ],
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-05,
  "mlp_ratio": 4.0,
  "model_type": "donut-swin",
  "num_channels": 3,
  "num_heads": [
    4,
    8,
    16,
    32
  ],
  "num_layers": 4,
  "patch_size": 4,
  "path_norm": true,
  "qkv_bias": true,
  "transformers_version": "4.46.3",
  "use_absolute_embeddings": false,
  "window_size": 10
}

Config of the decoder: <class 'transformers.models.mbart.modeling_mbart.MBartForCausalLM'> is overwritten by shared decoder config: MBartConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "add_f

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/439 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/510 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/1.30M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.01M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/355 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


'TITLE | Share of Facebook Messenger users in the United States as of January 2018, by age group &&& Characteristic | Facebook Messenger | WhatsApp &&& 18-29 | 73% | 30% &&& 30-59 | 66% | 25% &&& 60+ | 43% | 6%'

In [24]:
import pandas as pd

# The model output separates rows with '&&&' and cells with '|'. Both
# separators arrive padded with spaces, so every cell is stripped; without
# that, labels such as ' Characteristic' or 'WhatsApp ' keep stray whitespace
# and cannot be addressed by their visible name. Segments without a '|' are
# not table rows and are skipped.
table_rows = [
    [cell.strip() for cell in segment.split('|')]
    for segment in extracted_table.replace('&&&', '\n').split('\n')
    if '|' in segment
]

# An optional leading "TITLE | <chart title>" row is metadata, not data.
# Only the first row is inspected, so a "TITLE" that merely occurs somewhere
# inside the table does not shift the header.
has_title = bool(table_rows) and table_rows[0][0].startswith("TITLE")
if has_title:
    table_rows = table_rows[1:]

if table_rows:
    # First remaining row holds the column names, the rest are records.
    header, *data = table_rows
else:
    # Nothing parseable: produce an empty frame instead of failing on a
    # missing header row.
    header, data = [], []

df = pd.DataFrame(data, columns=header)
df

Unnamed: 0,Characteristic,Facebook Messenger,WhatsApp
0,18-29,73%,30%
1,30-59,66%,25%
2,60+,43%,6%


## DePlot

In [25]:
from PIL import Image
from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration
import pandas as pd

image = Image.open(r'chart_example_1.png')

processor = Pix2StructProcessor.from_pretrained("google/deplot")
model = Pix2StructForConditionalGeneration.from_pretrained("google/deplot")

# Instruction text passed to the processor together with the chart image.
description = "Generate underlying data table of the figure below:"

encoding = processor(image, description, return_tensors="pt")

generated_ids = model.generate(**encoding, max_new_tokens=512)
data_string = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

print(data_string)

# The decoded text separates rows with the literal token '<0x0A>' and cells
# with '|'. Both separators arrive padded with spaces, so every cell is
# stripped; without that, labels such as ' 18-29' or 'WhatsApp ' keep stray
# whitespace. Segments without a '|' are not table rows and are skipped.
table_rows = [
    [cell.strip() for cell in segment.split('|')]
    for segment in data_string.replace('<0x0A>', '\n').split('\n')
    if '|' in segment
]

# An optional leading "TITLE | <chart title>" row is metadata, not data.
# Only the first row is inspected, so a "TITLE" that merely occurs somewhere
# inside the table does not shift the header.
has_title = bool(table_rows) and table_rows[0][0].startswith("TITLE")
if has_title:
    table_rows = table_rows[1:]

if table_rows:
    # First remaining row holds the column names, the rest are records.
    header, *data = table_rows
else:
    # Nothing parseable: produce an empty frame instead of failing on a
    # missing header row.
    header, data = [], []

df = pd.DataFrame(data, columns=header)
df

preprocessor_config.json:   0%|          | 0.00/249 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.62k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/851k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.27M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.88k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.13G [00:00<?, ?B/s]

Arial.TTF:   0%|          | 0.00/276k [00:00<?, ?B/s]

Characteristic | Facebook Messenger | WhatsApp <0x0A> 18-29 | 73% | 30% <0x0A> 30-59 | 66% | 25% <0x0A> 60+ | 43% | 6%


Unnamed: 0,Characteristic,Facebook Messenger,WhatsApp
0,18-29,73%,30%
1,30-59,66%,25%
2,60+,43%,6%


# Chart Fact-Checking

## ChartVE

In [26]:
from transformers import DonutProcessor, VisionEncoderDecoderModel
from PIL import Image
import torch

# Pick the device once instead of hard-coding .cuda(), so the cell also runs
# on CPU-only machines.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "khhuang/chartve"
model = VisionEncoderDecoderModel.from_pretrained(model_name).to(device)
processor = DonutProcessor.from_pretrained(model_name)

image_path = "chart_example_1.png"

# Vocabulary ids whose logits are compared at the last decoder position.
# 49922 is treated as the positive (entailed) answer and 2334 as the negative
# one -- presumably the "yes"/"no" tokens of this tokenizer; verify against
# the tokenizer before reusing these ids with another checkpoint.
NEGATIVE_TOKEN_ID = 2334
POSITIVE_TOKEN_ID = 49922


def format_query(sentence):
    """Wrap `sentence` in the entailment question used as the decoder prompt."""
    return f"Does the image entails this statement: \"{sentence}\"?"


# Format text inputs
CAPTION_SENTENCE = "Share of Facebook Messenger users in the United States as of January 2018, by age group."
query = format_query(CAPTION_SENTENCE)

# Encode chart figure and tokenize text
img = Image.open(image_path)
pixel_values = processor(img.convert("RGB"), random_padding=False, return_tensors="pt").pixel_values.to(device)

# truncation=True spells out the strategy the tokenizer otherwise falls back
# to (with a warning) whenever max_length is given on its own.
decoder_input_ids = processor.tokenizer(
    query,
    add_special_tokens=False,
    return_tensors="pt",
    max_length=510,
    truncation=True,
).input_ids.to(device)

# Single forward pass used only for scoring, so gradient tracking is disabled.
with torch.no_grad():
    outputs = model(pixel_values, decoder_input_ids=decoder_input_ids)

# Logits of the two candidate tokens at the last position of the only batch
# item. Explicit indexing (rather than squeeze()) keeps working when the
# prompt is a single token; assumes logits are (batch, seq, vocab).
candidate_logits = outputs['logits'][0, -1, [NEGATIVE_TOKEN_ID, POSITIVE_TOKEN_ID]]

# Two-way softmax; index 1 is the share assigned to the positive token.
binary_entail_prob_positive = torch.nn.functional.softmax(candidate_logits, dim=-1)[1].item()
binary_entail_prob_positive

Config of the encoder: <class 'transformers.models.donut.modeling_donut_swin.DonutSwinModel'> is overwritten by shared encoder config: DonutSwinConfig {
  "attention_probs_dropout_prob": 0.0,
  "depths": [
    2,
    2,
    14,
    2
  ],
  "drop_path_rate": 0.1,
  "embed_dim": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": [
    960,
    960
  ],
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-05,
  "mlp_ratio": 4.0,
  "model_type": "donut-swin",
  "num_channels": 3,
  "num_heads": [
    4,
    8,
    16,
    32
  ],
  "num_layers": 4,
  "patch_size": 4,
  "path_norm": true,
  "qkv_bias": true,
  "transformers_version": "4.46.3",
  "use_absolute_embeddings": false,
  "window_size": 10
}

Config of the decoder: <class 'transformers.models.mbart.modeling_mbart.MBartForCausalLM'> is overwritten by shared decoder config: MBartConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "add_f

0.9958046078681946

# Autogen

In [1]:
!pip install git+https://github.com/microsoft/autogen.git@0.2

Collecting git+https://github.com/microsoft/autogen.git@0.2
  Cloning https://github.com/microsoft/autogen.git (to revision 0.2) to /tmp/pip-req-build-aao2mkmf
  Running command git clone --filter=blob:none --quiet https://github.com/microsoft/autogen.git /tmp/pip-req-build-aao2mkmf
  Running command git checkout -b 0.2 --track origin/0.2
  Switched to a new branch '0.2'
  Branch '0.2' set up to track remote branch '0.2' from 'origin'.
  Resolved https://github.com/microsoft/autogen.git to commit 3b4c0170b058a3629bfc58a75e1bc22e2b9a29eb
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting openai>=1.3 (from autogen-agentchat==0.2.40)
  Downloading openai-1.57.4-py3-none-any.whl.metadata (24 kB)
Collecting diskcache (from autogen-agentchat==0.2.40)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting flaml (from autogen-agentchat==0.2.40)


In [2]:
from autogen import (Agent, AssistantAgent, GroupChat, GroupChatManager, UserProxyAgent, config_list_from_json,)

In [None]:
import os

# LLM endpoint settings shared by every agent's llm_config.
# The key is read from the GEMINI_API_KEY environment variable so that a real
# secret never has to be pasted into (and saved with) the notebook; the
# original placeholder is kept as the fallback value.
config_list = [
    {
        "model": "gemini-1.5-flash-latest",
        "api_key": os.environ.get("GEMINI_API_KEY", "YOUR_API_KEY"),
        "base_url": "https://generativelanguage.googleapis.com/v1beta",
        "api_type": "custom",
        "tags": ["gemini", "google"]
    }
]

## Boss (Agent đưa ra câu hỏi)

In [4]:
# Agent that brings the user's requests into the group chat.
# human_input_mode is not passed, so the library default applies.
# Code execution is configured for the local "coding" directory, without Docker.
boss = UserProxyAgent(
    "Boss",
    system_message="The boss who ask questions and give tasks.",
    code_execution_config=dict(work_dir="coding", use_docker=False),
)

## Planner (Agent phân loại câu hỏi đầu vào)

In [5]:
# Router agent: its prompt asks the model to classify each incoming request as
# one of three chart tasks (Chart Question Answering, Chart-to-Table
# Conversion, Chart Fact-Checking) or to return None.
# custom_speaker_selection_func looks for exactly these label strings in the
# Planner's reply, so the wording of the labels in the prompt and in that
# function must stay in sync.
# NOTE(review): "{input_message}" near the end of the prompt is never
# substituted -- the literal has no f-prefix and no .format() call is made on
# it here, so the braces reach the model as plain text.
planner = AssistantAgent(
    name="Planner",
    system_message="""
    You are a helpful AI assistant.
    Your tasks involve working with charts to provide accurate and insightful responses. Use your analytical and language skills to classify input questions into the following tasks:

    Answering Questions About Charts (Chart Question Answering):
    This involves providing responses to questions related to the content of a chart.

    Converting Charts to Tables (Chart-to-Table Conversion):
    Extract data from charts and present it in a clear and structured tabular format. Ensure all values, labels, and metadata from the chart are accurately included in the table.

    Verifying Chart Accuracy (Chart Fact-Checking):
    This involves verifying whether a given statement aligns with the input chart.

    (Important) I will only respond by classifying your input into one of these three tasks or providing an answer related to these tasks. If the message does not fall into any of the three categories, return None. Additional requests, such as asking for a chart or other unrelated information, will not be addressed.
    Input message: {input_message}
    """,
    llm_config={"config_list": config_list, "timeout": 60, "temperature": 0},
)

## QAAgent (Agent trả lời các câu hỏi QA) (Chart Question Answering)

In [6]:
# Specialist for Chart Question Answering: it only writes code (its own code
# execution is disabled); the prompt embeds the ChartInstruct inference
# snippet the generated script should follow.
# Step 5 of the prompt used to name google/deplot and chart-to-table
# conversion (copied from the ConversionAgent prompt); it now names the model
# and task this agent is actually responsible for.
qa_agent = AssistantAgent(
    name="QAAgent",
    system_message="""You are a senior Python engineer. Your task is to answer factual questions based on the data or chart images provided. You will generate Python code to analyze charts, images, or other formats to extract the answers. Here are the steps you need to follow:
    1. Load the pre-trained model ahmed-masry/ChartInstruct-LLama2
    2. When gathering information: preparing the device (GPU/CPU), processing the input image and question into tensors, generating the answer using beam search, decoding the output, and printing the result.
    3. When solving the task: - Generate the necessary Python code to solve the task. model = LlavaForConditionalGeneration.from_pretrained("ahmed-masry/ChartInstruct-LLama2", torch_dtype=torch.float16)
processor = AutoProcessor.from_pretrained("ahmed-masry/ChartInstruct-LLama2"). image = Image.open(image_path).convert("RGB");input_prompt = f"<image>\n Question: {input_text} Answer: ";inputs = processor(text=input_prompt, images=image, return_tensors="pt");inputs = {k: v.to(device) for k, v in inputs.items()};inputs['pixel_values'] = inputs['pixel_values'].to(torch.float16);prompt_length = inputs['input_ids'].shape[1];generate_ids = model.generate(**inputs, num_beams=4, max_new_tokens=512);output_text = processor.batch_decode(generate_ids[:, prompt_length:], skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - Ensure the code is complete and ready to execute without requiring modifications from the user. Do not use raise, if-else, or try-catch to handle errors.
    4. Error handling: - If an error occurs, provide an updated script to fix the issue. Make sure to handle edge cases or potential issues in the data processing.
    5. Task completion: After completing the task and generating the required Python code. Your goal is to generate efficient Python code that leverages the ahmed-masry/ChartInstruct-LLama2 model to answer questions about charts. The code you generate should be complete, functional, and ready to execute. After generating the Python code, close the ```.
    6. Respond with "TERMINATE" to indicate that everything is done.
    """,
    llm_config={"config_list": config_list, "timeout": 60, "temperature": 0},
    code_execution_config=False
)

## ConversionAgent (Agent chuyển đổi biểu đồ thành bảng) (Chart-to-Table Conversion)

In [7]:
# Specialist for Chart-to-Table Conversion: it only writes code (its own code
# execution is disabled). The prompt tells the model to use google/deplot via
# Pix2StructProcessor / Pix2StructForConditionalGeneration and embeds the
# post-processing snippet that turns the decoded string into a DataFrame.
# NOTE(review): the prompt is a regular (non-raw) string, so each '\n' inside
# the embedded snippet is already a real line break by the time the model
# reads it.
# NOTE(review): the embedded snippet contains a try/except while the same
# step says not to use try-catch -- confirm which instruction is intended.
conversion_agent = AssistantAgent(
    name="ConversionAgent",
    system_message="""You are a senior Python engineer. Your task is to generate Python code that uses pre-trained models to analyze charts and generate underlying data. Specifically, you will use the google/deplot model to convert a chart into a data table. Here's how you should approach the task:
    1. When receiving a task: If the task involves generating data from a chart, use the pre-trained model google/deplot via the Pix2StructProcessor and Pix2StructForConditionalGeneration from the Hugging Face transformers library. The task is to take an image of a chart, process it, and output the underlying data as a table. You are expected to generate the Python code that uses this model to perform the task.
    2. When gathering information: You will load the image using Python's PIL.Image.open() and process it using Pix2StructProcessor. You should also prepare a description (e.g., "Generate underlying data table of the figure below:") to pass along with the image.
    3. When solving the task: Generate the Python code to load the model and processor, process the image, and use the model to generate the underlying data table. After obtaining the string returned from the model with max_new_tokens=512 (important), process the string further by data_string = data_string.replace('<0x0A>', '\n'); lines = data_string.split('\n'); has_title = "TITLE" in data_string; header_lines = [line.split(' | ') for line in lines if '|' in line][:2 if has_title else 1]; header_length = len(header_lines[-1]); lines = lines[2:] if has_title else lines[1:]; data = []; for line in lines:try:parts = line.split(' | ');data.append(parts);except ValueError:;pass; df = pd.DataFrame(data, columns=header_lines[-1]). Do not use raise, if-else, or try-catch to handle errors. The code should decode the model's output and print the generated data.
    4. Error handling: If there is an error in the code, provide a corrected version. Ensure the code works efficiently for the task at hand.
    5. Task completion: After completing the task and generating the required Python code. Your goal is to generate efficient Python code that leverages the google/deplot model to convert charts into tabular data. The code you generate should be complete, functional, and ready to execute. After generating the Python code, close the ```.
    6. Respond with "TERMINATE" to indicate that everything is done.
    """,
    llm_config={"config_list": config_list, "timeout": 60, "temperature": 0},
    code_execution_config=False
)

## FactCheckAgent (Agent kiểm tra một đoạn thông tin về biểu đồ) (Chart Fact-Checking)

In [22]:
# Specialist for Chart Fact-Checking: it only writes code (its own code
# execution is disabled); the prompt embeds the khhuang/chartve scoring
# snippet the generated script should follow.
# Step 4 of the prompt used to name google/deplot and chart-to-table
# conversion (copied from the ConversionAgent prompt); it now names the model
# and task this agent is actually responsible for.
fact_checking_agent = AssistantAgent(
    name="FactCheckAgent",
    system_message="""You are a senior Python engineer. Your task is to answer factual questions based on the data or chart images provided. You will generate Python code to analyze charts, images, or other formats to extract the answers. Here are the steps you need to follow:
    1. Load the pre-trained model "khhuang/chartve" VisionEncoderDecoderModel and DonutProcessor
    2. When solving the task: - Generate the necessary Python code to solve the task.def format_query(sentence):return f"Does the image entails this statement: \"{sentence}\"?";query = format_query(CAPTION_SENTENCE);img = Image.open("chart_example_1.png");pixel_values = processor(img.convert("RGB"), random_padding=False, return_tensors="pt").pixel_values;pixel_values = pixel_values.cuda();decoder_input_ids = processor.tokenizer(query, add_special_tokens=False, return_tensors="pt", max_length=510).input_ids.cuda();outputs = model(pixel_values, decoder_input_ids=decoder_input_ids);binary_entail_prob_positive = torch.nn.functional.softmax(outputs['logits'].squeeze()[-1,[2334, 49922]])[1].item(). Print output is output positive - Ensure the code is complete and ready to execute without requiring modifications from the user. Do not use raise, if-else, or try-catch to handle errors.
    3. Error handling: - If an error occurs, provide an updated script to fix the issue. Make sure to handle edge cases or potential issues in the data processing.
    4. Task completion: After generating the Python code, close the ```. After completing the task and generating the required Python code. Your goal is to generate efficient Python code that leverages the khhuang/chartve model to check whether a statement is entailed by the chart. The code you generate should be complete, functional, and ready to execute.
    5. Respond with "TERMINATE" to indicate that everything is done (outside code python).
    """,
    llm_config={"config_list": config_list, "timeout": 60, "temperature": 0},
    code_execution_config=False
)

## Executor (Agent chạy code)

In [9]:
# Agent that runs the code written by the specialist agents.
# human_input_mode="NEVER": it is configured not to ask a human for input.
# Code execution uses the local "coding" directory without Docker;
# last_n_messages=3 presumably limits how far back it looks for code blocks
# -- confirm against the AutoGen documentation.
executor = UserProxyAgent(
    "Executor",
    human_input_mode="NEVER",
    system_message="Executor. Execute the code written by the engineer and report the result.",
    code_execution_config=dict(
        last_n_messages=3,
        work_dir="coding",
        use_docker=False,
    ),
)

## Hàm điều chỉnh người nói trong group chat

In [10]:
def custom_speaker_selection_func(last_speaker: Agent, groupchat: GroupChat):
    """Pick the next speaker of the group chat with a fixed hand-over scheme.

    Normal flow: Boss -> Planner -> specialist agent -> Executor -> Boss.
    A specialist keeps the floor until its message contains a ```python block,
    and a failed execution is handed back to the specialist that spoke last.

    Args:
        last_speaker: The agent that produced the latest message.
        groupchat: The running chat; only ``groupchat.messages`` is read.

    Returns:
        The next ``Agent``; the string ``"random"`` when ``last_speaker`` is
        none of the known agents; or ``None`` when no route can be determined.
        NOTE(review): AutoGen is expected to end the conversation when a
        custom selector returns ``None`` -- confirm for the installed version.
    """
    messages = groupchat.messages

    # The opening request always goes to the Planner for classification.
    if len(messages) <= 1:
        return planner

    # A message may carry no text; treat that as empty instead of failing on
    # the substring checks below.
    last_content = messages[-1].get("content") or ""

    # Task label emitted by the Planner -> specialist handling that task.
    # Checked in this order; the first label found in the reply wins.
    task_routes = (
        ("Chart-to-Table Conversion", conversion_agent),
        ("Chart Question Answering", qa_agent),
        ("Chart Fact-Checking", fact_checking_agent),
    )

    def latest_specialist():
        """Return the specialist that spoke most recently, or None if none did."""
        specialists = {agent.name: agent for _, agent in task_routes}
        for message in reversed(messages):
            agent = specialists.get(message.get("name"))
            if agent is not None:
                return agent
        return None

    if last_speaker is planner:
        for label, agent in task_routes:
            if label in last_content:
                return agent
        if "None" in last_content:
            # The Planner could not classify the request (e.g. a follow-up to
            # the previous task): continue with the specialist that handled
            # the previous turn. Searching the history replaces the fixed
            # messages[-4] lookup, which raised IndexError on short chats.
            return latest_specialist()
        return None

    if any(last_speaker is agent for _, agent in task_routes):
        # Code goes to the Executor; otherwise the same specialist speaks
        # again until it produces a ```python block.
        return executor if "```python" in last_content else last_speaker

    if last_speaker is executor:
        if "exitcode: 1" in last_content or "Error" in last_content:
            # Hand the failure back to the agent whose code just ran. Routing
            # by the first Planner reply (messages[1]) picked the wrong agent
            # once a later request had been classified differently.
            return latest_specialist()
        return boss

    if last_speaker is boss:
        return planner

    return "random"

## Tạo GroupChat giữa các Agent

In [23]:
# One chat shared by all six agents, starting with an empty history and
# capped at 20 rounds; turn-taking is delegated to
# custom_speaker_selection_func.
groupchat = GroupChat(
    agents=[boss, planner, qa_agent, conversion_agent, fact_checking_agent, executor],
    messages=[],
    speaker_selection_method=custom_speaker_selection_func,
    max_round=20,
)

# Manager for the group chat; it receives the same LLM settings as the agents.
manager = GroupChatManager(
    groupchat=groupchat,
    llm_config=dict(config_list=config_list, timeout=60, temperature=0),
)

## Task 1: Chart-to-Table Conversion

In [19]:
# Task 1: Boss opens the chat with a chart-to-table request.
message = "Use image in '/kaggle/working/chart_example_1.png' chart_example_1.png and generate underlying data table of the figure"
boss.initiate_chat(manager, message=message)
"Done"

[33mBoss[0m (to chat_manager):

Use image in '/kaggle/working/chart_example_1.png' chart_example_1.png and generate underlying data table of the figure

--------------------------------------------------------------------------------
[32m
Next speaker: Planner
[0m
[33mPlanner[0m (to chat_manager):

Chart-to-Table Conversion


--------------------------------------------------------------------------------
[32m
Next speaker: ConversionAgent
[0m
[33mConversionAgent[0m (to chat_manager):

```python
from PIL import Image
from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration
import pandas as pd

# Load the image
image = Image.open("/kaggle/working/chart_example_1.png")

# Load the pre-trained model and processor
processor = Pix2StructProcessor.from_pretrained("google/deplot")
model = Pix2StructForConditionalGeneration.from_pretrained("google/deplot")

# Prepare the input
inputs = processor(images=image, text="Generate underlying data table of the figure 

Replying as Boss. Provide feedback to chat_manager. Press enter to skip and use auto-reply, or type 'exit' to end the conversation:  save to data.csv


[33mBoss[0m (to chat_manager):

save to data.csv

--------------------------------------------------------------------------------
[32m
Next speaker: Planner
[0m
[33mPlanner[0m (to chat_manager):

Chart-to-Table Conversion


--------------------------------------------------------------------------------
[32m
Next speaker: ConversionAgent
[0m
[33mConversionAgent[0m (to chat_manager):

```python
from PIL import Image
from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration
import pandas as pd

# Load the image
image = Image.open("/kaggle/working/chart_example_1.png")

# Load the pre-trained model and processor
processor = Pix2StructProcessor.from_pretrained("google/deplot")
model = Pix2StructForConditionalGeneration.from_pretrained("google/deplot")

# Prepare the input
inputs = processor(images=image, text="Generate underlying data table of the figure below:", return_tensors="pt")

# Generate the data table
out = model.generate(**inputs, max_new_tokens=

Replying as Boss. Provide feedback to chat_manager. Press enter to skip and use auto-reply, or type 'exit' to end the conversation:  exit


'Done'

## Task 2: Chart Question Answering

In [20]:
# Task 2: Boss opens the chat with a question about the chart.
message = "Use image in '/kaggle/working/chart_example_1.png' chart_example_1.png and answer the question: What is the share of respondants who prefer Whatsapp in the 30-59 age group?"
boss.initiate_chat(manager, message=message)
"Done"

[33mBoss[0m (to chat_manager):

Use image in '/kaggle/working/chart_example_1.png' chart_example_1.png and answer the question: What is the share of respondants who prefer Whatsapp in the 30-59 age group?

--------------------------------------------------------------------------------
[32m
Next speaker: Planner
[0m
[33mPlanner[0m (to chat_manager):

Chart Question Answering


--------------------------------------------------------------------------------
[32m
Next speaker: QAAgent
[0m
[33mQAAgent[0m (to chat_manager):

```python
from PIL import Image
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration

# Load the pre-trained model
model = LlavaForConditionalGeneration.from_pretrained("ahmed-masry/ChartInstruct-LLama2", torch_dtype=torch.float16)
processor = AutoProcessor.from_pretrained("ahmed-masry/ChartInstruct-LLama2")

# Prepare the device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu

Replying as Boss. Provide feedback to chat_manager. Press enter to skip and use auto-reply, or type 'exit' to end the conversation:  What is the share of responders who prefer Facebook Messenger in the 18-29 age group?


[33mBoss[0m (to chat_manager):

What is the share of responders who prefer Facebook Messenger in the 18-29 age group?

--------------------------------------------------------------------------------
[32m
Next speaker: Planner
[0m
[33mPlanner[0m (to chat_manager):

Chart Question Answering


--------------------------------------------------------------------------------
[32m
Next speaker: QAAgent
[0m
[33mQAAgent[0m (to chat_manager):

```python
from PIL import Image
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration

# Load the pre-trained model
model = LlavaForConditionalGeneration.from_pretrained("ahmed-masry/ChartInstruct-LLama2", torch_dtype=torch.float16)
processor = AutoProcessor.from_pretrained("ahmed-masry/ChartInstruct-LLama2")

# Prepare the device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load the image
image_path = "/kaggle/working/chart_example_1.png

Replying as Boss. Provide feedback to chat_manager. Press enter to skip and use auto-reply, or type 'exit' to end the conversation:  exit


'Done'

## Task 3: Chart Fact-Checking

In [24]:
# Task 3: Boss opens the chat asking whether a caption matches the chart.
message = "Use image in '/kaggle/working/chart_example_1.png' chart_example_1.png and caption: Share of Facebook Messenger users in the United States as of January 2018, by age group. Check if the caption matches the chart or not?"
boss.initiate_chat(manager, message=message)
"Done"

[33mBoss[0m (to chat_manager):

Use image in '/kaggle/working/chart_example_1.png' chart_example_1.png and caption: Share of Facebook Messenger users in the United States as of January 2018, by age group. Check if the caption matches the chart or not?

--------------------------------------------------------------------------------
[32m
Next speaker: Planner
[0m
[33mPlanner[0m (to chat_manager):

Chart Fact-Checking


--------------------------------------------------------------------------------
[32m
Next speaker: FactCheckAgent
[0m
[33mFactCheckAgent[0m (to chat_manager):

```python
from PIL import Image
from transformers import VisionEncoderDecoderModel, DonutProcessor
import torch

# Load the pre-trained model and processor
processor = DonutProcessor.from_pretrained("khhuang/chartve")
model = VisionEncoderDecoderModel.from_pretrained("khhuang/chartve").cuda()

def format_query(sentence):
    return f"Does the image entails this statement: \"{sentence}\"?"

CAPTION_SENTEN

Replying as Boss. Provide feedback to chat_manager. Press enter to skip and use auto-reply, or type 'exit' to end the conversation:  change caption to a chart of NVDA and TESLA stock price change YTD and check 


[33mBoss[0m (to chat_manager):

change caption to a chart of NVDA and TESLA stock price change YTD and check 

--------------------------------------------------------------------------------
[32m
Next speaker: Planner
[0m
[33mPlanner[0m (to chat_manager):

Chart Fact-Checking


--------------------------------------------------------------------------------
[32m
Next speaker: FactCheckAgent
[0m
[33mFactCheckAgent[0m (to chat_manager):

```python
from PIL import Image
from transformers import VisionEncoderDecoderModel, DonutProcessor
import torch

# Load the pre-trained model and processor
processor = DonutProcessor.from_pretrained("khhuang/chartve")
model = VisionEncoderDecoderModel.from_pretrained("khhuang/chartve").cuda()

def format_query(sentence):
    return f"Does the image entails this statement: \"{sentence}\"?"

CAPTION_SENTENCE = "A chart of NVDA and TESLA stock price change YTD"
query = format_query(CAPTION_SENTENCE)

img = Image.open("/kaggle/working/chart_exampl

Replying as Boss. Provide feedback to chat_manager. Press enter to skip and use auto-reply, or type 'exit' to end the conversation:  exit


'Done'