This notebook parses our Google Sheets table (exported as a spreadsheet file) into a LaTeX table.

In [153]:
import os  # imported here because this cell runs before the notebook's import cell

# Folder holding the spreadsheet export and the BibTeX file. It defaults to the
# original local download folder; set the JCDL_DATA_DIR environment variable to
# run the notebook on another machine without editing this cell.
data_dir = os.environ.get('JCDL_DATA_DIR', '/Users/jnaiman/Downloads/tmp/JCDL2025')

table_file = os.path.join(data_dir, 'other_datasets.xlsx')

ref_path = os.path.join(data_dir, 'references.bib') # downloaded from JCDL overleaf

In [154]:
import pandas as pd

import bibtexparser
from bibtexparser.bparser import BibTexParser
from bibtexparser.customization import homogenize_latex_encoding

import numpy as np

In [155]:
# Load the comparison table from the spreadsheet file.
df_in = pd.read_excel(table_file)

# Take out the rows we are ignoring. The flag is typed by hand in the sheet, so
# compare it case- and whitespace-insensitively instead of listing spellings
# ('yes', 'YES', 'Yes', ...). Blank cells stringify to 'nan' and are kept.
is_ignored = df_in['ignore (not vqa)'].map(
    lambda flag: str(flag).strip().lower() == 'yes'
).astype(bool)
df_in2 = df_in[~is_ignored]

# Drop the columns that are not carried forward into the output table.
cols_ignore = ['ignore (not vqa)', 'notes', 'dje_notes_hide_for_paper!', 'JCDL Paper?', 'Inject Figs Paper?', 'Paper Link (see github)']
df = df_in2.drop(columns=cols_ignore)

In [156]:
# Preview the first rows of the filtered table to check which columns remain.
df.head()

Unnamed: 0,Name,Dataset Link,Date,has types?,has individual aspects annotated?,Links to original Data which is potentially different from plot,paper name
0,FigureQA,https://www.microsoft.com/en-us/research/proje...,2018,"line plots, bar graphs, pie charts",yes,no,FigureQA: An Annotated Figure Dataset for Visu...
1,ChartLlama,https://huggingface.co/datasets/listen2you002/...,2023,"funnel, gantt, heatmap, scatter, box, candlestick",no,no,ChartLlama: A Multimodal LLM for Chart Underst...
2,ChartAssistant/ChartSFT,https://huggingface.co/datasets/FanqingM/Chart...,2024,"box, arrow, pie, horizontal bar, vertical bar,...",no,partial,ChartAssistant: A Universal Chart Multimodal L...
3,MMCA,https://huggingface.co/datasets/xywang1/MMC,2024,"line, bar, pie, scatter, heatmap, histogram, a...",no,no,MMC: Advancing Multimodal Chart Understanding ...
4,Chart2Text/Chart-to-text,https://github.com/JasonObeid/Chart2Text; http...,2020/2022,"Bar, Line, Pie, Table, scatter, area",partial,"yes, but no dists",Chart-to-Text: Generating Natural Language Des...


Match up each paper with citation(s):

In [157]:


# Parse the BibTeX file at ref_path.
with open(ref_path, 'r', encoding='utf-8') as bibtex_file:
    parser = BibTexParser()
    # Optional: Apply customizations like handling LaTeX encoding
    parser.customization = homogenize_latex_encoding
    bib_database = bibtexparser.load(bibtex_file, parser=parser)


def _normalize_bib_title(raw_title):
    """Return a bib title lower-cased, trimmed, without braces and with '\\&' as '&'."""
    return raw_title.replace('{', '').replace('}', '').replace('\\&', '&').strip().lower()


def _matching_bib_ids(sheet_title, bib_titles):
    """Return the IDs of all bib entries whose normalized title occurs in ``sheet_title``.

    ``bib_titles`` is a list of ``(entry_id, normalized_title)`` pairs.
    """
    haystack = sheet_title.lower().strip()
    return [bib_id for bib_id, bib_title in bib_titles if bib_title in haystack]


# Normalize every bib title once instead of once per table row. Entries with a
# missing or empty title are skipped: '' is a substring of every string, so such
# an entry would otherwise "match" every paper.
bib_titles = []
for entry in bib_database.entries:
    normalized_title = _normalize_bib_title(entry.get('title', ''))
    if normalized_title:
        bib_titles.append((entry['ID'], normalized_title))

bib_ids = []
for dataset_name, paper_names in zip(df['Name'], df['paper name']):
    # A cell may hold several ';'-separated titles (one dataset, several papers);
    # blank pieces from a stray or trailing ';' are ignored.
    if isinstance(paper_names, str):
        sheet_titles = [piece for piece in paper_names.split(';') if piece.strip()]
    else:
        sheet_titles = []  # blank cell (NaN)
    if not sheet_titles:
        raise ValueError("no paper name given for dataset {!r}".format(dataset_name))

    ids_this_paper = []
    for sheet_title in sheet_titles:
        found = _matching_bib_ids(sheet_title, bib_titles)
        # Require exactly one bib entry per title. Comparing only the total
        # number of matches with the number of titles would accept one title
        # matching twice while another matches nothing.
        if len(found) != 1:
            problem = 'no matches' if not found else 'too many matches'
            raise ValueError(
                "{} in {} for dataset {!r}, paper title {!r}: {}".format(
                    problem, ref_path, dataset_name, sheet_title.strip(), found
                )
            )
        ids_this_paper.append(found[0])

    # Several citations for one dataset are stored as a comma-separated string.
    bib_ids.append(",".join(ids_this_paper))

df['citations'] = bib_ids

In [158]:
# Preview the table again; when the matching cell above has run, each row
# carries a 'citations' column holding the matched bib entry ID(s).
df.head()

Unnamed: 0,Name,Dataset Link,Date,has types?,has individual aspects annotated?,Links to original Data which is potentially different from plot,paper name,citations
0,FigureQA,https://www.microsoft.com/en-us/research/proje...,2018,"line plots, bar graphs, pie charts",yes,no,FigureQA: An Annotated Figure Dataset for Visu...,kahou_figureqa_2018
1,ChartLlama,https://huggingface.co/datasets/listen2you002/...,2023,"funnel, gantt, heatmap, scatter, box, candlestick",no,no,ChartLlama: A Multimodal LLM for Chart Underst...,han_chartllama_2023
2,ChartAssistant/ChartSFT,https://huggingface.co/datasets/FanqingM/Chart...,2024,"box, arrow, pie, horizontal bar, vertical bar,...",no,partial,ChartAssistant: A Universal Chart Multimodal L...,meng_chartassistant_2024
3,MMCA,https://huggingface.co/datasets/xywang1/MMC,2024,"line, bar, pie, scatter, heatmap, histogram, a...",no,no,MMC: Advancing Multimodal Chart Understanding ...,liu_mmc_2024
4,Chart2Text/Chart-to-text,https://github.com/JasonObeid/Chart2Text; http...,2020/2022,"Bar, Line, Pie, Table, scatter, area",partial,"yes, but no dists",Chart-to-Text: Generating Natural Language Des...,"obeid_chart--text_2020,kantharaj_chart--text_2022"


In [159]:
# Save the current table as CSV in the working directory; index=False leaves
# the DataFrame index out of the file.
df.to_csv('./other_datasets.csv', index=False)

OK, now make the LaTeX table.

In [172]:
def replacements(tin):
    """Strip generic words and qualifiers from one chart-type label.

    'dot' is first rewritten to 'scatter'; then a fixed sequence of fragments
    ('charts', 'plot', 'stacked', 'horizontal', 'and', ...) is deleted, and the
    result is trimmed with runs of spaces collapsed by two passes.

    Matching is case-sensitive plain substring replacement, so a fragment is
    also removed from inside longer words (e.g. 'candlestick' -> 'clestick').

    Parameters
    ----------
    tin : str
        The label to clean.

    Returns
    -------
    str
        The cleaned label (possibly empty).
    """
    # Applied strictly in this order: plurals before singulars, and longer
    # qualifiers ('grouped', 'stacked') before their prefixes ('group', 'stack').
    removal_order = (
        'charts', 'plots', 'graphs', '_',
        'graph', 'chart', 'plot',
        'grouped', 'stacked',
        'horizontal/vertical', 'group', 'vertical', 'horizontal', 'stack',
        'with', 'and', 'simple', 'data', 'one', 'column', 'numeric',
    )

    cleaned = tin.replace('dot', 'scatter')
    for fragment in removal_order:
        cleaned = cleaned.replace(fragment, '')

    cleaned = cleaned.strip()
    # Two collapsing passes, exactly as many as needed for up to four spaces.
    for _ in range(2):
        cleaned = cleaned.replace('  ', ' ')
    return cleaned

In [176]:
# Column categories of the output table; anything else is reported as 'other'.
chart_types = ['line', 'bar', 'scatter', 'pie', 'histogram', 'other']


def _chart_types_for_cell(cell_text):
    """Map one comma-separated 'has types?' cell to a sorted list of chart_types.

    Each listed item is lower-cased, cleaned with ``replacements`` and split on
    spaces. Words that are in ``chart_types`` are reported as themselves; if any
    other word remains, 'other' is added. A non-string cell (e.g. a blank
    spreadsheet cell read as NaN) yields an empty list.
    """
    if not isinstance(cell_text, str):
        return []

    tokens = set()
    for listed_type in cell_text.split(','):
        tokens.update(replacements(listed_type.lower()).split(' '))
    # An item that cleans to nothing (trailing comma, qualifier-only item such
    # as 'stacked') leaves an empty token; it is not an unknown chart type and
    # must not trigger 'other'.
    tokens.discard('')

    types_out = tokens.intersection(chart_types)
    if tokens - types_out:  # at least one word outside the known categories
        types_out.add('other')
    return sorted(types_out)


# One list of detected categories per table row, in row order.
types_df = [_chart_types_for_cell(cell_text) for cell_text in df['has types?'].values]

In [None]:
# Start the output table from the two identifying columns (as a copy of df).
df_clean = df[['Name', 'Date']].copy()

##### Types of Chart #####
# One column per chart type, headed by the capitalized type name: a LaTeX
# check mark where the row's detected types include it, '-' otherwise.
d = {
    header: ['$\\checkmark$' if header.lower() in row_types else '-' for row_types in types_df]
    for header in (chart_type.capitalize() for chart_type in chart_types)
}

for header, marks in d.items():
    df_clean[header] = marks

#### Bounding Boxes ####



df_clean.head()

Unnamed: 0,Name,Date,Line,Bar,Scatter,Pie,Histogram,Other
0,FigureQA,2018,$\checkmark$,$\checkmark$,-,$\checkmark$,-,-
1,ChartLlama,2023,-,-,$\checkmark$,-,-,$\checkmark$
2,ChartAssistant/ChartSFT,2024,$\checkmark$,$\checkmark$,$\checkmark$,$\checkmark$,-,$\checkmark$
3,MMCA,2024,$\checkmark$,$\checkmark$,$\checkmark$,$\checkmark$,$\checkmark$,$\checkmark$
4,Chart2Text/Chart-to-text,2020/2022,$\checkmark$,$\checkmark$,$\checkmark$,$\checkmark$,-,$\checkmark$
