In [5]:
!pip install --upgrade kaleido

Collecting kaleido
  Using cached kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
Installing collected packages: kaleido
Successfully installed kaleido-0.2.1


In [1]:
import seaborn as sns
import pandas as pd
import plotly.express as px

# Systems

In [2]:
# Load the system/language review table (tab-separated) into `df_sys`.
df_sys = pd.read_csv(
    filepath_or_buffer="../data/review_tables/sys-lang.tsv",
    sep="\t",
)

In [3]:
# Per-language count of rows with a non-null "System" value, largest count first.
# "Name" (the language name column) is renamed to "Language" for plotting.
df_lang_cnt = (
    df_sys
    .groupby("Name")
    .count()
    .reset_index()[["Name", "System"]]
    .rename(columns={"System": "Count", "Name": "Language"})
    .sort_values("Count", ascending=False)
)

In [13]:
# Bar chart of the per-language counts in `df_lang_cnt`; `text_auto=True` prints
# each bar's value on the bar.
fig = px.bar(
    df_lang_cnt,
    x="Language",
    y="Count",
    text_auto=True,
    width=1000,
    height=500,
    color_discrete_sequence=["#8d9bb3"],
)

# No figure title is set (the original cell had it commented out):
# title="Number of Systems per Language"
fig.update_layout(
    font={"size": 20, "family": "Times New Roman"},
    margin={"b": 7, "t": 5, "r": 5, "l": 5},
)

# Render inline, then export the same figure as a PDF.
fig.show()
fig.write_image("../data/img/systems-per-language.pdf")

In [None]:
# Preview the first five rows of the loaded table.
df_sys.head(n=5)

Unnamed: 0,Language,System,Language Family,Language Branch,Language Subgroup,Writing System,Script/Alphabet,Name
0,en,QALL-ME,Indo-European,Germanic,North Sea Germanic,Alphabetical,Latin,English
1,en,N. Aggarwal,Indo-European,Germanic,North Sea Germanic,Alphabetical,Latin,English
2,en,QAKiS,Indo-European,Germanic,North Sea Germanic,Alphabetical,Latin,English
3,en,SWSNL,Indo-European,Germanic,North Sea Germanic,Alphabetical,Latin,English
4,en,UTQA,Indo-European,Germanic,North Sea Germanic,Alphabetical,Latin,English


In [2]:
# Sunburst of the hierarchy language family -> language branch -> language name.
# Needs `df_sys` from the data-loading cell; the saved output of this cell shows a
# NameError for `df_sys`, so it was last run in a kernel where `df_sys` was not loaded.
fig = px.sunburst(
    df_sys,
    path=["Language Family", "Language Branch", "Name"],
    width=1150,
    height=800,
    color_discrete_sequence=["#8d9bb3", "#b38d8d", "#8db392"],
)

# Arrow call-outs, added in this order. x/y locate the arrow head and ax/ay offset
# the label from it; the values are presumably hand-tuned for this figure size
# (TODO confirm after any change to width/height or the data).
sunburst_callouts = [
    dict(text="Turkic", x=0.62, y=0.495, ax=220, ay=-300),
    dict(text="Common Turkic", x=0.645, y=0.49, ax=230, ay=-270),
    dict(text="Sino-Tibetan", x=0.6, y=0.425, ax=300, ay=70),
    # disabled: dict(text="Sinitic", x=0.69, y=0.472, ax=280, ay=-100),
    dict(text="Afro-Asiatic", x=0.62, y=0.463, ax=320, ay=45),
    dict(text="Dravidian", x=0.62, y=0.48, ax=320, ay=-75),
    dict(text="Southern", x=0.69, y=0.47, ax=300, ay=-55),
    # disabled: dict(text="Semitic", x=0.75, y=0.433, ax=240, ay=48),
    dict(text="Graeco-Phrygian", x=0.7, y=0.323, ax=190, ay=58),
    dict(text="Armenian", x=0.68, y=0.295, ax=190, ay=98),
]
for callout in sunburst_callouts:
    fig.add_annotation(xanchor="left", **callout)

# No figure title is set (the original cell had it commented out):
# title="Distribution of language groups and branches supported by the KGQA systems"
fig.update_layout(
    font={"size": 20, "family": "Times New Roman"},
    # Sector labels that would need a font smaller than 20 are hidden, not shrunk.
    uniformtext={"minsize": 20, "mode": "hide"},
    legend_title_text="System Count",
    margin={"b": 5, "t": 5},
)

# Render inline, then export the same figure as a PDF.
fig.show()
fig.write_image("../data/img/language-families.pdf")

NameError: name 'df_sys' is not defined

# Benchmarks visualization 

In [37]:
# Also imported in the notebook's first import cell; kept here so this cell still
# runs standalone.
import plotly.express as px
import pandas as pd

# Hand-entered meta-dataset, one record per benchmark. "textposition" is the
# per-point label placement passed to the trace below (used to place some labels
# under their marker instead of above it).
meta_dataset = [
    {"name": "QALD-3", "year": 2013, "numOfQuestions": 199, "numOfLanguages": 6, "textposition": "top center"},
    {"name": "QALD-6", "year": 2016, "numOfQuestions": 450, "numOfLanguages": 8, "textposition": "top center"},
    {"name": "QALD-7", "year": 2017, "numOfQuestions": 258, "numOfLanguages": 6, "textposition": "top center"},
    {"name": "QALD-8", "year": 2018, "numOfQuestions": 260, "numOfLanguages": 6, "textposition": "bottom center"},
    {"name": "QALD-9", "year": 2018, "numOfQuestions": 558, "numOfLanguages": 11, "textposition": "top center"},
    {"name": "QALD-9-plus", "year": 2022, "numOfQuestions": 558, "numOfLanguages": 9, "textposition": "top center"},
    {"name": "rewordQALD9", "year": 2022, "numOfQuestions": 558, "numOfLanguages": 2, "textposition": "bottom center"},
    {"name": "QALD10", "year": 2023, "numOfQuestions": 909, "numOfLanguages": 4, "textposition": "top center"},
    {"name": "EventQA", "year": 2020, "numOfQuestions": 1000, "numOfLanguages": 3, "textposition": "bottom center"},
    {"name": "RuBQ 1.0", "year": 2020, "numOfQuestions": 1500, "numOfLanguages": 2, "textposition": "top center"},
    {"name": "RuBQ 2.0", "year": 2021, "numOfQuestions": 2910, "numOfLanguages": 2, "textposition": "top center"},
    {"name": "MCWQ", "year": 2022, "numOfQuestions": 124187, "numOfLanguages": 4, "textposition": "top center"},
    {"name": "Mintaka", "year": 2022, "numOfQuestions": 20000, "numOfLanguages": 9, "textposition": "top center"},
    {"name": "MLPQ", "year": 2023, "numOfQuestions": 300000, "numOfLanguages": 3, "textposition": "top center"}
]

# Convert meta-dataset to a pandas DataFrame
df = pd.DataFrame(meta_dataset)

# Bubble chart: publication year vs. number of questions; both bubble size and
# colour encode the number of languages.
fig = px.scatter(
    df,
    x="year",
    y="numOfQuestions",
    text="name",
    size="numOfLanguages",
    color="numOfLanguages",
    hover_name="name",
    labels={"year": "Publication year", "numOfLanguages": "# of Languages", "numOfQuestions": "Number of Questions (log)"},
    width=1450,
    height=600,
)

# x axis: one tick per year (step 1, anchored at 2018).
# The earlier `nticks=...` calls were dropped: `nticks` only applies when
# tickmode is "auto", so they had no effect once tickmode="linear" was set, and
# the y-axis value was derived from the unrelated numOfLanguages column.
fig.update_xaxes(
    tickmode="linear",
    tick0=2018,
    dtick=1,
    title_font=dict(size=18),
    tickfont=dict(size=18),
)

# y axis is switched to log scale below. On a log axis tick0/dtick are exponents:
# tick0=2 anchors the ticks at 10^2 and dtick=1 places one tick per power of ten.
fig.update_yaxes(
    tickmode="linear",
    tick0=2,
    dtick=1,
    title_font=dict(size=18),
    tickfont=dict(size=18),
)
fig.update_layout(yaxis_type="log", font=dict(size=20, family="Times New Roman", color='black'))

# Per-point label placement. This relies on the figure holding a single trace
# whose points are in `df` row order (the trace printed in a later cell lists
# all 14 benchmarks in that order).
fig.update_traces(
    textposition=df["textposition"],
    textfont=dict(color="black", size=18),
    mode='markers+text',
    texttemplate='%{text}'
)

# Show the bubble chart, then export the same figure as a PDF.
fig.show()
fig.write_image("../data/img/benchmark-stats.pdf")

In [15]:
# NOTE(review): `bubble_name` is not defined in any visible cell — presumably left
# over from a since-removed cell; confirm, then define it or drop this cell.
{"name": bubble_name}

{'name': 'rewordQALD9'}

In [25]:
# Dump every trace of the current figure as text (inspection aid).
for trace in fig.data:
    print(trace)

Scatter({
    'hovertemplate': ('<b>%{hovertext}</b><br><br>Pub' ... 'br>name=%{text}<extra></extra>'),
    'hovertext': array(['QALD-3', 'QALD-6', 'QALD-7', 'QALD-8', 'QALD-9', 'QALD-9-plus',
                        'rewordQALD9', 'QALD10', 'EventQA', 'RuBQ 1.0', 'RuBQ 2.0', 'MCWQ',
                        'Mintaka', 'MLPQ'], dtype=object),
    'legendgroup': '',
    'marker': {'color': array([ 6,  8,  6,  6, 11,  9,  2,  4,  3,  2,  2,  4,  9,  3]),
               'coloraxis': 'coloraxis',
               'size': array([ 6,  8,  6,  6, 11,  9,  2,  4,  3,  2,  2,  4,  9,  3]),
               'sizemode': 'area',
               'sizeref': 0.0275,
               'symbol': 'circle'},
    'mode': 'markers+text',
    'name': '',
    'orientation': 'v',
    'showlegend': False,
    'text': array(['QALD-3', 'QALD-6', 'QALD-7', 'QALD-8', 'QALD-9', 'QALD-9-plus',
                   'rewordQALD9', 'QALD10', 'EventQA', 'RuBQ 1.0', 'RuBQ 2.0', 'MCWQ',
                   'Mintaka', 'MLPQ'], dtype=o