In [None]:
import pandas as pd

# Read the SIB-200 spreadsheet into a DataFrame and display it as the cell output.
df = pd.read_excel(io='SIB-200 languages - ACL.xlsx')
df

In [2]:
import pandas as pd

# Descriptive columns: carried through the reshape unchanged as identifiers.
id_columns = [
    'Language Name', 'Language Family', 'Script (ISO 15924)', 'Resource Level',
    'Population', 'Language Vitality', 'Digital Language Support',
    'Bloom Train Data Percentage', 'BLOOMZ Finetune Data',
]
# Score columns: one per model; their names end up in the 'Model' column.
f1_columns = ['F1 xglm-564M top_logprobs', 'F1 xglm-1.7B top_logprobs']
columns_to_plot = id_columns + f1_columns

# Reshape wide -> long so each original row appears once per model,
# with the model's column name in 'Model' and its value in 'F1 Score'.
melted_df = df.melt(
    id_vars=id_columns,
    value_vars=f1_columns,
    var_name='Model',
    value_name='F1 Score',
)
melted_df

Unnamed: 0,Language Name,Language Family,Script (ISO 15924),Resource Level,Population,Language Vitality,Digital Language Support,Bloom Train Data Percentage,BLOOMZ Finetune Data,Model,F1 Score
0,Aceh,Austronesian,Arab,1,1 million to 1 billion,Endangered,Ascending,,,F1 xglm-564M top_logprobs,0.171161
1,Aceh,Austronesian,Latn,1,1 million to 1 billion,Endangered,Ascending,,,F1 xglm-564M top_logprobs,0.361237
2,Mesopotamian Spoken Arabic,Afro-Asiatic,Arab,,1 million to 1 billion,Institutional,Emerging,,,F1 xglm-564M top_logprobs,0.465124
3,"Arabic, Ta’izzi-Adeni Spoken",Afro-Asiatic,Arab,,1 million to 1 billion,Institutional,Emerging,,,F1 xglm-564M top_logprobs,0.502488
4,Tunisian Spoken Arabic,Afro-Asiatic,Arab,,1 million to 1 billion,Institutional,Emerging,,,F1 xglm-564M top_logprobs,0.507390
...,...,...,...,...,...,...,...,...,...,...,...
403,Yue Chinese,Sino-Tibetan,Hant,,1 million to 1 billion,Institutional,Vital,,,F1 xglm-1.7B top_logprobs,0.532076
404,Chinese,Sino-Tibetan,Hans,5,1 billion plus,Institutional,Thriving,16.200,4.51,F1 xglm-1.7B top_logprobs,0.530438
405,Chinese,Sino-Tibetan,Hant,5,1 billion plus,Institutional,Thriving,0.050,,F1 xglm-1.7B top_logprobs,0.592133
406,Standard Malay,Austronesian,Latn,3,,Institutional,Vital,,,F1 xglm-1.7B top_logprobs,0.591677


In [3]:
import plotly.io as pio
# Make 'vscode' the default Plotly renderer for figures shown later on.
# NOTE(review): presumably this notebook is run inside VS Code — confirm, and
# pick a different renderer name when running it elsewhere.
pio.renderers.default = 'vscode'

In [5]:
from functools import reduce
import operator

def filter_df(melted_df: pd.DataFrame, filter: str = None, value: object = None, only_bloom: bool = False, only_bloomz: bool = False) -> pd.DataFrame:
    """Return the rows of ``melted_df`` that satisfy every requested condition.

    Parameters
    ----------
    melted_df : pd.DataFrame
        Frame to filter. It must contain the column named by ``filter`` and,
        when the matching flag is set, 'Bloom Train Data Percentage' and/or
        'BLOOMZ Finetune Data'.
    filter : str, optional
        Name of the column to compare against ``value``. (The name shadows
        the builtin inside this function; it is kept so existing keyword
        calls keep working.)
    value : scalar, optional
        Value the ``filter`` column must equal. A missing value such as NaN
        selects the rows where the column itself is missing, so values taken
        from ``Series.unique()`` always select their own group.
        The column condition is applied only when both ``filter`` and
        ``value`` are not None.
    only_bloom : bool
        Keep only rows with 'Bloom Train Data Percentage' > 0.
    only_bloomz : bool
        Keep only rows with 'BLOOMZ Finetune Data' > 0.

    Returns
    -------
    pd.DataFrame
        ``melted_df`` itself when no condition was requested, otherwise the
        subset of rows for which all conditions hold.

    Raises
    ------
    KeyError
        If a required column is not present in ``melted_df``.
    """
    conditions = []
    if filter is not None and value is not None:
        column = melted_df[filter]
        if pd.api.types.is_scalar(value) and pd.isna(value):
            # NaN never compares equal to anything (itself included), so an
            # equality test would silently return no rows for this group.
            conditions.append(column.isna())
        else:
            conditions.append(column == value)
    if only_bloom:
        # Comparisons against NaN are False, so rows without data drop out.
        conditions.append(melted_df['Bloom Train Data Percentage'] > 0)
    if only_bloomz:
        conditions.append(melted_df['BLOOMZ Finetune Data'] > 0)

    # Nothing requested: hand back the original frame untouched.
    if not conditions:
        return melted_df

    # AND all boolean masks together into a single row selector.
    combined_conditions = reduce(operator.and_, conditions)

    return melted_df[combined_conditions]

In [9]:
import plotly.express as px
# No column filter and no BLOOM/BLOOMZ restriction: every row is plotted.
filter, value = None, None
only_bloom, only_bloomz = False, False

filtered_df = filter_df(
    melted_df, filter, value,
    only_bloom=only_bloom, only_bloomz=only_bloomz,
)

# One box per model; the title records the filter settings that were used.
plot_title = f'Box Plot for F1 Scores For All Languages ({filter} = {value}, Only Bloomz = {only_bloomz}, Only Bloom = {only_bloom})'
fig = px.box(
    filtered_df,
    x='Model',
    y='F1 Score',
    color='Model',
    title=plot_title,
    labels={'F1 Score': 'F1 Score', 'Model': 'Model'},
)

fig.show()

In [12]:
import plotly.express as px

only_bloomz = False
only_bloom = False

# we can filter based on 'Language Family', 'Script (ISO 15924)', 'Resource Level', 'Population', 'Language Vitality', 'Digital Language Support'.
filter = 'Language Family'

# Draw one box plot per distinct value of the chosen column.
for value in df[filter].unique():
    filtered_df = filter_df(melted_df, filter, value, only_bloom, only_bloomz)
    if filtered_df.empty:
        # unique() can yield values that select no rows (e.g. a missing
        # category, or one removed by the BLOOM/BLOOMZ flags); skip them
        # instead of rendering a blank figure.
        print(f'Skipping {filter} = {value}: no matching rows')
        continue
    fig = px.box(filtered_df, x='Model', y='F1 Score',
                 color='Model', title=f'Box Plot for F1 Scores by Model ({filter} = {value}, Only Bloomz = {only_bloomz}, Only Bloom = {only_bloom})',
                 labels={'F1 Score': 'F1 Score', 'Model': 'Model'})
    fig.show()
# Disabled alternative: a scatter plot using Plotly Express, faceted by language family
# fig = px.scatter(melted_df, x='Language Name', y='F1 Score', color='Model',
#                  facet_col='Language Family', facet_col_wrap=3,
#                  labels={'F1 Score': 'F1 Score', 'Language Name': 'Language Name'},
#                  title='F1 Score by Language and Model')
# 
# # Customize layout if needed
# fig.update_layout(height=2000, width=1200)
# 
# # Show the plot
# fig.show()

In [28]:
import plotly.express as px
import warnings

# Suppress every warning from this point on (this installs a global filter).
warnings.filterwarnings('ignore')

# Radar chart for all models in the same plot using plotly
only_bloomz = True
only_bloom = False
filter = 'Resource Level'

# Draw one radar chart per distinct value of the chosen column.
for value in df[filter].unique():
    filtered_df = filter_df(melted_df, filter, value, only_bloom, only_bloomz)
    if filtered_df.empty:
        # Some values select no rows (e.g. a missing category, or a level
        # where no row has BLOOMZ finetune data); skip them instead of
        # rendering a blank chart.
        print(f'Skipping {filter} = {value}: no matching rows')
        continue
    fig = px.line_polar(filtered_df, r='F1 Score', theta='Language Name', line_close=True,
                        color='Model', title=f'Radar Chart for F1 Scores by Language and Model ({value})',
                        height=600, width=800)
    fig.show()