In [None]:
import pandas as pd

# Load the SIB-200 language workbook; the relative path is resolved against
# the notebook's working directory.
SIB200_WORKBOOK = 'SIB-200 languages - ACL.xlsx'
df = pd.read_excel(SIB200_WORKBOOK)
df

In [None]:
import pandas as pd

# Parameter count of each model, expressed in millions (e.g. 'xglm-1.7B' -> 1700),
# keyed by the model identifier that follows the metric prefix in the score columns.
model_params = dict([
    # XGLM family
    ('xglm-564M', 564),
    ('xglm-1.7B', 1700),
    ('xglm-2.9B', 2900),
    ('xglm-7.5B', 7500),
    # BLOOM family
    ('bloom-560M', 560),
    ('bloom-1b1', 1100),
    ('bloom-1b7', 1700),
    ('bloom-3b', 3000),
    ('bloom-7b1', 7100),
    # BLOOMZ family (same sizes as BLOOM)
    ('bloomz-560M', 560),
    ('bloomz-1b1', 1100),
    ('bloomz-1b7', 1700),
    ('bloomz-3b', 3000),
    ('bloomz-7b1', 7100),
])

# Function to extract model name from the 'Model' column
def extract_model_name(model_string):
    """Return the model identifier embedded in a score-column label.

    Labels are expected to look like ``'<metric> <model>'`` (for example
    ``'F1 bloom-560M'``), in which case the second whitespace-separated token
    is returned.

    Args:
        model_string: Column label taken from the melted 'Model' column.

    Returns:
        The second token when the label has at least two tokens; the only
        token when the label has a single token; ``''`` for an empty or
        whitespace-only label.
    """
    tokens = model_string.split()
    if len(tokens) >= 2:
        return tokens[1]
    # A label without a metric prefix used to raise IndexError; fall back to
    # whatever is there so one odd column does not abort the whole pipeline.
    return tokens[0] if tokens else ''

# Look up the parameter count recorded for a model name
def get_params(model_name):
    """Return the entry of ``model_params`` for ``model_name``.

    Names that are not present in the table yield 0 instead of raising.
    """
    if model_name in model_params:
        return model_params[model_name]
    return 0

# Descriptive (non-score) columns that are repeated on every melted row
id_columns = ['Language Name', 'Language Family', 'Script (ISO 15924)', 'Resource Level', 'Population', 'Language Vitality', 'Digital Language Support',
              'Bloom Train Data Percentage', 'BLOOMZ Finetune Data', 'XGLM Train Percentage']

# Per-model score columns; presumably every column from position 15 onwards is an
# 'F1 <model>' column -- TODO confirm against the spreadsheet layout
score_columns = list(df.columns[15:])

# Select relevant columns for analysis
columns_to_plot = id_columns + score_columns

# Melt the DataFrame to make it suitable for plotting: one row per
# (language, model) pair. The id and value column lists must be disjoint; the
# previous slices ([:10] and [9:]) both contained 'XGLM Train Percentage', so a
# non-score column was also requested as a value column.
melted_df = pd.melt(df, id_vars=id_columns, value_vars=score_columns,
                    var_name='Model', value_name='F1 Score')

# Attach the parameter count of the model each row refers to (0 when unknown)
melted_df['Num_Params'] = melted_df['Model'].apply(extract_model_name).apply(get_params)
melted_df

In [None]:
import plotly.io as pio
# Make every subsequent `fig.show()` use plotly's 'vscode' renderer.
# NOTE(review): presumably chosen because the notebook is run inside VS Code;
# change this value when running in another front end.
pio.renderers.default = 'vscode'

In [None]:
from functools import reduce
import operator
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

def filter_df(melted_df: pd.DataFrame, filter_column: str = None, value: str = None, only_bloom: bool = False, only_bloomz: bool = False, only_xglm: bool = False) -> pd.DataFrame:
    """Return the rows of ``melted_df`` that satisfy every requested restriction.

    Args:
        melted_df: Frame to filter.
        filter_column: Column to compare against ``value``; the equality
            restriction is applied only when both this and ``value`` are given.
        value: Value that ``filter_column`` must equal.
        only_bloom: Keep only rows with 'Bloom Train Data Percentage' > 0.
        only_bloomz: Keep only rows with 'BLOOMZ Finetune Data' > 0.
        only_xglm: Keep only rows with 'XGLM Train Percentage' > 0.

    Returns:
        The filtered frame, or ``melted_df`` itself when no restriction is active.
    """
    masks = []

    if not (filter_column is None or value is None):
        masks.append(melted_df[filter_column] == value)

    # Each enabled flag requires a strictly positive value in its column.
    flag_columns = (
        (only_bloom, 'Bloom Train Data Percentage'),
        (only_bloomz, 'BLOOMZ Finetune Data'),
        (only_xglm, 'XGLM Train Percentage'),
    )
    for enabled, column in flag_columns:
        if enabled:
            masks.append(melted_df[column] > 0)

    # Nothing to filter on: hand back the input untouched.
    if len(masks) == 0:
        return melted_df

    # AND all the boolean masks together, left to right.
    keep = masks[0]
    for mask in masks[1:]:
        keep = keep & mask

    return melted_df[keep]

def generate_box(melted_df, filter_column, value, only_bloomz, only_bloom, only_xglm):
    """Show box plots of F1 scores, one subplot per model family.

    The frame is first restricted with ``filter_df``; each of the Bloom,
    Bloomz and XGLM families then gets its own subplot (shared y axis) with
    one box per model whose 'Model' label starts with ``'F1 <family>-'``.

    Note the parameter order: ``only_bloomz`` comes before ``only_bloom``.

    Args:
        melted_df: Long-format frame with 'Model' and 'F1 Score' columns.
        filter_column: Column to filter on, or None for no column filter.
        value: Value ``filter_column`` must equal.
        only_bloomz: Forwarded to ``filter_df``.
        only_bloom: Forwarded to ``filter_df``.
        only_xglm: Forwarded to ``filter_df``.
    """
    # Forward the flags by keyword; use the `only_xglm` argument rather than
    # a same-looking global so the function does not depend on notebook state.
    filtered_df = filter_df(melted_df, filter_column, value,
                            only_bloom=only_bloom, only_bloomz=only_bloomz, only_xglm=only_xglm)
    unique_models = ['Bloom', 'Bloomz', 'XGLM']

    # Create subplots
    fig = make_subplots(rows=1, cols=len(unique_models), shared_yaxes=True,
                        subplot_titles=unique_models)

    # One subplot per family; subplot columns are 1-based
    for subplot_col, model_name in enumerate(unique_models, start=1):
        # The trailing '-' keeps 'bloom-' from also matching 'bloomz-...'
        prefix = "F1 " + model_name.lower() + '-'
        model_data = filtered_df[filtered_df['Model'].str.startswith(prefix)]
        fig.add_trace(go.Box(x=model_data['Model'], y=model_data['F1 Score'], name=model_name),
                      row=1, col=subplot_col)

    # Update layout (test the argument, not the builtin `filter`)
    if filter_column is None:
        title = f'Box Plot for F1 Scores (Filter: None, Only Bloomz = {only_bloomz}, Only Bloom = {only_bloom}, Only XGLM = {only_xglm})'
    else:
        title = f'Box Plot for F1 Scores ({filter_column} = {value}, Only Bloomz = {only_bloomz}, Only Bloom = {only_bloom}, Only XGLM = {only_xglm})'
    fig.update_layout(title=title,
                      xaxis_title='Model',
                      yaxis_title='F1 Score',
                      showlegend=False,
                      height=600)

    # Show plot
    fig.show()

def generate_polar_plot(melted_df, filter_column, model_name, value, only_bloom, only_bloomz, only_xglm):
    """Show a radar chart of F1 scores per language for one model family.

    The frame is restricted with ``filter_df`` and then to rows whose 'Model'
    label starts with ``'F1 <model_name lower-cased>-'``. Each model is drawn
    as its own closed line, with 'Language Name' on the angular axis and
    'F1 Score' on the radial axis.

    Args:
        melted_df: Long-format frame with 'Model', 'F1 Score' and
            'Language Name' columns.
        filter_column: Column to filter on, or None for no column filter.
        model_name: Model family name, e.g. 'Bloom', 'Bloomz' or 'XGLM'.
        value: Value ``filter_column`` must equal.
        only_bloom: Forwarded to ``filter_df``.
        only_bloomz: Forwarded to ``filter_df``.
        only_xglm: Forwarded to ``filter_df``.
    """
    # Keyword arguments make sure each flag reaches its own filter; passing
    # them positionally previously sent `only_bloom` in the `only_bloomz` slot.
    filtered_df = filter_df(melted_df, filter_column, value,
                            only_bloom=only_bloom, only_bloomz=only_bloomz, only_xglm=only_xglm)
    model_data = filtered_df[filtered_df['Model'].str.startswith("F1 " + model_name.lower() + '-')]
    fig = px.line_polar(model_data, r='F1 Score', theta='Language Name', line_close=True,
                        color='Model', title=f'Radar Chart for F1 Scores for {model_name} ({filter_column} = {value}, Only Bloomz = {only_bloomz}, Only Bloom = {only_bloom}, Only XGLM = {only_xglm})',
                        height=600, width=800)
    fig.show()
    
def genetate_line(melted_df, filter_column, value, only_bloom, only_bloomz, only_xglm):
    """Show mean F1 score against parameter count, one line per model family.

    The frame is restricted with ``filter_df``. For each of the Bloom, Bloomz
    and XGLM families the rows are grouped by 'Num_Params'; the group mean of
    'F1 Score' is plotted as a line and the group variance is drawn as the
    y error bar.

    Args:
        melted_df: Long-format frame with 'Model', 'F1 Score' and
            'Num_Params' columns.
        filter_column: Column to filter on, or None for no column filter.
        value: Value ``filter_column`` must equal.
        only_bloom: Forwarded to ``filter_df``.
        only_bloomz: Forwarded to ``filter_df``.
        only_xglm: Forwarded to ``filter_df``.
    """
    # Filter the dataframe
    filtered_df = filter_df(melted_df, filter_column, value, only_bloom, only_bloomz, only_xglm)

    # Model families to draw, in legend order
    unique_models = ['Bloom', 'Bloomz', 'XGLM']

    # Create an empty figure
    fig = px.line()

    # Define a color map for each model
    color_map = {'Bloom': 'red', 'Bloomz': 'blue', 'XGLM': 'green'}

    for model_name in unique_models:
        family_rows = filtered_df['Model'].str.startswith("F1 " + model_name.lower() + '-')

        # `.assign` returns a new frame, so the family label is not written
        # into a slice of `filtered_df` (which triggers SettingWithCopyWarning).
        model_data = filtered_df[family_rows].assign(model_name=model_name)

        # Mean and variance of the F1 score per parameter count; both come
        # from the same grouping, so their rows line up one-to-one.
        grouped_scores = model_data.groupby(['Num_Params', 'model_name'])['F1 Score']
        mean_data = grouped_scores.mean().reset_index()
        variance_data = grouped_scores.var().reset_index()

        # Add a trace for each model with a specified color and error bars
        # representing variance. The trace is named and shown in the legend so
        # that `showlegend=True` below actually lists the three families.
        family_trace = px.line(mean_data, x='Num_Params', y='F1 Score', hover_name='model_name').update_traces(
            name=model_name, showlegend=True,
            line=dict(color=color_map[model_name]), error_y=dict(array=variance_data['F1 Score'])).data[0]
        fig.add_trace(family_trace)

    # Update layout
    fig.update_layout(title=f'Line Plot for Mean F1 Scores with Variance (Filter: {filter_column} = {value}, Only Bloomz = {only_bloomz}, Only Bloom = {only_bloom}, Only XGLM = {only_xglm})',
                      xaxis_title='Number of Parameters',
                      yaxis_title='Mean F1 Score',
                      showlegend=True,
                      height=600)
    fig.show()

In [None]:
# Row-restriction flags forwarded to generate_box
only_bloom = False
only_bloomz = False
only_xglm = False
# Misspelled alias of `only_xglm`, kept so that any code still reading the old
# global name keeps working.
only_gxlm = only_xglm

# we can filter based on 'Language Family', 'Script (ISO 15924)', 'Resource Level', 'Population', 'Language Vitality', 'Digital Language Support'.
filter_column = None
value = None

if filter_column is None or value is not None:
    # Either no column filter at all, or one specific value: a single figure.
    generate_box(melted_df, filter_column, value, only_bloomz, only_bloom, only_xglm)
else:
    # A column but no value: one figure per distinct value in that column.
    # A separate loop variable keeps the `value` setting above intact.
    for column_value in df[filter_column].unique():
        generate_box(melted_df, filter_column, column_value, only_bloomz, only_bloom, only_xglm)

In [None]:
import warnings

# Silences every Python warning from here on, for the rest of the kernel session.
warnings.filterwarnings('ignore')

# Row-restriction flags forwarded to generate_polar_plot
only_bloomz = False
only_bloom = False
only_xglm = False

# we can filter based on 'Language Family', 'Script (ISO 15924)', 'Resource Level', 'Population', 'Language Vitality', 'Digital Language Support'.
model_name = 'XGLM'
filter_column = 'Resource Level'
value = None

if filter_column is None or value is not None:
    # Either no column filter at all, or one specific value: a single figure.
    generate_polar_plot(melted_df, filter_column, model_name, value, only_bloom, only_bloomz, only_xglm)
else:
    # A column but no value: one radar chart per distinct value in that column.
    # Pass this cell's `only_xglm`, not a similarly named global from another cell.
    for column_value in df[filter_column].unique():
        generate_polar_plot(melted_df, filter_column, model_name, column_value, only_bloom, only_bloomz, only_xglm)


In [None]:
import plotly.express as px

# Row-restriction flags forwarded to genetate_line; here only the BLOOMZ
# restriction is switched on.
only_bloomz = True
only_bloom = False
only_xglm = False

filter_column = None
value = None

if filter_column is None or value is not None:
    # Either no column filter at all, or one specific value: a single figure.
    genetate_line(melted_df, filter_column, value, only_bloom, only_bloomz, only_xglm)
else:
    # A column but no value: one figure per distinct value in that column.
    # A separate loop variable keeps the `value` setting above intact.
    for column_value in df[filter_column].unique():
        genetate_line(melted_df, filter_column, column_value, only_bloom, only_bloomz, only_xglm)

In [None]:
import plotly.express as px

# Row-restriction flags forwarded to genetate_line; all off, so every row is used.
only_bloomz = False
only_bloom = False
only_xglm = False

filter_column = None
value = None

if filter_column is None or value is not None:
    # Either no column filter at all, or one specific value: a single figure.
    genetate_line(melted_df, filter_column, value, only_bloom, only_bloomz, only_xglm)
else:
    # A column but no value: one figure per distinct value in that column.
    # A separate loop variable keeps the `value` setting above intact.
    for column_value in df[filter_column].unique():
        genetate_line(melted_df, filter_column, column_value, only_bloom, only_bloomz, only_xglm)

In [None]:
import plotly.figure_factory as ff
import pandas as pd

# Row-restriction flags forwarded to filter_df; here only the Bloom
# restriction is switched on.
only_bloomz = False
only_bloom = True
only_xglm = False

filter_column = 'Resource Level'
value = 5

model_name = 'Bloom'

# Filter the dataframe
filtered_df = filter_df(melted_df, filter_column, value, only_bloom, only_bloomz, only_xglm)
model_data = filtered_df[filtered_df['Model'].str.startswith("F1 " + model_name.lower() + '-')]

# Calculate correlation matrix over the numeric columns only. The melted frame
# also carries text columns (e.g. 'Language Name', 'Model'), and calling
# `.corr()` on those raises in recent pandas versions.
# NOTE: columns that are constant after filtering (such as the filter column
# itself) have no defined correlation and show up as NaN.
numeric_data = model_data.select_dtypes(include='number')
correlation_matrix = numeric_data.corr()

# Create annotated heatmap
fig = ff.create_annotated_heatmap(z=correlation_matrix.values,
                                  x=correlation_matrix.columns.tolist(),
                                  y=correlation_matrix.index.tolist(),
                                  colorscale='Viridis',
                                  annotation_text=correlation_matrix.round(2).values,
                                  showscale=True)

# Set title
fig.update_layout(title_text=f'Correlation Matrix of {model_name}')

# Show the plot
fig.show()