## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import math
import pandas_bokeh
import plotly.express as px
import scipy

In [None]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

In [None]:
# Render Pandas-Bokeh figures inline in the notebook output cells.
pandas_bokeh.output_notebook()

## Import Data

In [None]:
# Load the GenBit metrics (product-level and word-level CSVs) that were
# calculated in 2.0_using_genbit_to_measure_bias.ipynb.
product_metrics = pd.read_csv(
    filepath_or_buffer="data/genbit_metrics/product_level_metrics_v2.csv"
)
word_metrics = pd.read_csv(
    filepath_or_buffer="data/genbit_metrics/word_level_metrics_v2.csv"
)

## Preview Dataframes

In [None]:
# Preview the first rows of the product-level metrics.
product_metrics.head()

In [None]:
# Preview the first rows of the word-level metrics.
word_metrics.head()

## Top 10 Products/Models by Female %, Male % and Non-Binary %

In [None]:
# Ten rows with the highest percentage of female gender-definition words.
(
    product_metrics
    .sort_values(by="percentage_of_female_gender_definition_words", ascending=False)
    .head(10)
)

In [None]:
# Ten rows with the highest percentage of male gender-definition words.
(
    product_metrics
    .sort_values(by="percentage_of_male_gender_definition_words", ascending=False)
    .head(10)
)

In [None]:
# Ten rows with the highest percentage of non-binary gender-definition words.
(
    product_metrics
    .sort_values(by="percentage_of_non_binary_gender_definition_words", ascending=False)
    .head(10)
)

In [None]:
# Rows for the gpt-4-0613 model whose genbit_score is greater than 1.5.
product_metrics.loc[
    product_metrics["model"].eq("gpt-4-0613")
    & product_metrics["genbit_score"].gt(1.5)
]

## Plot Overall Statistics by Model

### Distribution of Genbit Scores

In [None]:
# Box plot of genbit_score per model; every row is also drawn as a point and
# the product name is added to the hover tooltip.
fig = px.box(
    product_metrics,
    x="model",
    y="genbit_score",
    color="model",
    points="all",
    hover_data=["product"],
    category_orders={"model": ["Bard - PaLM", "gpt-3.5-turbo-0301", "gpt-4-0613"]},
    color_discrete_sequence=["#CE0099", "#8854FC", "#00CEC3"],
    title="Distribution of Genbit Score by Model",
    height=600,
    width=1000,
)
fig.update_layout(font={"size": 18})
fig.show()

### Female vs. Male vs. Non-Binary Words

In [None]:
def _pivot_by_model(metric_column):
    """Return a product x model table of `metric_column`, sorted by the Bard - PaLM column (descending)."""
    wide = product_metrics.pivot(index="product", columns="model", values=metric_column)
    return wide.sort_values(by=["Bard - PaLM"], ascending=False)


# One wide table per gender category: rows are products, columns are models.
female_words = _pivot_by_model("percentage_of_female_gender_definition_words")
male_words = _pivot_by_model("percentage_of_male_gender_definition_words")
non_binary_words = _pivot_by_model("percentage_of_non_binary_gender_definition_words")

In [None]:
#Pandas_Bokeh requires a patch to function:
#https://github.com/PatrikHlobil/Pandas-Bokeh/issues/128#issuecomment-1535794247



In [None]:
import pandas
import pandas_bokeh

# Horizontal bar chart for the first ten products of female_words (which is
# sorted by Bard - PaLM, descending), re-ordered by gpt-3.5-turbo-0301 ascending.
# show_figure=False returns the figure instead of displaying it immediately.
female_plot = (
    female_words.head(10)
    .sort_values(by="gpt-3.5-turbo-0301", ascending=True)
    .plot_bokeh.barh(
        y=["Bard - PaLM", "gpt-3.5-turbo-0301", "gpt-4-0613"],
        xlabel="Percentage of Female Definition Words",
        ylabel="Product",
        title="Percentage of Female Words",
        figsize=(500, 500),
        colormap=["#00CEC3", "#8854FC", "#CE0099"],
        legend="bottom_right",
        fontsize_label="10pt",
        fontsize_ticks="10pt",
        fontsize_title="12pt",
        show_figure=False,
    )
)

In [None]:
# Horizontal bar chart for the first ten products of male_words (which is
# sorted by Bard - PaLM, descending), re-ordered by gpt-3.5-turbo-0301 ascending.
# show_figure=False returns the figure instead of displaying it immediately.
male_plot = (
    male_words.head(10)
    .sort_values(by="gpt-3.5-turbo-0301", ascending=True)
    .plot_bokeh.barh(
        y=["Bard - PaLM", "gpt-3.5-turbo-0301", "gpt-4-0613"],
        xlabel="Percentage of Male Definition Words",
        ylabel="Product",
        title="Percentage of Male Words",
        figsize=(500, 500),
        colormap=["#00CEC3", "#8854FC", "#CE0099"],
        legend="bottom_right",
        fontsize_label="10pt",
        fontsize_ticks="10pt",
        fontsize_title="12pt",
        show_figure=False,
    )
)

In [None]:
# Horizontal bar chart for the first ten products of non_binary_words (which is
# sorted by Bard - PaLM, descending), re-ordered by Bard - PaLM ascending.
# show_figure=False returns the figure instead of displaying it immediately.
non_binary_plot = (
    non_binary_words.head(10)
    .sort_values(by="Bard - PaLM", ascending=True)
    .plot_bokeh.barh(
        y=["Bard - PaLM", "gpt-3.5-turbo-0301", "gpt-4-0613"],
        xlabel="Percentage of Non-Binary Definition Words",
        ylabel="Product",
        title="Percentage of Non-Binary Words",
        figsize=(500, 500),
        colormap=["#00CEC3", "#8854FC", "#CE0099"],
        legend="bottom_right",
        fontsize_label="10pt",
        fontsize_ticks="10pt",
        fontsize_title="12pt",
        show_figure=False,
    )
)

In [None]:
# Lay out the female and male charts next to each other in a one-row grid.
pandas_bokeh.plot_grid([[female_plot,male_plot]])

In [None]:
# Lay out the male and non-binary charts next to each other in a one-row grid.
pandas_bokeh.plot_grid([[male_plot,non_binary_plot]])