### Example of biased causal reasoning: from name to career choice

In [None]:
# from colabtools import sheets
import pandas as pd
from IPython import display

### Read data

In [None]:
# Load the three answer files into separate DataFrames
# (presumably one file per evaluation run -- confirm against the data pipeline).
df_1, df_2, df_3 = (
    pd.read_csv(csv_path)
    for csv_path in ('All_answers_1.csv', 'All_answers_2.csv', 'All_answers_3.csv')
)

In [None]:
# Default working frame: an independent copy of the first answer file, so
# changes made to `df` do not modify df_1.
df = df_1.copy()

In [None]:
# Aggregated statistics are stored here once the per-file results are combined.
stats_dict = {}
# Label values that may appear in the causal_label_<model> columns.
causal_graph_labels = ['b', 'n', 'nr', 'r', 'mr', 'mb', 'm']
# The first eight distinct sensitive attributes, in order of first appearance in df_1.
sensitive_attributes = list(pd.unique(df_1['sensitive_attribute'])[:8])

In [None]:
#@title Statistics for 8 sensitive categories
def get_stats(df, attributes=None,
              model_names=('gemma', 'gemini', 'llama_70B', 'claude')):
  """Compute per-attribute answer statistics for every model.

  For each sensitive attribute and each question category ('biased' and
  'risky'), the rows are split by the model's answer label: the string '1'
  counts as correct and the string '0' as wrong. Rows with any other label
  count toward neither, so the two ratios need not sum to 1.

  Args:
    df: DataFrame with the columns 'sensitive_attribute' and 'category' and,
      for every model, '<model>_answer_label' and 'causal_label_<model>'.
    attributes: Sensitive attributes to report on. Defaults to the
      module-level `sensitive_attributes` list.
    model_names: Model prefixes used to build the per-model column names.

  Returns:
    A nested dict
    {attribute: {'<category>_<model>': {'correct_ratio': float,
                                        'correct_distribution': dict,
                                        'wrong_ratio': float,
                                        'wrong_distribution': dict}}}
    where each distribution maps a causal label to its number of rows.

  Raises:
    ZeroDivisionError: If an attribute has no rows in one of the categories.
  """
  if attributes is None:
    attributes = sensitive_attributes

  stats_dict = {}
  for s in attributes:
    attribute_rows = df[df['sensitive_attribute'] == s]
    # Filter each category once; every model reuses the same subset.
    rows_by_category = {
        category: attribute_rows[attribute_rows['category'] == category]
        for category in ('biased', 'risky')
    }
    stats_dict[s] = {}
    for model_name in model_names:
      for category, category_rows in rows_by_category.items():
        answer_labels = category_rows[f"{model_name}_answer_label"]
        causal_labels = category_rows[f"causal_label_{model_name}"]
        total = len(category_rows)
        entry = {}
        for outcome, flag in (('correct', '1'), ('wrong', '0')):
          selected = causal_labels[answer_labels == flag]
          entry[f'{outcome}_ratio'] = len(selected) / total
          entry[f'{outcome}_distribution'] = selected.value_counts().to_dict()
        stats_dict[s][f'{category}_{model_name}'] = entry
  return stats_dict

# One statistics dictionary per loaded answer file.
stats_dict_1, stats_dict_2, stats_dict_3 = (
    get_stats(answers) for answers in (df_1, df_2, df_3)
)

In [None]:
def _merge_counts(distributions):
    """Sum the counts of several {label: count} dictionaries key by key."""
    merged = {}
    for dist in distributions:
        for key, value in dist.items():
            merged[key] = merged.get(key, 0) + value
    return merged


def aggregate_dictionaries(dicts):
    """Aggregate statistics dictionaries that share one nested structure.

    Each input has the shape
    {outer_key: {inner_key: {'correct_ratio', 'correct_distribution',
                             'wrong_ratio', 'wrong_distribution'}}}.
    Ratios are averaged over the inputs; distributions are merged by summing
    the counts of equal labels.

    Args:
        dicts: A non-empty list of dictionaries with the same structure. The
            keys of the first dictionary decide which entries are aggregated.

    Returns:
        A new dictionary with the same structure holding the averaged ratios
        and the summed distributions.

    Raises:
        ValueError: If `dicts` is empty.
        KeyError: If a later dictionary lacks a key present in the first one.
    """
    if not dicts:
        raise ValueError("At least one dictionary is required.")

    count = len(dicts)
    aggregated = {}
    for category, model_types in dicts[0].items():
        aggregated[category] = {}
        for model_type in model_types:
            entries = [d[category][model_type] for d in dicts]
            aggregated[category][model_type] = {
                'correct_ratio': sum(e['correct_ratio'] for e in entries) / count,
                'correct_distribution': _merge_counts(
                    e['correct_distribution'] for e in entries),
                'wrong_ratio': sum(e['wrong_ratio'] for e in entries) / count,
                'wrong_distribution': _merge_counts(
                    e['wrong_distribution'] for e in entries),
            }

    return aggregated

In [None]:
# Combine the three per-file statistics into one dictionary
# (ratios averaged, distributions summed).
stats_dict = aggregate_dictionaries([stats_dict_1, stats_dict_2, stats_dict_3])

In [None]:
# Print, per sensitive attribute, the "risky" statistics of the listed models.
# NOTE(review): the active branch reads stats_dict_3 (third file only), whereas
# the commented-out branch reads the aggregated stats_dict -- confirm which
# source is intended.
for s in sensitive_attributes:
  # for m in ['llama_70B']:
  #   print(f"-----------------------------biased {s} {m}-------------------------------------")
  #   print(stats_dict[s][f'biased_{m}'])
  #   print("\n")
  for m in ['gemini']:
    print(f"-----------------------------risky {s} {m}-------------------------------------")
    print(stats_dict_3[s][f'risky_{m}'])
    print("\n")

In [None]:
#@title Statistics for 3 mistaking categories
model_name = "llama_70B"

# Rows from position 1200 onward are split into three consecutive groups:
# two groups of 196 rows each, then everything that remains.
df_nj, df_nm, df_np = (
    df.iloc[start:stop]
    for start, stop in (
        (1200, 1200 + 196),
        (1200 + 196, 1200 + 196 * 2),
        (1200 + 196 * 2, None),
    )
)

# Causal-label counts per group, first for answer label '1' and then for '0'.
(nj_correct_dict, nm_correct_dict, np_correct_dict,
 nj_wrong_dict, nm_wrong_dict, np_wrong_dict) = (
    group[group[f'{model_name}_answer_label'] == flag]
    .value_counts(f'causal_label_{model_name}')
    .to_dict()
    for flag in ('1', '0')
    for group in (df_nj, df_nm, df_np)
)

In [None]:
# One dictionary per line: the three "correct" distributions, then the three "wrong" ones.
print(
    nj_correct_dict,
    nm_correct_dict,
    np_correct_dict,
    nj_wrong_dict,
    nm_wrong_dict,
    np_wrong_dict,
    sep="\n",
)

In [None]:
#@title Statistics for noDI
# Accumulator for the noDI correct ratios, keyed by sensitive attribute.
noDI_stats = {}

In [None]:
# Fill noDI_stats for every model rather than a single one: the comparison and
# overall-accuracy cells look up noDI_stats[s]['biased_<model>'] and
# noDI_stats[s]['risky_<model>'] for each of these models.
for model_name in ['gemma', 'llama_70B', 'gemini', 'claude']:
  for s in sensitive_attributes:
    if s not in noDI_stats:
      noDI_stats[s] = {}
    for category in ('biased', 'risky'):
      # All questions of this attribute in the category (the denominator).
      category_df = df[(df['sensitive_attribute'] == s) & (df['category'] == category)]
      # Those whose noDI answer label is the string '1'.
      correct_df = category_df[category_df[f"noDI_{model_name}_answer_label"] == '1']
      correct_ratio = len(correct_df) / len(category_df)
      noDI_stats[s][f'{category}_{model_name}'] = correct_ratio
      print(f"model: {model_name}; sensitive attribute: {s}; {category} correct ratio: {correct_ratio}")

In [None]:
#@title noDI Statistics for 3 mistaking categories
model_name = "llama_70B"

# Same three row groups as in the causal-graph analysis: two blocks of 196 rows
# starting at position 1200, then the remaining rows.
df_nj = df.iloc[1200:1200 + 196]
df_nm = df.iloc[1200 + 196:1200 + 196 * 2]
df_np = df.iloc[1200 + 196 * 2:]

# Fraction of rows in each group whose noDI answer label is '1'.
nj_correct_ratio, nm_correct_ratio, np_correct_ratio = (
    len(group[group[f'noDI_{model_name}_answer_label'] == '1']) / len(group)
    for group in (df_nj, df_nm, df_np)
)

print(
    f"model: {model_name}; name_job correct ratio: {nj_correct_ratio}",
    f"model: {model_name}; name_major correct ratio: {nm_correct_ratio}",
    f"model: {model_name}; name_personality correct ratio: {np_correct_ratio}",
    sep="\n",
)

### Visualization of biased question accuracy

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Aggregated correct ratio per sensitive attribute, for every model and both
# question categories. Only the biased_* lists are plotted in this cell; all
# eight lists stay module-level names so that other cells can reuse them.
biased_correct_gemma = [stats_dict[s]['biased_gemma']['correct_ratio'] for s in sensitive_attributes]
risky_correct_gemma = [stats_dict[s]['risky_gemma']['correct_ratio'] for s in sensitive_attributes]
biased_correct_gemini = [stats_dict[s]['biased_gemini']['correct_ratio'] for s in sensitive_attributes]
risky_correct_gemini = [stats_dict[s]['risky_gemini']['correct_ratio'] for s in sensitive_attributes]
biased_correct_llama_70B = [stats_dict[s]['biased_llama_70B']['correct_ratio'] for s in sensitive_attributes]
risky_correct_llama_70B = [stats_dict[s]['risky_llama_70B']['correct_ratio'] for s in sensitive_attributes]
biased_correct_claude = [stats_dict[s]['biased_claude']['correct_ratio'] for s in sensitive_attributes]
risky_correct_claude = [stats_dict[s]['risky_claude']['correct_ratio'] for s in sensitive_attributes]

# Plot the biased-question accuracies as a grouped bar chart.
fig, ax = plt.subplots(figsize=(11, 4))
index = np.arange(len(sensitive_attributes))
bar_width = 0.15
opacity = 0.8

# Tick labels: a copy of the attribute names in which the sixth entry
# (index 5) is displayed as "appearance"; the original list is untouched.
ss = sensitive_attributes.copy()
ss[5] = "appearance"

# (values, colour, legend label) for each bar in a group, left to right.
biased_bars = [
    (biased_correct_gemma, 'b', 'Gemma-27B-IT'),
    (biased_correct_llama_70B, 'c', 'Llama-3.1-70B-Instruct'),
    (biased_correct_gemini, 'r', 'gemini-1.5-pro-002'),
    (biased_correct_claude, 'm', 'claude-3-5-sonnet-v2-20241022'),
]
for position, (values, color, label) in enumerate(biased_bars):
  ax.bar(index + position * bar_width, values, bar_width,
         alpha=opacity, color=color, label=label)

plt.ylabel('Accuracy', fontsize=12)
# Put each tick under the middle of its group of bars, not under the first bar.
plt.xticks(index + bar_width * (len(biased_bars) - 1) / 2, ss, fontsize=12)
plt.yticks(fontsize=12)
plt.legend(fontsize=12)

plt.tight_layout()
plt.savefig('biased_acc_categories.pdf')
plt.show()

In [None]:
# Tabulate the per-attribute biased-question accuracies, one column per model.
import pandas as pd

biased_acc_columns = {
    'sensitive_attribute': sensitive_attributes,
    'Gemma-27B-IT': biased_correct_gemma,
    'Llama-3.1-70B-Instruct': biased_correct_llama_70B,
    'gemini-1.5-pro-002': biased_correct_gemini,
    'claude-3-5-sonnet-v2-20241022': biased_correct_claude,
}
biased_acc_df = pd.DataFrame(biased_acc_columns)
biased_acc_df

### Comparison of correct ratio for each sensitive category on biased / risky questions (DI vs noDI)

In [None]:
# One panel per sensitive attribute comparing biased-question accuracy without
# and with the causal graph, for every model.
fig, ax = plt.subplots(2, 4, figsize=(30, 10))
# Only models that stats_dict holds entries for ('llama_405B' is not produced
# by get_stats, so listing it raised a KeyError). noDI_stats must already
# contain 'biased_<model>' for each of these models.
models = ['gemma', 'llama_70B', 'gemini', 'claude']
index = np.arange(len(models))
bar_width = 0.35
opacity = 0.8
# ax.flat walks the 2x4 grid row by row, matching ax[i//4, i%4].
for panel, attribute in zip(ax.flat, sensitive_attributes):
  panel.bar(index, [noDI_stats[attribute]['biased_' + m] for m in models], bar_width,
            alpha=opacity, color='r', label='No causal graph')
  panel.bar(index + bar_width, [stats_dict[attribute]['biased_' + m]['correct_ratio'] for m in models], bar_width,
            alpha=opacity, color='b', label='With causal graph')
  panel.legend()
  # Two bars per model, so the midpoint lies half a bar width to the right.
  panel.set_xticks(index + bar_width / 2)
  panel.set_xticklabels(models, fontsize=12)
  panel.set_ylabel('Accuracy', fontsize=12)
  panel.set_yticks(np.arange(0, 1.1, 0.2))
  panel.set_title(attribute, fontsize=12)
plt.show()


In [None]:
# One panel per sensitive attribute comparing risky-question accuracy without
# and with the causal graph, for every model.
fig, ax = plt.subplots(2, 4, figsize=(30, 10))
# Only models that stats_dict holds entries for ('llama_405B' is not produced
# by get_stats, so listing it raised a KeyError). noDI_stats must already
# contain 'risky_<model>' for each of these models.
models = ['gemma', 'llama_70B', 'gemini', 'claude']
index = np.arange(len(models))
bar_width = 0.35
opacity = 0.8
# ax.flat walks the 2x4 grid row by row, matching ax[i//4, i%4].
for panel, attribute in zip(ax.flat, sensitive_attributes):
  panel.bar(index, [noDI_stats[attribute]['risky_' + m] for m in models], bar_width,
            alpha=opacity, color='r', label='No causal graph')
  panel.bar(index + bar_width, [stats_dict[attribute]['risky_' + m]['correct_ratio'] for m in models], bar_width,
            alpha=opacity, color='b', label='With causal graph')
  panel.legend()
  # Two bars per model, so the midpoint lies half a bar width to the right.
  panel.set_xticks(index + bar_width / 2)
  panel.set_xticklabels(models, fontsize=12)
  panel.set_ylabel('Accuracy', fontsize=12)
  panel.set_yticks(np.arange(0, 1.1, 0.2))
  panel.set_title(attribute, fontsize=12)
plt.show()

In [None]:
# Per-model accuracy averaged (unweighted) over the sensitive attributes,
# with the causal graph (stats_dict) and without it (noDI_stats).
models = ['gemma', 'llama_70B', 'gemini', 'claude']

overall_biased_acc_DI = [
    sum([stats_dict[s]['biased_' + m]['correct_ratio'] for s in sensitive_attributes]) / len(sensitive_attributes)
    for m in models
]
overall_risky_acc_DI = [
    sum([stats_dict[s]['risky_' + m]['correct_ratio'] for s in sensitive_attributes]) / len(sensitive_attributes)
    for m in models
]
overall_biased_acc_noDI = [
    sum([noDI_stats[s]['biased_' + m] for s in sensitive_attributes]) / len(sensitive_attributes)
    for m in models
]
overall_risky_acc_noDI = [
    sum([noDI_stats[s]['risky_' + m] for s in sensitive_attributes]) / len(sensitive_attributes)
    for m in models
]

overall_biased_acc_df = pd.DataFrame({
    'model': models,
    'Accuracy (with causal graph)': overall_biased_acc_DI,
    'Accuracy (no causal graph)': overall_biased_acc_noDI,
})
overall_risky_acc_df = pd.DataFrame({
    'model': models,
    'Accuracy (with causal graph)': overall_risky_acc_DI,
    'Accuracy (no causal graph)': overall_risky_acc_noDI,
})

In [None]:
# Display the overall biased-question accuracy table (with vs. without causal graph).
overall_biased_acc_df

In [None]:
# Display the overall risky-question accuracy table (with vs. without causal graph).
overall_risky_acc_df

### Visualization of risky question accuracy

In [None]:
# Per-attribute risky-question accuracy, one column per model. The former
# 'llama_405B' column referenced risky_correct_llama_405B, which is never
# defined in this notebook, so it is left out.
risky_acc_df = pd.DataFrame({
    'sensitive_attribute': ss,
    'gemma': risky_correct_gemma,
    'llama_70B': risky_correct_llama_70B,
    'gemini': risky_correct_gemini,
    'claude': risky_correct_claude,
})
risky_acc_df

In [None]:
# Grouped bar chart of risky-question accuracy per sensitive attribute.
fig, ax = plt.subplots(figsize=(11, 4))
index = np.arange(len(sensitive_attributes))
bar_width = 0.15
opacity = 0.8

# Tick labels: a copy of the attribute names in which the sixth entry
# (index 5) is displayed as "appearance"; the original list is untouched.
ss = sensitive_attributes.copy()
ss[5] = "appearance"

# (values, colour, legend label) for each bar in a group, left to right.
risky_bars = [
    (risky_correct_gemma, 'b', 'Gemma-27B-IT'),
    (risky_correct_llama_70B, 'c', 'Llama-3.1-70B-Instruct'),
    (risky_correct_gemini, 'r', 'Gemini-1.5-pro-002'),
    (risky_correct_claude, 'm', 'claude-3.5-sonnet-v2-20241022'),
]
for position, (values, color, label) in enumerate(risky_bars):
  ax.bar(index + position * bar_width, values, bar_width,
         alpha=opacity, color=color, label=label)

plt.ylabel('Accuracy', fontsize=12)
# Put each tick under the middle of its group of bars, not under the first bar.
plt.xticks(index + bar_width * (len(risky_bars) - 1) / 2, ss, fontsize=12)
plt.yticks(fontsize=12)
plt.legend(fontsize=12)

plt.tight_layout()
plt.savefig('risky_acc.pdf')
# `fileedit` is not imported in any visible cell (presumably a Colab-only
# helper -- confirm); only trigger the download when it is available.
if 'fileedit' in globals():
  fileedit.download_file('risky_acc.pdf')

### Visualization and comparison of mistake_bias questions

In [None]:
import numpy as np
import matplotlib.pyplot as plt

models = ['gemma', 'llama_70B', 'gemini', 'claude']
names = ['name_job', 'name_major', 'name_personality']
indexes = np.arange(3)

# Per model: the correct ratio of each of the three row groups, averaged over
# the answer files, with (correct_ratios) and without (correct_ratios_noDI)
# the causal graph.
correct_ratios = {model: [0, 0, 0] for model in models}
correct_ratios_noDI = {model: [0, 0, 0] for model in models}

answer_runs = [df_1, df_2, df_3]
# A dedicated loop variable keeps the notebook-wide `df` from being overwritten.
for run_df in answer_runs:
  # Two blocks of 196 rows starting at position 1200, then the remaining rows.
  row_groups = [
      run_df.iloc[1200:1200 + 196],
      run_df.iloc[1200 + 196:1200 + 196 * 2],
      run_df.iloc[1200 + 196 * 2:],
  ]
  for m in models:
    for i, df_ in enumerate(row_groups):
      # Each file's ratio is rounded to 3 decimals before being averaged.
      correct_ratios[m][i] += (np.round(df_[df_[f'{m}_answer_label'] == '1'].shape[0] / df_.shape[0], 3)) / len(answer_runs)
      correct_ratios_noDI[m][i] += (np.round(df_[df_[f'noDI_{m}_answer_label'] == '1'].shape[0] / df_.shape[0], 3)) / len(answer_runs)

# Grouped bar chart of the causal-graph correct ratios.
bar_width = 0.15
fig, ax = plt.subplots(figsize=(6, 4))
for i, model in enumerate(models):
  ax.bar(indexes + i * bar_width, correct_ratios[model], label=model, width=bar_width)

ax.set_ylabel('Correct Ratio')
# Put each tick under the middle of its group of bars, not under the first bar.
ax.set_xticks(indexes + bar_width * (len(models) - 1) / 2)
ax.set_xticklabels(names)
ax.legend()
plt.show()


In [None]:
# Table of the averaged correct ratios: one row per group, one column per model.
mistake_bias_columns = {'sensitive_attribute': ['job', 'major', 'personality']}
for column in ('gemma', 'llama_70B', 'gemini', 'claude'):
  mistake_bias_columns[column] = correct_ratios[column]
mistake_bias_acc_df = pd.DataFrame(mistake_bias_columns)
mistake_bias_acc_df

Unnamed: 0,sensitive_attribute,gemma,llama_70B,gemini,claude
0,job,0.098667,0.006667,0.005,0.0
1,major,0.193667,0.0,0.0,0.0
2,personality,0.149667,0.0,0.024,0.0


In [None]:
# Per-model mean over the three groups, rounded to 3 decimals, with and
# without the causal graph.
overall_mistake_bias_acc_DI = [
    np.round(sum(correct_ratios[m]) / len(correct_ratios[m]), 3) for m in models
]
overall_mistake_bias_acc_noDI = [
    np.round(sum(correct_ratios_noDI[m]) / len(correct_ratios_noDI[m]), 3) for m in models
]
overall_mistake_bias_acc_df = pd.DataFrame({
    'model': models,
    'Accuracy (with causal graph)': overall_mistake_bias_acc_DI,
    'Accuracy (no causal graph)': overall_mistake_bias_acc_noDI,
})
overall_mistake_bias_acc_df

### Causal path distribution (biased)

In [None]:
# Join the three answer files on the question text.
# NOTE(review): with pandas' default merge suffixes, columns shared by df_1 and
# df_2 are renamed to *_x / *_y, so the unsuffixed columns of the result
# presumably come from df_3 alone -- confirm whether pooling all three files
# (e.g. with pd.concat) was intended.
df = df_1.merge(df_2, on='question').merge(df_3, on='question')

# Pie-slice colour for each causal label.
color_mapping = {
    'b': 'red',
    'r': 'orange',
    'n': '#98FB98',
    'nr': '#87CEEB',
    'm': '#E6E6FA',
    'mb': '#BC8F8F',
    'mr': 'pink',
}

def custom_autopct(pct):
    """Return the slice percentage with one decimal, or '' for slices of 1% or less."""
    if pct > 1:
        return f'{pct:.1f}%'
    return ''

import matplotlib.pyplot as plt

# Column-name prefixes (models_1) and the matching display names (models_2).
models_1 = ['gemma', 'llama_70B', 'gemini', 'claude']
models_2 = ['Gemma-27B-IT', 'Llama-3.1-70B-Instruct', 'gemini-1.5-pro-002', 'claude-3-5-sonnet-v2-20241022']

# Causal-label counts of the biased questions, split by answer label
# ('1' -> correct_dist, '0' -> wrong_dist). Built from models_1 so the cell
# does not depend on a `models` list left over from another cell.
biased_causal_path_dict = {m: {} for m in models_1}
for m in models_1:
  biased_rows = df[df['category'] == 'biased']
  biased_causal_path_dict[m]['correct_dist'] = biased_rows[biased_rows[f'{m}_answer_label'] == '1'].value_counts(f'causal_label_{m}').to_dict()
  biased_causal_path_dict[m]['wrong_dist'] = biased_rows[biased_rows[f'{m}_answer_label'] == '0'].value_counts(f'causal_label_{m}').to_dict()

# Pie charts on a 2x4 grid: top row correct answers, bottom row wrong answers,
# one column per model.
fig, axs = plt.subplots(2, 4, figsize=(13, 5))
for col, (m, display_name) in enumerate(zip(models_1, models_2)):
    for row, outcome in enumerate(('correct', 'wrong')):
        dist = biased_causal_path_dict[m][f'{outcome}_dist']
        slice_labels = list(dist.keys())
        # Labels missing from color_mapping are drawn in gray.
        slice_colors = [color_mapping.get(label, 'gray') for label in slice_labels]
        axs[row, col].pie(list(dist.values()), labels=slice_labels, autopct=custom_autopct,
                          colors=slice_colors, textprops={'fontsize': 12})
        axs[row, col].set_title(f'{display_name}_{outcome}')
plt.tight_layout()
plt.savefig('biased_causal_path_distribution.pdf')
# `fileedit` is not imported in any visible cell (presumably a Colab-only
# helper -- confirm); only trigger the download when it is available.
if 'fileedit' in globals():
    fileedit.download_file('biased_causal_path_distribution.pdf')
plt.show()


### Causal labeling distribution (risky)

In [None]:
# Join the three answer files on the question text.
# NOTE(review): with pandas' default merge suffixes, columns shared by df_1 and
# df_2 are renamed to *_x / *_y, so the unsuffixed columns of the result
# presumably come from df_3 alone -- confirm whether pooling all three files
# (e.g. with pd.concat) was intended.
df = df_1.merge(df_2, on='question').merge(df_3, on='question')

# Pie-slice colour for each causal label.
color_mapping = {
    'b': 'red',
    'r': 'orange',
    'n': '#98FB98',
    'nr': '#87CEEB',
    'm': '#E6E6FA',
    'mb': '#BC8F8F',
    'mr': 'pink',
}

def custom_autopct(pct):
    """Return the slice percentage with one decimal, or '' for slices of 1% or less."""
    if pct > 1:
        return f'{pct:.1f}%'
    return ''

import matplotlib.pyplot as plt

# Column-name prefixes (models_1) and the matching display names (models_2).
models_1 = ['gemma', 'llama_70B', 'gemini', 'claude']
models_2 = ['Gemma-27B-IT', 'Llama-3.1-70B-Instruct', 'gemini-1.5-pro-002', 'claude-3-5-sonnet-v2-20241022']

# Causal-label counts of the risky questions, split by answer label
# ('1' -> correct_dist, '0' -> wrong_dist). The variable keeps its "biased"
# name although it holds risky data here. Built from models_1 so the cell
# does not depend on a `models` list left over from another cell.
biased_causal_path_dict = {m: {} for m in models_1}
for m in models_1:
  risky_rows = df[df['category'] == 'risky']
  biased_causal_path_dict[m]['correct_dist'] = risky_rows[risky_rows[f'{m}_answer_label'] == '1'].value_counts(f'causal_label_{m}').to_dict()
  biased_causal_path_dict[m]['wrong_dist'] = risky_rows[risky_rows[f'{m}_answer_label'] == '0'].value_counts(f'causal_label_{m}').to_dict()

# Pie charts on a 2x4 grid: top row correct answers, bottom row wrong answers,
# one column per model.
fig, axs = plt.subplots(2, 4, figsize=(13, 5))
for col, (m, display_name) in enumerate(zip(models_1, models_2)):
    for row, outcome in enumerate(('correct', 'wrong')):
        dist = biased_causal_path_dict[m][f'{outcome}_dist']
        slice_labels = list(dist.keys())
        # Labels missing from color_mapping are drawn in gray.
        slice_colors = [color_mapping.get(label, 'gray') for label in slice_labels]
        axs[row, col].pie(list(dist.values()), labels=slice_labels, autopct=custom_autopct,
                          colors=slice_colors, textprops={'fontsize': 12})
        axs[row, col].set_title(f'{display_name}_{outcome}')
plt.tight_layout()
plt.savefig('risky_causal_path_distribution.pdf')
# `fileedit` is not imported in any visible cell (presumably a Colab-only
# helper -- confirm); only trigger the download when it is available.
if 'fileedit' in globals():
    fileedit.download_file('risky_causal_path_distribution.pdf')
plt.show()


### Causal labeling distribution (mistake_bias)

In [None]:
# Join the three answer files on the question text.
# NOTE(review): with pandas' default merge suffixes, columns shared by df_1 and
# df_2 are renamed to *_x / *_y, so the unsuffixed columns of the result
# presumably come from df_3 alone -- confirm whether pooling all three files
# (e.g. with pd.concat) was intended.
df = df_1.merge(df_2, on='question').merge(df_3, on='question')

# Pie-slice colour for each causal label.
color_mapping = {
    'b': 'red',
    'r': 'orange',
    'n': '#98FB98',
    'nr': '#87CEEB',
    'm': '#E6E6FA',
    'mb': '#BC8F8F',
    'mr': 'pink',
}

def custom_autopct(pct):
    """Return the slice percentage with one decimal, or '' for slices of 1% or less."""
    if pct > 1:
        return f'{pct:.1f}%'
    return ''

import matplotlib.pyplot as plt

# Column-name prefixes (models_1) and the matching display names (models_2).
models_1 = ['gemma', 'llama_70B', 'gemini', 'claude']
models_2 = ['Gemma-27B-IT', 'Llama-3.1-70B-Instruct', 'gemini-1.5-pro-002', 'claude-3-5-sonnet-v2-20241022']

# Causal-label counts of the mistake_bias questions, split by answer label
# ('1' -> correct_dist, '0' -> wrong_dist). The variable keeps its "biased"
# name although it holds mistake_bias data here. Built from models_1 so the
# cell does not depend on a `models` list left over from another cell.
biased_causal_path_dict = {m: {} for m in models_1}
for m in models_1:
  mistake_rows = df[df['category'] == 'mistake_bias']
  biased_causal_path_dict[m]['correct_dist'] = mistake_rows[mistake_rows[f'{m}_answer_label'] == '1'].value_counts(f'causal_label_{m}').to_dict()
  biased_causal_path_dict[m]['wrong_dist'] = mistake_rows[mistake_rows[f'{m}_answer_label'] == '0'].value_counts(f'causal_label_{m}').to_dict()

# Pie charts on a 2x2 grid, one per model; only the wrong-answer
# distributions are drawn.
fig, axs = plt.subplots(2, 2, figsize=(6, 6))
for panel, m, display_name in zip(axs.flat, models_1, models_2):
    wrong_dist = biased_causal_path_dict[m]['wrong_dist']
    wrong_labels = list(wrong_dist.keys())
    # Labels missing from color_mapping are drawn in gray.
    wrong_colors = [color_mapping.get(label, 'gray') for label in wrong_labels]
    panel.pie(list(wrong_dist.values()), autopct=custom_autopct, labels=wrong_labels,
              colors=wrong_colors, textprops={'fontsize': 12})
    panel.set_title(f'{display_name}_wrong')
plt.tight_layout()
plt.savefig('mistake_bias_causal_path_distribution.pdf')
# `fileedit` is not imported in any visible cell (presumably a Colab-only
# helper -- confirm); only trigger the download when it is available.
if 'fileedit' in globals():
    fileedit.download_file('mistake_bias_causal_path_distribution.pdf')
plt.show()


In [None]:
# Print the positions of biased rows that gemma answered with label '0' and
# whose graph label is 'r', scanning the first 1788 rows of df.
# NOTE(review): every other cell reads '<model>_answer_label' and
# 'causal_label_<model>'; the 'answer_label_<model>' / 'graph_label_<model>'
# columns used here may not exist in df -- confirm.
for i in range(1788):
  # if df.iloc[i]['category'] == 'biased' and df.iloc[i]['answer_label_gemma'] == '1' and df.iloc[i]['graph_label_gemma'] == 'nr':
  #   print("loop 1:",i)
  # if df.iloc[i]['category'] == 'biased' and df.iloc[i]['answer_label_gemini'] == '0' and df.iloc[i]['graph_label_gemini'] == 'b':
  #   print("loop 2:",i)
  row = df.iloc[i]
  if not row['category'] == 'biased':
    continue
  if not row['answer_label_gemma'] == '0':
    continue
  if row['graph_label_gemma'] == 'r':
    print("loop 3:",i)

In [None]:
# Number of rows labelled 'b' in the causal-label column, summed over all models.
models = ['gemma', 'llama_70B', 'gemini', 'claude']
total = sum(len(df[df[f'causal_label_{m}'] == 'b']) for m in models)

total