In [1]:
import ast
import os
import sys
from glob import glob

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
def compute_statistics(pred1, pred2, n=10000):
    """Estimate how often a random re-split matches the observed accuracy gap.

    The two prediction sequences are pooled and the pool is re-split ``n``
    times into two groups with the same sizes as ``pred1`` and ``pred2``.
    The result is the fraction of re-splits whose absolute accuracy
    difference is at least as large as the observed one (a permutation-test
    style probability).

    Args:
        pred1: Non-empty sequence of per-example correctness values
            (summed and divided by the length to obtain an accuracy).
        pred2: Non-empty sequence of the same kind for the second system.
        n: Number of random re-splits; must be positive.

    Returns:
        A float in [0, 1]. It is exactly 0.0 when no re-split reaches the
        observed difference, and 1.0 when every re-split does (for example
        when both inputs have the same accuracy).

    Raises:
        ValueError: If either sequence is empty or ``n`` is not positive.
    """
    # Copy into plain lists so that `+` below concatenates; for NumPy arrays
    # or pandas Series it would add element-wise instead of pooling.
    pred1, pred2 = list(pred1), list(pred2)
    if not pred1 or not pred2:
        raise ValueError('pred1 and pred2 must both be non-empty')
    if n <= 0:
        raise ValueError('n must be a positive number of re-splits')

    acc1, acc2 = sum(pred1) / len(pred1), sum(pred2) / len(pred2)
    ref_diff_acc = abs(acc1 - acc2)
    pooled = pred1 + pred2

    occurrences = 0
    for i in range(n):
        # The iteration index is the seed, so repeated calls give the same
        # result. The "test" part gets len(pred2) items, the rest len(pred1).
        pred_x, pred_y = train_test_split(pooled, test_size=len(pred2), random_state=i)
        acc_x, acc_y = sum(pred_x) / len(pred_x), sum(pred_y) / len(pred_y)
        if abs(acc_x - acc_y) >= ref_diff_acc:
            occurrences += 1

    return occurrences / n

# ChatGPT Significance

In [3]:
# Parse every ChatGPT-eval file name and load each CSV once. Nothing is
# accumulated here: after the loop only the last file's `task`, `model_name`
# and `df` remain in the namespace.
for path in glob('outputs_chatgpt_eval/*.csv'):
    # File names follow '<task>_falcon-<model suffix>.csv'. os.path is used
    # instead of splitting on '/' and slicing off four characters so that the
    # parsing also works when glob returns backslash-separated paths.
    stem = os.path.splitext(os.path.basename(path))[0]
    name_parts = stem.split('_falcon-')
    task = name_parts[0]
    model_name = 'falcon-' + name_parts[1]
    df = pd.read_csv(path)

In [4]:
def check_correct(row):
    """Return whether the prediction in ``row`` matches its gold answer.

    Args:
        row: Mapping-like object with ``'Gold'`` and ``'Pred'`` entries, such
            as the rows handed over by ``df.apply(..., axis='columns')``.

    Returns:
        bool: When the gold value is a list, or a string starting with ``[``
        that holds a list literal, the prediction is correct if it equals any
        list element. Otherwise the prediction must equal the gold value.
        All comparisons are made on the ``str()`` form of the values.

    Raises:
        ValueError, SyntaxError: If a gold string starts with ``[`` but is not
            a valid Python literal.
    """
    pred, gold = str(row['Pred']), row['Gold']
    if isinstance(gold, list):
        # Already parsed; nothing to evaluate.
        candidates = gold
    elif str(gold).startswith('['):
        # literal_eval only accepts Python literals, so the content of a CSV
        # cell can never be executed as code (unlike eval()).
        candidates = ast.literal_eval(str(gold))
    else:
        # startswith() above also copes with an empty gold string, which
        # indexing the first character would not.
        return pred == str(gold)
    return pred in map(str, candidates)

def parse_eval_filename(path):
    """Split '<dir>/<task>_falcon-<suffix>.csv' into (task, 'falcon-<suffix>').

    Returns None when the file name does not contain '_falcon-'. os.path is
    used so that the result does not depend on the path separator.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    name_parts = stem.split('_falcon-')
    if len(name_parts) < 2:
        return None
    return name_parts[0], 'falcon-' + name_parts[1]


# pred_data[task][model][n_languages] -> list of per-example correctness flags
pred_data = {}
# Column-oriented records: one entry per evaluated (task, model) file.
acc_data = {'task': [], 'model': [], 'accuracy': []}
for path in glob('outputs_chatgpt_eval/*.csv'):
    parsed = parse_eval_filename(path)
    if parsed is None:
        # Not an evaluation output file; skip it instead of crashing.
        continue
    task, model_name = parsed
    # Only '-lang-' model variants are analysed, and covid tasks are excluded.
    if ('lang' not in model_name) or ('covid' in task):
        continue
    model, n_lang = model_name.split('-lang-')

    df = pd.read_csv(path)
    if task == 'ecare':
        # Gold values whose string form is '0' become 'A', all others 'B'.
        df['Gold'] = df['Gold'].apply(lambda x: 'A' if str(x) == '0' else 'B')
    df['correct'] = df.apply(check_correct, axis='columns')

    pred_data.setdefault(task, {}).setdefault(model, {})[int(n_lang)] = df['correct'].tolist()

    acc_data['task'].append(task)
    acc_data['model'].append(model_name)
    acc_data['accuracy'].append(df['correct'].sum() / len(df))

In [5]:
# Turn the collected accuracy records into a table, save the long format to
# disk, and display it reshaped with one row per model and one column per task.
acc_df = pd.DataFrame.from_dict(acc_data)
acc_df.to_csv('chatgpt_eval_metric_final.csv', index=False)
accuracy_by_model = acc_df.pivot(index='model', columns='task', values='accuracy')
accuracy_by_model

task,alpha_nli,babi15,babi16,clutrr,commonsenseqa,ecare,pep_3k,sparta_qa_1reasoning,sparta_qa_2reasoning,step_game_basic,step_game_hard,timedial
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
falcon-40b-lang-1,0.783589,0.224,0.621,0.296684,0.719902,0.662111,0.561039,0.417625,0.327189,0.237,0.113,0.26971
falcon-40b-lang-10,0.735861,0.227,0.504,0.247818,0.525799,0.639491,0.535065,0.402299,0.373272,0.229,0.114,0.494467
falcon-40b-lang-45,0.631906,0.259,0.549,0.243455,0.447993,0.614515,0.513312,0.421456,0.322581,0.224,0.111,0.251037
falcon-7b-lang-1,0.61883,0.207,0.518,0.199825,0.260442,0.562677,0.527273,0.32567,0.336406,0.17,0.086,0.33195
falcon-7b-lang-10,0.525989,0.216,0.508,0.196335,0.209664,0.51508,0.521753,0.35249,0.331797,0.15,0.104,0.82296
falcon-7b-lang-2,0.575678,0.302,0.518,0.198953,0.23751,0.511781,0.524026,0.344828,0.368664,0.158,0.099,0.483402
falcon-7b-lang-20,0.517489,0.227,0.491,0.158813,0.187551,0.510368,0.509091,0.302682,0.359447,0.15,0.108,0.159751
falcon-7b-lang-3,0.516509,0.24,0.511,0.185864,0.213759,0.521678,0.542857,0.37931,0.331797,0.147,0.105,0.878976
falcon-7b-lang-45,0.520105,0.231,0.517,0.121291,0.192465,0.518379,0.497078,0.283525,0.327189,0.138,0.098,0.271093
falcon-7b-lang-5,0.519124,0.211,0.496,0.169284,0.206388,0.509425,0.531494,0.333333,0.345622,0.155,0.115,0.470954


### Group per Reasoning Type

In [6]:
# Reasoning type assigned to each evaluation task (task name -> type label).
task_to_reasoning_type = dict(
    babi15='Deductive',
    babi16='Inductive',
    clutrr='Inductive',
    step_game_basic='Spatial',
    step_game_hard='Spatial',
    pep_3k='Commonsense',
    alpha_nli='Abductive',
    timedial='Temporal',
    sparta_qa_1reasoning='Spatial',
    sparta_qa_2reasoning='Spatial',
    commonsenseqa='Commonsense',
    ecare='Causal',
    covid_fact_scientific='Fact Checking',
    covid_fact_social='Fact Checking',
)

In [7]:
# Pool the per-task correctness lists by reasoning type, keeping the
# model / number-of-languages breakdown:
# reasoning_data[reasoning_type][model][n_lang] -> concatenated correctness list
reasoning_data = {}

for task, model_data in pred_data.items():
    reasoning_type = task_to_reasoning_type[task]
    for model, lang_data in model_data.items():
        for n_lang, preds in lang_data.items():
            # setdefault creates each nesting level on first use; extend()
            # appends to a fresh list, so pred_data itself is never modified.
            per_model = reasoning_data.setdefault(reasoning_type, {})
            per_lang = per_model.setdefault(model, {})
            per_lang.setdefault(n_lang, []).extend(preds)

### Statistical Test

In [8]:
# For every reasoning type and model, compare each multi-language variant with
# the same model's lang-1 predictions using compute_statistics.
stats_data = {'reasoning_type': [], 'model': [], 'probability': []}
for reasoning, model_data in reasoning_data.items():
    for model, lang_data in model_data.items():
        # Predictions stored under key 1 are the reference for this model.
        lang1 = 1
        pred1 = lang_data[lang1]
        for lang2, pred2 in lang_data.items():
            if lang2 != lang1:
                proba = compute_statistics(pred1, pred2)
                stats_data['reasoning_type'].append(reasoning)
                stats_data['model'].append(f'{model}-lang-{lang2}')
                stats_data['probability'].append(proba)
stats_df = pd.DataFrame(stats_data)

In [9]:
# Bare expression: the notebook renders the reasoning-type results table as the cell output.
stats_df

Unnamed: 0,reasoning_type,model,probability
0,Spatial,falcon-7b-lang-5,0.5279
1,Spatial,falcon-7b-lang-10,0.9129
2,Spatial,falcon-7b-lang-45,0.2178
3,Spatial,falcon-7b-lang-20,1.0
4,Spatial,falcon-7b-lang-3,0.7687
5,Spatial,falcon-7b-lang-2,0.6499
6,Spatial,falcon-40b-lang-45,0.6268
7,Spatial,falcon-40b-lang-10,1.0
8,Temporal,falcon-7b-lang-5,0.0
9,Temporal,falcon-7b-lang-20,0.0


In [10]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing.
os.makedirs('significance', exist_ok=True)
stats_df.to_csv('significance/chatgpt.csv', index=False)

# MMLU Significance

In [11]:
# Load the MMLU category table and index it by its 'subject' column so that
# rows can be looked up by subject name.
mmlu_cat_df = pd.read_csv('mmlu_category.csv').set_index('subject')

# Position of each model name in a fixed ordering (0-11): the 40b models
# first, then the 7b models, each by increasing number of languages.
model_to_index = {
    name: position
    for position, name in enumerate([
        'falcon-40b',
        'falcon-40b-lang-1',
        'falcon-40b-lang-10',
        'falcon-40b-lang-45',
        'falcon-7b',
        'falcon-7b-lang-1',
        'falcon-7b-lang-2',
        'falcon-7b-lang-3',
        'falcon-7b-lang-5',
        'falcon-7b-lang-10',
        'falcon-7b-lang-20',
        'falcon-7b-lang-45',
    ])
}

In [12]:
def mmlu_level(subject):
    """Return the education level named in an MMLU subject, or 'other'.

    The markers are tried in a fixed order and the first one contained in
    ``subject`` wins.
    """
    for marker in ('high_school', 'college', 'professional', 'elementary'):
        if marker in subject:
            return marker
    return 'other'


# Position of the column holding the per-question values that are pooled below.
# NOTE(review): presumably a 0/1 correctness flag; confirm against the
# result-file schema.
CORRECT_COLUMN = 6

data = []  # kept so the name stays defined; nothing is appended to it here
# Both map <group> -> model -> n_languages -> pooled per-question values,
# grouped by MMLU category and by education level respectively.
category_preds_data, level_preds_data = {}, {}
for path in glob('results/results_falcon-*/*.csv'):
    if 'baseline' in path:
        continue
    # os.path keeps the parsing independent of the path separator.
    run_dir = os.path.basename(os.path.dirname(path))
    model_name = run_dir.split('results_')[1]
    if '-lang-' not in model_name:
        # Only '-lang-' variants carry a language count; skip anything else
        # rather than failing on the unpacking below.
        continue
    model, n_lang = model_name.split('-lang-')
    n_lang = int(n_lang)

    subject = os.path.splitext(os.path.basename(path))[0]
    category = mmlu_cat_df.loc[subject, 'category']
    # The level is a property of the subject, so it is derived from the
    # subject name rather than from the whole path.
    level = mmlu_level(subject)

    df = pd.read_csv(path)
    correct = df.iloc[:, CORRECT_COLUMN].tolist()

    # setdefault creates each nesting level on first use.
    category_preds_data.setdefault(category, {}).setdefault(model, {}).setdefault(n_lang, []).extend(correct)
    level_preds_data.setdefault(level, {}).setdefault(model, {}).setdefault(n_lang, []).extend(correct)

### Statistical Test

##### Category

In [13]:
# Per MMLU category: compare every language setting of a model with the same
# model's lang-1 predictions. The lang-1 entry is not skipped, so it is also
# compared with itself.
category_stats_data = {'category': [], 'model': [], 'probability': []}
for category, model_data in category_preds_data.items():
    for model, lang_data in model_data.items():
        # Predictions stored under key 1 are the reference for this model.
        lang1 = 1
        pred1 = lang_data[lang1]
        for lang2, pred2 in lang_data.items():
            proba = compute_statistics(pred1, pred2)
            record = (category, f'{model}-lang-{lang2}', proba)
            for column, value in zip(('category', 'model', 'probability'), record):
                category_stats_data[column].append(value)
category_stats_df = pd.DataFrame(category_stats_data)

In [14]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing.
os.makedirs('significance', exist_ok=True)
category_stats_df.to_csv('significance/category_mmlu.csv', index=False)

In [15]:
# Bare expression: the notebook renders the per-category results table as the cell output.
category_stats_df

Unnamed: 0,category,model,probability
0,stem,falcon-7b-lang-3,0.4989
1,stem,falcon-7b-lang-20,0.0646
2,stem,falcon-7b-lang-45,0.2168
3,stem,falcon-7b-lang-5,0.952
4,stem,falcon-7b-lang-1,1.0
5,stem,falcon-7b-lang-2,0.8158
6,stem,falcon-7b-lang-10,0.005
7,stem,falcon-40b-lang-1,1.0
8,stem,falcon-40b-lang-10,0.1676
9,stem,falcon-40b-lang-45,0.0497


##### Level

In [16]:
# Per education level: compare every language setting of a model with the same
# model's lang-1 predictions. The lang-1 entry is not skipped, so it is also
# compared with itself.
level_stats_data = {'level': [], 'model': [], 'probability': []}
for level, model_data in level_preds_data.items():
    for model, lang_data in model_data.items():
        # Predictions stored under key 1 are the reference for this model.
        lang1 = 1
        pred1 = lang_data[lang1]
        for lang2 in list(lang_data):
            proba = compute_statistics(pred1, lang_data[lang2])
            level_stats_data['level'].append(level)
            level_stats_data['model'].append(f'{model}-lang-{lang2}')
            level_stats_data['probability'].append(proba)
level_stats_df = pd.DataFrame(level_stats_data)

In [17]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing.
os.makedirs('significance', exist_ok=True)
level_stats_df.to_csv('significance/level_mmlu.csv', index=False)

In [18]:
# Bare expression: the notebook renders the per-level results table as the cell output.
level_stats_df

Unnamed: 0,level,model,probability
0,other,falcon-7b-lang-3,0.4163
1,other,falcon-7b-lang-20,0.6316
2,other,falcon-7b-lang-45,0.1805
3,other,falcon-7b-lang-5,0.4001
4,other,falcon-7b-lang-1,1.0
5,other,falcon-7b-lang-2,0.3076
6,other,falcon-7b-lang-10,0.7269
7,other,falcon-40b-lang-1,1.0
8,other,falcon-40b-lang-10,0.5727
9,other,falcon-40b-lang-45,0.0014


In [27]:
# Count the questions in each '<category>-<level>' bucket, using the
# falcon-40b-lang-1 result files to enumerate the subjects.
data = []  # kept so the name stays defined; nothing is appended to it here
cat = {}
for path in glob('results/results_falcon-40b-lang-1/*.csv'):
    if 'baseline' in path:
        continue
    # os.path keeps the parsing independent of the path separator.
    subject = os.path.splitext(os.path.basename(path))[0]
    category = mmlu_cat_df.loc[subject, 'category']

    # First marker contained in the subject name wins; otherwise 'other'.
    level = next(
        (marker for marker in ('high_school', 'college', 'professional', 'elementary')
         if marker in subject),
        'other',
    )

    # Only the row count is needed, so no per-file accuracy is computed.
    df = pd.read_csv(path)
    num_data = df.shape[0]

    bucket = category + '-' + level
    cat[bucket] = cat.get(bucket, 0) + num_data
print(cat)

{'stem-other': 979, 'other-other': 2380, 'stem-college': 546, 'other-college': 173, 'social_sciences-other': 901, 'stem-elementary': 378, 'humanities-other': 2565, 'stem-high_school': 1250, 'humanities-high_school': 606, 'social_sciences-high_school': 1564, 'other-professional': 554, 'humanities-professional': 1534, 'social_sciences-professional': 612}


(171, 11)