# Preparedata
Process and merge the model outputs in [`model_output`](model_output) and save the complete data to [`alldata`](alldata).

In [5]:
import pandas as pd
import numpy as np
import os
from os import listdir
import re
import scipy.stats as stats


In [6]:
# Model identifiers, as they appear in the model-output file names.
# Instruction-tuned checkpoints.
MODELS_INSTRUCT = [
    'llama-3.1-8b-instruct',
    'llama-3.1-70b-instruct',
    'llama-3.3-70b-instruct',
    'qwen2.5-1.5b-instruct',
    'qwen2.5-7b-instruct',
    'qwen2.5-72b-instruct',
    'mistral-large-instruct-2411',
]
# Base (non-instruct) checkpoints.
MODELS_BASE = [
    'llama-3.1-8b',
    'llama-3.1-70b',
    'qwen2.5-1.5b',
    'qwen2.5-7b',
    'qwen2.5-72b',
    'llama-3.1-405b',
    'olmo-2-1124-7b',
    'olmo-2-1124-7b-stage2-ingredient1-step11931-tokens50B',
    'olmo-2-1124-7b-stage2-ingredient2-step11931-tokens50B',
    'olmo-2-1124-7b-stage2-ingredient3-step11931-tokens50B',
    'olmo-2-1124-13b',
    'olmo-2-1124-13b-stage2-ingredient1-step11931-tokens100B',
    'olmo-2-1124-13b-stage2-ingredient2-step11931-tokens100B',
    'olmo-2-1124-13b-stage2-ingredient3-step11931-tokens100B',
]

# Every model, instruct checkpoints first.
MODELS = [*MODELS_INSTRUCT, *MODELS_BASE]

In [10]:
# Helper functions, adapted from https://github.com/jennhu/response-to-DGL/blob/87228acbc3f65b169f0ec3cdebef9eb7c1043398/notebooks/main.ipynb
def read_model_csvs(folder, model_substr=None):
    """Read every per-model CSV file in ``folder`` and stack them into one frame.

    Files are expected to be named ``<model>_<dataset>.csv``. The model and
    dataset names are taken from the file name (split on the first underscore)
    and stored in the ``model`` and ``dataset`` columns, together with metadata
    parsed from the model name: ``model_family``, ``model_gen``, ``model_size``
    and ``model_type``.

    Parameters
    ----------
    folder : str
        Directory containing the CSV files.
    model_substr : str, optional
        If given, only files whose name contains this substring are read.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all files that were read, in sorted file-name order.
        The per-file row index is kept as-is (it is not reset).

    Raises
    ------
    ValueError
        If a CSV file name contains no underscore, or if no file in ``folder``
        could be read at all.
    """
    suffix = '.csv'
    frames = []
    # Sorted so the result does not depend on the directory listing order.
    for fname in sorted(listdir(folder)):
        # Ignore anything that is not a CSV file (sub-directories, hidden
        # files, notes, ...) instead of handing it to the CSV parser.
        if not fname.endswith(suffix):
            continue
        # Only read files containing the specified substring.
        if model_substr is not None and model_substr not in fname:
            continue
        try:
            df = pd.read_csv(os.path.join(folder, fname))
        except pd.errors.ParserError:
            print(f"Skipping file due to parsing error: {fname}")
            continue

        # Extract the model name and dataset name from the file name.
        model, sep, dataset = fname[:-len(suffix)].partition('_')
        if not sep:
            raise ValueError(
                f"Cannot parse '<model>_<dataset>.csv' from file name: {fname}"
            )

        df["model"] = model
        df["dataset"] = dataset
        df['model_family'] = 'qwen' if 'qwen' in model else model.split('-')[0]
        df['model_gen'] = '2.5' if 'qwen' in model else model.split('-')[1]
        # Size is the number directly before a lowercase "b" (e.g. "1.5b").
        # Names without such a token fall back to 123, the original default.
        # NOTE(review): 123 is presumably the size of mistral-large - confirm.
        size_match = re.search(r'(\d+(\.\d*)?)b', model)
        df['model_size'] = float(size_match.group(1)) if size_match else 123
        df['model_type'] = 'instruct' if 'instruct' in model else 'base'
        frames.append(df)

    if not frames:
        raise ValueError(
            f"No readable CSV files found in {folder!r}"
            + (f" matching {model_substr!r}" if model_substr is not None else "")
        )
    return pd.concat(frames)

def try_index(df, index):
    """Best-effort label lookup: return ``df.loc[index]``, or ``None`` on failure.

    Any ordinary error raised by the lookup (missing label, ``df`` not being
    indexable, ...) yields ``None``. Interpreter-level signals such as
    ``KeyboardInterrupt`` and ``SystemExit`` are deliberately not swallowed, so
    a long-running cell can still be interrupted.
    """
    try:
        return df.loc[index]
    except Exception:
        return None
    

def sort_by_models(df, models):
    """Return a copy of ``df`` with rows ordered by ``models``.

    The ``model`` column of the result is an unordered-by-name categorical
    whose categories are ``models``, so sorting follows the order of that list
    rather than alphabetical order. Model names not listed in ``models`` become
    missing values and are placed last by the sort.

    The input frame is left untouched: the categorical conversion is applied to
    a new frame instead of overwriting the caller's ``model`` column.
    """
    as_categorical = df.assign(model=pd.Categorical(df['model'], models))
    return as_categorical.sort_values('model')


# Experiment 1: Acceptability Judgements

In [11]:
def process_aj_data(folder='acceptability',models=None):
    """Read and process the acceptability-judgment model outputs.

    Reads every CSV under ``model_output/<folder>`` and adds derived columns:

    * ``sumLP_diff`` / ``meanLP_diff``: grammatical minus ungrammatical direct
      score (sum and mean variants); ``sumLP_ans`` / ``meanLP_ans``: copies of
      the corresponding ``direct_*_score_correct`` columns.
    * ``prompt{i}_diff`` / ``prompt{i}_ans`` for i = 1..5, taken from the raw
      ``prompt{i}_score_gram``, ``prompt{i}_score_ungram`` and
      ``prompt{i}_correct_mean`` columns.
    * ``prompt{p}_gram`` / ``_ungram`` / ``_diff`` / ``_ans`` for p = 6, 7, 8,
      each built from a pair of raw prompts: (6, 7), (8, 9) and (10, 11).

    Parameters
    ----------
    folder : str
        Sub-directory of ``model_output`` to read.
    models : list of str, optional
        If given, only rows whose ``model`` is in this list are kept.

    Returns
    -------
    pandas.DataFrame
        The raw columns plus the derived columns described above.
    """
    df = read_model_csvs(f'model_output/{folder}')

    if models is not None:
        # Optionally subset models. Take a copy so the column assignments below
        # write to an independent frame rather than to a filtered slice, which
        # would trigger pandas' SettingWithCopyWarning.
        df = df[df.model.isin(models)].copy()

    df['sumLP_diff'] = df['direct_sum_score_gram'] - df['direct_sum_score_ungram']
    df['meanLP_diff'] = df['direct_mean_score_gram'] - df['direct_mean_score_ungram']
    df['sumLP_ans'] = df['direct_sum_score_correct']
    df['meanLP_ans'] = df['direct_mean_score_correct']

    # Prompts 1 to 5: one raw prompt per output prompt.
    for i in range(1, 6):
        df[f'prompt{i}_diff'] = df[f'prompt{i}_score_gram'] - df[f'prompt{i}_score_ungram']
        df[f'prompt{i}_ans'] = df[f'prompt{i}_correct_mean']

    # Output prompts 6, 7, 8 combine raw prompts (6, 7), (8, 9), (10, 11):
    # the gram-minus-ungram margin of raw prompt i is compared with the same
    # margin of raw prompt i + 1.
    for pid, i in enumerate((6, 8, 10), start=6):
        df[f'prompt{pid}_gram'] = df[f'prompt{i}_score_gram'] - df[f'prompt{i}_score_ungram']
        df[f'prompt{pid}_ungram'] = df[f'prompt{i+1}_score_gram'] - df[f'prompt{i+1}_score_ungram']
        df[f'prompt{pid}_diff'] = df[f'prompt{pid}_gram'] - df[f'prompt{pid}_ungram']
        df[f'prompt{pid}_ans'] = df[f'prompt{pid}_gram'] > df[f'prompt{pid}_ungram']

    return df

In [12]:
# Load and process the acceptability-judgment outputs, restricted to the models in MODELS.
all_aj_data = process_aj_data(models=MODELS)
# Preview the first rows of the processed table.
all_aj_data.head()

Unnamed: 0,sentence_grammatical,sentence_ungrammatical,source,paradigm,phenomenon,LS_grammatical,LS_ungrammatical,LS_diff,pair_ID,direct_sum_score_gram,...,prompt6_diff,prompt6_ans,prompt7_gram,prompt7_ungram,prompt7_diff,prompt7_ans,prompt8_gram,prompt8_ungram,prompt8_diff,prompt8_ans
0,The drivers conceal those busy guests.,The drivers conceal those busy guest.,blimp,determiner_noun_agreement_with_adjective_1,determiner_noun_agreement,,,,0,-63.698022,...,0.679688,True,-0.40625,-0.75,0.34375,True,-0.507812,-0.796875,0.289062,True
1,Paul isn't talking about those scared pedestri...,Paul isn't talking about those scared pedestrian.,blimp,determiner_noun_agreement_with_adjective_1,determiner_noun_agreement,,,,1,-59.32958,...,1.429687,True,0.085938,-0.742188,0.828125,True,-0.171875,-0.523438,0.351562,True
2,A lot of actors discover that gray shoe.,A lot of actors discover that gray shoes.,blimp,determiner_noun_agreement_with_adjective_1,determiner_noun_agreement,,,,2,-57.87374,...,0.492188,True,-0.765625,-1.15625,0.390625,True,-0.742188,-0.875,0.132812,True
3,Benjamin messes up these black hospitals.,Benjamin messes up these black hospital.,blimp,determiner_noun_agreement_with_adjective_1,determiner_noun_agreement,,,,3,-58.381034,...,1.117188,True,-0.265625,-0.914063,0.648438,True,-0.15625,-0.929688,0.773438,True
4,Most organizations appreciate these good cafes.,Most organizations appreciate these good cafe.,blimp,determiner_noun_agreement_with_adjective_1,determiner_noun_agreement,,,,4,-55.305567,...,0.96875,True,0.046875,-0.875,0.921875,True,-0.21875,-0.6875,0.46875,True


### Convert to long

In [13]:
def convert_to_long(df, columns_keep,num_prompts):
    """Reshape the per-prompt ``diff``/``ans`` columns into long format.

    For prompts 1..``num_prompts`` the columns ``prompt{i}_diff`` and
    ``prompt{i}_ans`` are melted into rows, then the ``ans`` rows are joined
    back to their matching ``diff`` rows on ``columns_keep`` plus ``promptID``.
    The result has one row per match, with the columns in ``columns_keep`` and
    ``promptID``, ``metric_ans``, ``ans``, ``metric_diff`` and ``diff``.
    """
    prompt_ids = range(1, num_prompts + 1)
    diff_cols = [f'prompt{k}_diff' for k in prompt_ids]
    ans_cols = [f'prompt{k}_ans' for k in prompt_ids]

    # Stack all diff/ans columns into (metric, value) pairs.
    melted = pd.melt(
        df,
        id_vars=columns_keep,
        value_vars=diff_cols + ans_cols,
        var_name='metric',
        value_name='value',
    )
    # The prompt number is the first run of digits in the metric name.
    melted['promptID'] = melted['metric'].str.extract(r'(\d+)').astype(int)

    ans_rows = melted[melted['metric'].str.contains('ans')]
    diff_rows = melted[melted['metric'].str.contains('diff')]

    # Pair each ans row with the diff row of the same item and prompt.
    paired = ans_rows.merge(
        diff_rows,
        on=columns_keep + ['promptID'],
        suffixes=('_ans', '_diff'),
    )
    return paired.rename(columns={'value_ans': 'ans', 'value_diff': 'diff'})





In [14]:
# Identifier, model-metadata and direct-score columns carried unchanged into the long table.
expr1_columns = ['model', 'dataset', 'model_family', 'model_gen','model_size', 'model_type','pair_ID','sumLP_diff', 'meanLP_diff', 'sumLP_ans', 'meanLP_ans']
# 8 = number of prompt{i}_diff / prompt{i}_ans column pairs produced by process_aj_data.
aj_long = convert_to_long(all_aj_data, expr1_columns, 8)
# Save the long-format Experiment 1 table (without the row index).
aj_long.to_csv('alldata/expr1.csv', index=False)

# Experiment 2 (word prediction)

In [15]:
def process_wp_data(folder='continuation',testsuite=None,models=None):
    """Read and process the word-prediction (continuation) model outputs.

    Reads every CSV under ``model_output/<folder>`` and adds derived columns:

    * ``sumLP_diff``: ``direct_score_1`` minus ``direct_score_2``;
      ``sumLP_ans``: copy of ``direct_correct``.
    * ``prompt{i}_diff`` / ``prompt{i}_ans`` for i = 1..4, taken from the raw
      ``prompt{i}_score_gram``, ``prompt{i}_score_ungram`` and
      ``prompt{i}_correct`` columns.
    * ``prompt5_gram`` / ``_ungram`` / ``_diff`` / ``_ans``, built from raw
      prompts 5 and 6.
    * ``prompt6_*`` and ``prompt7_*``, built from raw prompts 7 and 8 by
      averaging the score difference over the original and reversed variants.

    Parameters
    ----------
    folder : str
        Sub-directory of ``model_output`` to read.
    testsuite : list of str, optional
        If given, only rows whose ``dataset`` is in this list are kept.
    models : list of str, optional
        If given, only rows whose ``model`` is in this list are kept.

    Returns
    -------
    pandas.DataFrame
        The raw columns plus the derived columns described above.
    """
    df = read_model_csvs(f'model_output/{folder}')
    if testsuite is not None:
        df = df[df.dataset.isin(testsuite)]
    if models is not None:
        df = df[df.model.isin(models)]
    if testsuite is not None or models is not None:
        # Work on an independent copy so the column assignments below do not
        # write to a filtered slice (avoids pandas' SettingWithCopyWarning).
        df = df.copy()

    df['sumLP_diff'] = df['direct_score_1'] - df['direct_score_2']
    df['sumLP_ans'] = df['direct_correct']

    # Prompts 1 to 4: one raw prompt per output prompt.
    for i in range(1, 5):
        df[f'prompt{i}_diff'] = df[f'prompt{i}_score_gram'] - df[f'prompt{i}_score_ungram']
        df[f'prompt{i}_ans'] = df[f'prompt{i}_correct']

    # Prompt 5 combines raw prompts 5 and 6
    # (we didn't use prompt 5 in our study as the . is missing).
    df['prompt5_gram'] = df['prompt5_score_gram'] - df['prompt5_score_ungram']
    df['prompt5_ungram'] = df['prompt6_score_gram'] - df['prompt6_score_ungram']
    df['prompt5_diff'] = df['prompt5_gram'] - df['prompt5_ungram']
    df['prompt5_ans'] = df['prompt5_gram'] > df['prompt5_ungram']

    # Raw prompts 7 and 8 become output prompts 6 and 7: the score difference
    # between option 1 and option 2 is averaged over the two variants
    # (plain and "_reverse").
    for i in (7, 8):
        pid = i - 1
        df[f'prompt{pid}_diff'] = (df[f'prompt{i}_score_1'] + df[f'prompt{i}_score_1_reverse'] - df[f'prompt{i}_score_2'] - df[f'prompt{i}_score_2_reverse'])/2
        df[f'prompt{pid}_ans'] = df[f'prompt{pid}_diff'] > 0

    return df


In [16]:

# Load and process the word-prediction outputs, restricted to the models in MODELS.
all_wp_data = process_wp_data(models=MODELS)
# Preview the first rows of the processed table.
all_wp_data.head()

Unnamed: 0,item_id,prefix,good_continuation,bad_continuation,log_freq,log_freq_alter,prediction_direct,prediction_prompt,direct_score_1,direct_score_2,...,prompt4_diff,prompt4_ans,prompt5_gram,prompt5_ungram,prompt5_diff,prompt5_ans,prompt6_diff,prompt6_ans,prompt7_diff,prompt7_ans
0,jabberwocky_0,"between crafty fur wee anaesthesia , yore `` b...",channel,backgrounds,8.828641,8.827468,,,-9.590733,-12.222569,...,-0.039062,False,-1.359375,-1.414062,0.054687,True,-0.935547,False,0.449219,True
1,jabberwocky_1,a hole nominate pride in afford teenage jacket...,fallacy,astute,5.676754,5.676754,,,-10.584092,-10.420748,...,-0.007812,False,-0.351562,-0.328125,-0.023438,False,0.02165,True,-0.051747,False
2,jabberwocky_2,i liked well compass fatal after encompass pur...,absent,aggressive,7.738924,7.738924,,,-9.230927,-10.662567,...,0.09375,True,-0.828125,-0.976562,0.148438,True,0.783203,True,0.009766,True
3,jabberwocky_3,thou sly splash contrive occupy sew all parami...,regiment,brute,7.474205,7.474205,,,-10.204615,-9.860865,...,0.070312,True,-0.734375,-0.742188,0.007813,True,0.75,True,3.669922,True
4,jabberwocky_4,that observe overhead attest some circuit unto...,turmoil,squeakers,6.276643,6.276643,,,-19.665796,-22.950417,...,0.054688,True,-0.570312,-0.5,-0.070312,False,-1.054291,False,-0.413825,False


In [17]:
# Identifier, model-metadata and direct-score columns carried unchanged into the long table.
wp_columns = ['model', 'dataset', 'model_family', 'model_gen','model_size', 'model_type','item_id','sumLP_diff', 'sumLP_ans']
# 7 = number of prompt{i}_diff / prompt{i}_ans column pairs produced by process_wp_data.
wp_long = convert_to_long(all_wp_data, wp_columns, 7)
# Rename item_id to pair_ID, the item-identifier column name used for Experiment 1.
wp_long.rename(columns={'item_id': 'pair_ID'}, inplace=True)
# Save the long-format Experiment 2 table (without the row index).
wp_long.to_csv('alldata/expr2.csv', index=False)
