# SBIC

In [1]:
import json, os, re
import pandas as pd
import numpy as np
from glob import glob

In [2]:
TASK = "css_data/sbic"
baseline = f"{TASK}/T5-finetune-sbic_predict.json"
with open(baseline, "r") as infile:
    base_json = json.load(infile)
sbic = pd.read_csv('css_data/sbic/sbic.csv')

In [3]:
sbic_test = []
for idx, lbl in enumerate(base_json['labels']):
    pred = base_json['predictions'][idx]
    pred = pred.split(", ")[0].replace("[", "").replace('"', "")
    consider = sbic[sbic["targetStereotype"]==lbl].copy()
    consider['Generated'] = pred
    consider['Model'] = "baseline"
    consider['Task'] = "sbic"
    if len(consider)>1:
        print("\tnon-unique", lbl)
    if len(consider)==1:
        sbic_test.append(consider)
    else:
        print("no match", lbl)
    #sbic_test.append(None)

	non-unique ["trivializes harm to victims"]
no match ["trivializes harm to victims"]
	non-unique ["trivializes harm to victims"]
no match ["trivializes harm to victims"]
	non-unique ["trivializes harm to victims"]
no match ["trivializes harm to victims"]
	non-unique ["trivializes harm to victims."]
no match ["trivializes harm to victims."]
	non-unique ["trivializes harm to victims."]
no match ["trivializes harm to victims."]
	non-unique ["are marginalized for a joke"]
no match ["are marginalized for a joke"]
	non-unique ["trivializes harm to victims"]
no match ["trivializes harm to victims"]
	non-unique ["trivializes harm to victims"]
no match ["trivializes harm to victims"]
	non-unique ["trivializes harm to victims"]
no match ["trivializes harm to victims"]
	non-unique ["trivializes harm to victims"]
no match ["trivializes harm to victims"]
	non-unique ["trivializes harm to victims"]
no match ["trivializes harm to victims"]
	non-unique ["trivializes harm to victims."]
no match ["trivi

In [4]:
hit_df = pd.concat(sbic_test)[['post', 'targetStereotype', 'targetMinority', 'Generated', 'Model', 'Task']].sample(frac=1, random_state=7).copy()
hit_df.to_csv('hit/input/sbic/sbic_baseline.csv', index=False)

In [5]:
for fn in glob("css_data/sbic/answer*"):
    model = "-".join(fn.split("-")[1:])
    print(model)
    df = pd.read_csv(fn, sep='\t', names=['idx', 'targetStereotype', 'Generated'])
    
    sbic_test = []
    for _, row in df.iterrows():
        pred = row["Generated"].replace("&", "")
        lbl = row["targetStereotype"]
        consider = sbic[sbic["targetStereotype"]==lbl].copy()
        consider['Generated'] = pred
        consider['Model'] = model
        consider['Task'] = "sbic"
        if len(consider)==1:
            sbic_test.append(consider)
        else:
            print("\tno match", lbl)
    hit_df = pd.concat(sbic_test)[['post', 'targetStereotype', 'targetMinority', 'Generated', 'Model', 'Task']].sample(frac=1, random_state=7).copy()
    hit_df.to_csv(f'hit/input/sbic/sbic_{model}.csv', index=False)

text-ada-001
	no match ["trivializes harm to victims"]
	no match ["trivializes harm to victims"]
	no match ["trivializes harm to victims"]
	no match ["trivializes harm to victims."]
	no match ["trivializes harm to victims."]
	no match ["are marginalized for a joke"]
	no match ["trivializes harm to victims"]
	no match ["trivializes harm to victims"]
	no match ["trivializes harm to victims"]
	no match ["trivializes harm to victims"]
	no match ["trivializes harm to victims"]
	no match ["trivializes harm to victims."]
text-babbage-001
	no match ["trivializes harm to victims"]
	no match ["trivializes harm to victims"]
	no match ["trivializes harm to victims"]
	no match ["trivializes harm to victims."]
	no match ["trivializes harm to victims."]
	no match ["are marginalized for a joke"]
	no match ["trivializes harm to victims"]
	no match ["trivializes harm to victims"]
	no match ["trivializes harm to victims"]
	no match ["trivializes harm to victims"]
	no match ["trivializes harm to victims"]

# MRF

In [1]:
import json, os, re
import pandas as pd
import numpy as np
from glob import glob

In [2]:
TASK = "css_data/mrf"
baseline = f"{TASK}/T5-finetune-mrf-explain.json"
with open(baseline, "r") as infile:
    base_json = json.load(infile)
mrf = pd.read_csv('css_data/mrf/mrf.csv')

In [3]:
mrf_test = []
for idx, lbl in enumerate(base_json['labels']):
    pred = base_json['predictions'][idx]
    pred = pred.split(", ")[0].replace("[", "").replace('"', "")
    consider = mrf[mrf["writer_intent"]==lbl].copy()
    consider['Generated'] = pred
    consider['Model'] = "baseline"
    consider['Task'] = "mrf"
    consider['misinfo'] = consider['gold_label']
    if len(consider)>1:
        print("\tnon-unique", lbl)
    if len(consider)==1:
        mrf_test.append(consider)
    else:
        print("no match", lbl)
    #sbic_test.append(None)

	non-unique ['unknown intent']
no match ['unknown intent']
	non-unique ['unknown intent']
no match ['unknown intent']
	non-unique ['unknown intent']
no match ['unknown intent']
	non-unique ['unknown intent']
no match ['unknown intent']
	non-unique ['unknown intent']
no match ['unknown intent']
	non-unique ['unknown intent']
no match ['unknown intent']
	non-unique ['unknown intent']
no match ['unknown intent']
	non-unique ['unknown intent']
no match ['unknown intent']
	non-unique ['unknown intent']
no match ['unknown intent']
	non-unique ['unknown intent']
no match ['unknown intent']
	non-unique ['unknown intent']
no match ['unknown intent']
	non-unique ['unknown intent']
no match ['unknown intent']
	non-unique ['unknown intent']
no match ['unknown intent']
	non-unique ['unknown intent']
no match ['unknown intent']
	non-unique ['covid-19 is not real']
no match ['covid-19 is not real']
	non-unique ['unknown intent']
no match ['unknown intent']
	non-unique ['unknown intent']
no match ['un

In [4]:
hit_df = pd.concat(mrf_test)[['headline', 'writer_intent', 'misinfo', 'Generated', 'Model', 'Task']].sample(frac=1, random_state=7).copy()
hit_df.to_csv('hit/input/mrf/mrf_baseline.csv', index=False)

In [5]:
for fn in glob("css_data/mrf/answer-explanation*"):
    model = "-".join(fn.split("-")[2:])
    print(model)
    with open(f"css_data/mrf/prompts.json-explanation-{model}", "r") as f:
        prompts = json.load(f)
        
    df = pd.read_csv(fn, sep='\t', names=['idx', 'writer_intent', 'Generated'], error_bad_lines=False)

    mrf_test = []
    for _, row in df.iterrows():
        if type(row["Generated"])==str:
            pred = row["Generated"].replace("&", "")
            headline = ""
            if str(row["idx"]) in prompts:
                headline = prompts[str(row["idx"])].split('\n')[0].strip()
            
            lbl = row["writer_intent"]
            consider = mrf[mrf["headline"]==headline].copy()
            
            if (not len(consider)) or (consider["writer_intent"].iloc[0]!=lbl):
                consider = mrf[mrf["writer_intent"]==lbl].copy()
            
            consider['Generated'] = pred
            consider['Model'] = model
            consider['Task'] = "mrf"
            consider['misinfo'] = consider['gold_label']
            
            if len(consider)==1:
                if consider["writer_intent"].iloc[0]==lbl:
                    mrf_test.append(consider)
                else:
                    print("Mismatch", model, row["idx"])
                    break
            elif len(consider)>1:
                print("\tno match", headline, '\t', lbl)
            else:
                pass
    if len(mrf_test):
        hit_df = pd.concat(mrf_test)[['headline', 'writer_intent', 'misinfo', 'Generated', 'Model', 'Task']].sample(frac=1, random_state=7).copy()
        hit_df.to_csv(f'hit/input/mrf/mrf_{model}.csv', index=False)
    
    

flan-t5-base




  df = pd.read_csv(fn, sep='\t', names=['idx', 'writer_intent', 'Generated'], error_bad_lines=False)


chatgpt




  df = pd.read_csv(fn, sep='\t', names=['idx', 'writer_intent', 'Generated'], error_bad_lines=False)
Skipping line 21: expected 3 fields, saw 4
Skipping line 22: expected 3 fields, saw 4
Skipping line 54: expected 3 fields, saw 4
Skipping line 63: expected 3 fields, saw 4
Skipping line 64: expected 3 fields, saw 4
Skipping line 66: expected 3 fields, saw 5
Skipping line 107: expected 3 fields, saw 4
Skipping line 110: expected 3 fields, saw 5
Skipping line 155: expected 3 fields, saw 4
Skipping line 168: expected 3 fields, saw 4
Skipping line 172: expected 3 fields, saw 4
Skipping line 188: expected 3 fields, saw 4
Skipping line 193: expected 3 fields, saw 4
Skipping line 202: expected 3 fields, saw 4
Skipping line 218: expected 3 fields, saw 4
Skipping line 219: expected 3 fields, saw 5
Skipping line 223: expected 3 fields, saw 4
Skipping line 226: expected 3 fields, saw 4
Skipping line 228: expected 3 fields, saw 4
Skipping line 232: expected 3 fields, saw 4
Skipping line 235: expe

text-ada-001
	no match The coronavirus vaccine will cause other more serious diseases to emerge, said Anthony Fauci. 	 ['unknown intent']
	no match Otters Show How Predators Can Blunt Climate Damage 	 ['unknown intent']
	no match New Evidence That the Ancient Climate Was Warmer than Today's  'Roman Warming was the warmest in the last 2,000 years' 	 ['unknown intent']
	no match Punishing Companies For CO2 Emissions Won't Affect Temps, Climate 	 ['unknown intent']




  df = pd.read_csv(fn, sep='\t', names=['idx', 'writer_intent', 'Generated'], error_bad_lines=False)


	no match Climate report warns of rising risk as U.N. pushes nations to take action 	 ['unknown intent']
	no match The end of total quarantine: the Cabinet of Ministers of Ukraine has just made a decision. 	 ['unknown intent']
	no match 'The most significant climate legislation ever': How stimulus bill tackles warming planet 	 ['unknown intent']
	no match No 10 and Treasury clash over spending on environmental agenda 	 ['unknown intent']
	no match The Drilldown: Opposition disappointed with government's climate legislation 	 ['unknown intent']
	no match Councils reporting thousands of 'climate-related' incidents, including flooding 	 ['unknown intent']
	no match UK to make climate risk reports mandatory for large companies 	 ['unknown intent']
	no match French schooner Tara sets sail on scientific mission to study climate change 	 ['unknown intent']
	no match How climate change could benefit Russia 	 ['covid-19 is not real']
	no match Monetary Expansion Yielding Diminishing Returns And



  df = pd.read_csv(fn, sep='\t', names=['idx', 'writer_intent', 'Generated'], error_bad_lines=False)


flan-ul2




  df = pd.read_csv(fn, sep='\t', names=['idx', 'writer_intent', 'Generated'], error_bad_lines=False)


text-babbage-001




  df = pd.read_csv(fn, sep='\t', names=['idx', 'writer_intent', 'Generated'], error_bad_lines=False)


flan-t5-small




  df = pd.read_csv(fn, sep='\t', names=['idx', 'writer_intent', 'Generated'], error_bad_lines=False)


text-davinci-001




  df = pd.read_csv(fn, sep='\t', names=['idx', 'writer_intent', 'Generated'], error_bad_lines=False)


flan-t5-xl




  df = pd.read_csv(fn, sep='\t', names=['idx', 'writer_intent', 'Generated'], error_bad_lines=False)


flan-t5-large




  df = pd.read_csv(fn, sep='\t', names=['idx', 'writer_intent', 'Generated'], error_bad_lines=False)


text-davinci-003




  df = pd.read_csv(fn, sep='\t', names=['idx', 'writer_intent', 'Generated'], error_bad_lines=False)


flan-t5-xxl




  df = pd.read_csv(fn, sep='\t', names=['idx', 'writer_intent', 'Generated'], error_bad_lines=False)


text-curie-001




  df = pd.read_csv(fn, sep='\t', names=['idx', 'writer_intent', 'Generated'], error_bad_lines=False)


In [None]:
q = "'Buddha would be green': Dalai Lama calls for urgent climate action"
mrf[mrf['headline']==q]

In [7]:
for _, row in df.iterrows():
    pred = row["Generated"].replace("&", "")
    lbl = prompts[str(row["idx"])].split('\n')[0].strip()
    break

In [12]:
pred, lbl

('Some people are reporting delays in getting the flu vaccine in Texas',
 'Texas sees delays amid push for faster vaccine rollout')

# FLUTE

In [1]:
import json, os, re
import pandas as pd
import numpy as np
from glob import glob

In [2]:
def get_premise_hypothesis(txt):
    premises = re.findall(r"premise: (.*?)(?:\n|$|hypothesis)", txt)
    hypotheses = re.findall(r"hypothesis: (.*?)(?:\n|$|premise)", txt)
    premise, hypothesis = "", ""
    if len(premises):
        premise = premises[0]
    if len(hypotheses):
        hypothesis = hypotheses[0]
    return premise.strip(), hypothesis.strip()

In [3]:
TASK = "css_data/flute"
baseline = f"{TASK}/T5-finetune-flute_predict.json"
with open(baseline, "r") as infile:
    base_json = json.load(infile)
with open("css_data/flute/flute-explanation.json", "r") as f:
    flute = pd.DataFrame.from_dict(json.load(f))

In [4]:
premises = []
hypotheses = []
for c in flute.context.values:
    p, h = get_premise_hypothesis(c)
    premises.append(p)
    hypotheses.append(h)
flute["premise"] = premises
flute["hypothesis"] = hypotheses

In [5]:
flute_test = []
for idx, lbl in enumerate(base_json['labels']):
    pred = base_json['predictions'][idx]
    pred_label, pred_expl = re.split(r"[&]+", pred)
    lbl_label, lbl_expl = re.split(r"[&]+", lbl)
    
    consider = flute[flute["additional_labels"]==lbl_expl].copy()
    consider['Generated'] = pred_expl
    consider['Generated_Label'] = pred_label
    consider['Model'] = "baseline"
    consider['Task'] = "flute"
    if len(consider)>0:
        flute_test.append(consider.iloc[0:1])
    else:
        print("no match", lbl)
    #sbic_test.append(None)

In [6]:
hit_df = pd.concat(flute_test)[['premise', 'hypothesis', 'labels', 'additional_labels', 'Generated', 'Generated_Label', 'Model', 'Task']].sample(frac=1, random_state=7).copy()
hit_df.to_csv('hit/input/flute/flute_baseline.csv', index=False)

In [7]:
for fn in glob("css_data/flute/answer-explain*"):
    model = "-".join(fn.split("-")[2:])
    print(model)
    with open(f"css_data/flute/prompts.json-explanation-{model}", "r") as f:
        prompts = json.load(f)
        
    df = pd.read_csv(fn, sep='\t', names=['idx', 'writer_intent', 'Generated'], error_bad_lines=False)

    flute_test = []
    for _, row in df.iterrows():
        if type(row["Generated"])==str:
            pred = row["Generated"].replace("&", "")
            if str(row["idx"]) in prompts:
                prompt = prompts[str(row["idx"])]
                p, h = get_premise_hypothesis(prompt)
                consider = flute[(flute["premise"]==p) & (flute["hypothesis"]==h)].copy() 

                consider['Generated'] = pred
                consider['Model'] = model
                consider['Task'] = "flute"

                if len(consider)>0:
                    flute_test.append(consider.iloc[0:1])
                else:
                    print("\tno match", p, '\t', h)
            else:
                print("Misaligned", model)
                break
    if len(flute_test):
        hit_df = pd.concat(flute_test)[['premise', 'hypothesis', 'labels', 'additional_labels', 'Generated', 'Model', 'Task']].sample(frac=1, random_state=7).copy()
        hit_df.to_csv(f'hit/input/flute/flute_{model}.csv', index=False)

flan-t5-xl




  df = pd.read_csv(fn, sep='\t', names=['idx', 'writer_intent', 'Generated'], error_bad_lines=False)


flan-t5-small




  df = pd.read_csv(fn, sep='\t', names=['idx', 'writer_intent', 'Generated'], error_bad_lines=False)


flan-t5-base




  df = pd.read_csv(fn, sep='\t', names=['idx', 'writer_intent', 'Generated'], error_bad_lines=False)


text-curie-001




  df = pd.read_csv(fn, sep='\t', names=['idx', 'writer_intent', 'Generated'], error_bad_lines=False)


text-davinci-003




  df = pd.read_csv(fn, sep='\t', names=['idx', 'writer_intent', 'Generated'], error_bad_lines=False)


chatgpt
flan-t5-large




  df = pd.read_csv(fn, sep='\t', names=['idx', 'writer_intent', 'Generated'], error_bad_lines=False)


  df = pd.read_csv(fn, sep='\t', names=['idx', 'writer_intent', 'Generated'], error_bad_lines=False)


text-ada-001




  df = pd.read_csv(fn, sep='\t', names=['idx', 'writer_intent', 'Generated'], error_bad_lines=False)


text-babbage-001




  df = pd.read_csv(fn, sep='\t', names=['idx', 'writer_intent', 'Generated'], error_bad_lines=False)


text-davinci-002




  df = pd.read_csv(fn, sep='\t', names=['idx', 'writer_intent', 'Generated'], error_bad_lines=False)


text-davinci-001




  df = pd.read_csv(fn, sep='\t', names=['idx', 'writer_intent', 'Generated'], error_bad_lines=False)


In [4]:
with open('css_data/flute/prompts.json-explanation-text-davinci-003', 'r') as f: 
    dv3 = json.load(f)
df = pd.read_csv("hit/input/flute/flute_text-davinci-003.csv")
#df["Gen_DV3"] = df["Generated"]
df["Generated"] = [None for x in df.Generated.values]
df["Model"] = "chatgpt"
rerun = pd.read_csv('css_data/flute/answer-explanation-chatgpt-rerun', sep='\t', names=["idx", "Generated"])
for _, row in rerun.iterrows():
    prompt = dv3[str(row["idx"])]
    p, h = get_premise_hypothesis(prompt)
    idx = df[(df["premise"]==p) & (df["hypothesis"]==h)]
    if len(idx):
        df.loc[idx.index[0]]["Generated"] = row["Generated"]
df[~df["Generated"].isna()].to_csv("hit/input/flute/flute_chatgpt.csv", index=False)

## Combine

In [5]:
import json, os, re
import pandas as pd
import numpy as np
from glob import glob
from ast import literal_eval

In [2]:
# def build_comparison_rows(df, txt="post", txt2=None, gold="targetStereotype", secondary="targetMinority", 
#                           n=4, literal=True, random_state=7):
    
#     gold_df = {}
#     g = df.iloc[0][gold]
    
#     if literal:
#         g = literal_eval(g)
#         for i, t in enumerate(g):
#             gold_df[i] = {
#                 txt: df.iloc[0][txt],
#                 secondary: df.iloc[0][secondary],
#                 "Generated": t,
#                 "Model": "human",
#                 "Task": df.iloc[0]["Task"]
#             }
#             if txt2:
#                 gold_df[i][txt2] = df.iloc[0][txt2]
#     else:
#         gold_df = {0: {
#                 txt: df.iloc[0][txt],
#                 secondary: df.iloc[0][secondary],
#                 "Generated": g,
#                 "Model": "human",
#                 "Task": df.iloc[0]["Task"]   
#         }}
#         if txt2:
#             gold_df[0][txt2] = df.iloc[0][txt2]
#     gold_df = pd.DataFrame().from_dict(gold_df).T
#     comb = pd.concat([gold_df, df])
    
#     rand = comb.sample(frac=1, random_state=random_state)
#     rows = {}
#     for i in range(0, len(rand)-4, n):
#         row = {}
#         sub = rand.iloc[i:i+4]
#         j = 1
#         for _, r in sub.iterrows():
#             row[txt] = r[txt]
#             if txt2:
#                 row[txt2] = r[txt2]
#             row[secondary] = r[secondary]
#             row[f"Generated_{j}"] = r["Generated"]
#             row[f"Model_{j}"] = r["Model"]
#             row["Task"] = r["Task"]
#             j+=1
#         row[gold] = df.iloc[0][gold]
#         rows[i] = row
        
#     return pd.DataFrame().from_dict(rows).T

In [2]:
def build_comparison_rows(df, txt="post", txt2=None, gold="targetStereotype", secondary="targetMinority", 
                          n=4, literal=True, random_state=7):
    
    gold_df = {}
    g = df.iloc[0][gold]
    
    if literal:
        g = literal_eval(g)
        for i, t in enumerate(g):
            gold_df[i] = {
                txt: df.iloc[0][txt],
                secondary: df.iloc[0][secondary],
                "Generated": t,
                "Model": "human",
                "Task": df.iloc[0]["Task"]
            }
            if txt2:
                gold_df[i][txt2] = df.iloc[0][txt2]
            break
    else:
        gold_df = {0: {
                txt: df.iloc[0][txt],
                secondary: df.iloc[0][secondary],
                "Generated": g,
                "Model": "human",
                "Task": df.iloc[0]["Task"]   
        }}
        if txt2:
            gold_df[0][txt2] = df.iloc[0][txt2]
    gold_df = pd.DataFrame().from_dict(gold_df).T
    comb = pd.concat([gold_df, df])
    
    #print(set(comb.Model.values))
    #comb_alt = comb[[model in {"baseline", "flan-ul2", "text-davinci-003", "chatgpt"} for model in comb.Model.values]]
    comb = comb[[model in {"human", "baseline", "text-davinci-003", "chatgpt"} for model in comb.Model.values]]
    
    
    rand = comb.sample(frac=1)
    row = {}
    for j in range(0, 4):
        try:
            r = rand.iloc[j]
        except:
            print(j, "out of position")
            continue
            return pd.DataFrame()
        
        row[txt] = r[txt]
        if txt2:
            row[txt2] = r[txt2]
        row[secondary] = r[secondary]
        row[f"Generated_{j+1}"] = r["Generated"]
        row[f"Model_{j+1}"] = r["Model"]
        row["Task"] = r["Task"]
        row[gold] = df.iloc[0][gold]
    
    return pd.DataFrame().from_dict({0: row}).T

In [4]:
TASK = 'sbic'
txt="post"
gold="targetStereotype"
secondary="targetMinority"
n=4
literal=True
random_state=7

comb = []
for fn in glob(f"hit/input/{TASK}/*.csv"):
    comb.append(pd.read_csv(fn))
df = pd.concat(comb)

comparison_rows = []
for lbl in set(df[txt]):
    consider = df[df[txt]==lbl].copy()
    comparison_rows.append(build_comparison_rows(consider,
                                                 txt=txt,
                                                 gold=gold,
                                                 secondary=secondary,
                                                 n=n,
                                                 literal=literal,
                                                 random_state=random_state
                                                ))
    
comparison_df = pd.concat(comparison_rows)
comparison_df.sample(frac=1, random_state=random_state).to_csv(f"hit/input/comparison_balanced_{TASK}.csv", index=False)

In [5]:
comparison_df

Unnamed: 0,Generated_1,Generated_2,Generated_3,Generated_4,Model_1,Model_2,Model_3,Model_4,Task,post,targetMinority,targetStereotype
0,autistic people ahve potatoes for brains,autistic people are not worth anything,Autistic individuals are often stereotyped as ...,Autistic people are seen as being clumsy and a...,human,baseline,chatgpt,text-davinci-003,sbic,What do you get when an Autistic child gets in...,"[""folks with mental illness/disorder"", ""folks ...","[""autistic people ahve potatoes for brains"", ""..."
0,People with special needs are often seen as in...,Special education students are often perceived...,deserves to be made fun of,are inferior to the rest,text-davinci-003,chatgpt,human,baseline,sbic,Special ed classes are superior to the rest in...,"[""mentally disabled folks""]","[""deserves to be made fun of"", ""folks with men..."
0,Black people are unfairly targeted by law enfo...,black people are hated by the police,black folks are inferior,Police are biased towards white people,text-davinci-003,human,baseline,chatgpt,sbic,A white man and a black man play hide and seek...,"[""black folks"", ""black folks, native american/...","[""black people are hated by the police"", ""blac..."
0,whites are not afraid of other races,black folks are ignorant,White people are racially unaware,White people are racially unaware and need to ...,human,baseline,text-davinci-003,chatgpt,sbic,"No, lets stay scattered across the earth and t...","[""white folks""]","[""whites are not afraid of other races"", ""whit..."
0,Feminists are focused solely on highlighting m...,women are murderers,all feminists think men are murderers,Women are violent and dangerous,chatgpt,baseline,human,text-davinci-003,sbic,what is the point of feminism ? to remind us h...,"[""Feminists"", ""feminists"", ""women""]","[""all feminists think men are murderers"", ""all..."
...,...,...,...,...,...,...,...,...,...,...,...,...
0,This sentence communicates the stereotype that...,jewish people are inferior,jewish people are inferior,The stereotype communicated is that Jews are n...,chatgpt,human,baseline,text-davinci-003,sbic,Why do showerheads always have more than 11 ho...,"[""jewish folks""]","[""jewish people are inferior"", ""jewish people ..."
0,Women are often seen as inferior to computers ...,Women are not capable of processing or retaini...,women are beat up,women are stupid,text-davinci-003,chatgpt,human,baseline,sbic,What’s the difference between a women and a co...,"[""women""]","[""women are beat up"", ""women are dumb"", ""women..."
0,black folk sell drugs,African Americans are limited to menial labor ...,This sentence communicates the stereotype that...,black folks are a waste of time,human,text-davinci-003,chatgpt,baseline,sbic,Why do they put cotton at the top of pill bott...,"[""black folks""]","[""black folk sell drugs"", ""black folks pick co..."
0,Mexicans are seen as inferior to Americans,are not good citizens,Mexicans are often stereotyped as manual labor...,latino folks are always doing unskilled labor.,text-davinci-003,baseline,chatgpt,human,sbic,A Mexican and an American jump off a skyscrape...,"[""Mexicans"", ""latino/latina folks""]","[""latino folks are always doing unskilled labo..."


In [6]:
TASK = 'mrf'
txt="headline"
gold="writer_intent"
secondary="misinfo"
n=4
literal=True
random_state=7

comb = []
for fn in glob(f"hit/input/{TASK}/*.csv"):
    comb.append(pd.read_csv(fn))
df = pd.concat(comb)

comparison_rows = []
for lbl in set(df[txt]):
    consider = df[df[txt]==lbl].copy()
    cr = build_comparison_rows(consider,
                                 txt=txt,
                                 gold=gold,
                                 secondary=secondary,
                                 n=n,
                                 literal=literal,
                                 random_state=random_state
                                )
    if len(cr):
        comparison_rows.append(cr)
    
comparison_df = pd.concat(comparison_rows)
comparison_df.sample(frac=1, random_state=random_state).to_csv(f"hit/input/comparison_balanced_{TASK}.csv", index=False)

2 out of position
3 out of position
3 out of position
3 out of position
2 out of position
3 out of position
3 out of position
2 out of position
3 out of position
3 out of position
2 out of position
2 out of position
3 out of position
2 out of position
3 out of position
3 out of position
3 out of position
2 out of position
2 out of position
3 out of position
3 out of position
3 out of position
3 out of position
2 out of position
3 out of position
2 out of position
3 out of position
3 out of position
3 out of position
3 out of position
3 out of position
3 out of position
3 out of position
3 out of position
2 out of position
3 out of position
3 out of position
2 out of position
2 out of position
2 out of position
2 out of position
3 out of position
2 out of position
2 out of position
3 out of position
3 out of position
2 out of position
3 out of position
3 out of position
3 out of position
3 out of position
2 out of position
3 out of position
3 out of position
2 out of position
3 out of p

3 out of position
3 out of position
3 out of position
3 out of position
3 out of position
3 out of position
3 out of position
3 out of position
3 out of position
3 out of position
3 out of position
2 out of position
3 out of position
3 out of position
2 out of position
3 out of position
3 out of position
3 out of position
2 out of position
3 out of position
3 out of position
3 out of position
3 out of position
3 out of position
3 out of position
3 out of position
3 out of position
2 out of position
3 out of position
3 out of position
3 out of position
3 out of position
3 out of position
2 out of position
2 out of position
3 out of position
3 out of position
2 out of position
3 out of position
3 out of position
3 out of position
3 out of position
3 out of position
3 out of position
2 out of position
3 out of position
3 out of position
3 out of position
3 out of position
3 out of position
3 out of position
2 out of position
2 out of position
3 out of position
3 out of position
3 out of p

In [7]:
comparison_df

Unnamed: 0,Generated_1,Generated_2,Generated_3,Generated_4,Model_1,Model_2,Model_3,Model_4,Task,headline,misinfo,writer_intent
0,Youth are taking action on climate change\n289...,the next generation cares more about the envir...,'young people are leading the charge for clima...,Youth are taking the initiative on climate change,chatgpt,human,baseline,text-davinci-003,mrf,Youth leading the charge for climate action at...,real,['the next generation cares more about the env...
0,End of total quarantine,Restrictions are easing,the pandemic is over,'the end of total quarantine',chatgpt,text-davinci-003,human,baseline,mrf,The end of total quarantine: the Cabinet of Mi...,misinfo,['the pandemic is over']
0,"""Vaccine has potential risks",'covid vaccine is dangerous',covid vaccines are dangerous,"The implied message is ""vaccines are not 100% ...",chatgpt,baseline,human,text-davinci-003,mrf,Two persons died after receiving the covid vac...,misinfo,"['covid vaccines are dangerous', 'the vaccine ..."
0,Government provided financial support during p...,The government is providing financial support,"'canadians were given $2,000 a month by the go...",us government aid is paltry.,chatgpt,text-davinci-003,baseline,human,mrf,"Canadians were given $2,000 a month by the gov...",misinfo,"['us government aid is paltry.', 'the amount o..."
0,a movie is out about climate change called cli...,Dr,Climate Hustle 2 is praised by Dr,'climate change is a big problem',human,text-davinci-003,chatgpt,baseline,mrf,Physicist Dr. Lubos Motl reviews the 'excellen...,misinfo,['a movie is out about climate change called c...
...,...,...,...,...,...,...,...,...,...,...,...,...
0,'the EU leaders agree to deeper cuts to greenh...,The message is: Take action now,the body thinks current standards weren't aggr...,Deeper emissions cuts are necessary,baseline,text-davinci-003,human,chatgpt,mrf,EU leaders agree to deeper cuts to greenhouse ...,real,"[""the body thinks current standards weren't ag..."
0,'colombia bought astrazeneca vaccines for 21 usd',the columbian government believes the astrazen...,The cost of vaccines matters,Colombia buys Astrazeneca vaccines at $21 per ...,baseline,human,text-davinci-003,chatgpt,mrf,The Colombian government bought the astrazenec...,misinfo,['the columbian government believes the astraz...
0,"The implied message is ""vaccines come with risks",vaccines are dangerous,'the vaccine is not safe',Vaccine testing comes with risks\n396\t['polic...,text-davinci-003,human,baseline,chatgpt,mrf,Death continues to be a regular guest at those...,misinfo,"['vaccines are dangerous', 'vaccines can cause..."
0,The implied message is: Climate change is a se...,a third of uk's produce comes from countries a...,'one-third of UK's fruit and vegetables import...,Climate change threatens UK's imported produce,text-davinci-003,human,baseline,chatgpt,mrf,One-third of UK's fruit and vegetables importe...,real,"[""a third of uk's produce comes from countries..."


In [3]:
TASK = 'flute'
txt="premise"
txt2="hypothesis"
gold="additional_labels"
secondary="labels"
n=4
literal=False
random_state=7

comb = []
for fn in glob(f"hit/input/{TASK}/*.csv"):
    if 'flan' in fn:
        continue
    comb.append(pd.read_csv(fn))
df = pd.concat(comb)

comparison_rows = []
phs = set([(row['premise'],row['hypothesis']) for _, row in df.iterrows()])
for p,h in phs:
    consider = df[(df['premise']==p) & (df['hypothesis']==h)].copy()
    cr = build_comparison_rows(consider,
                                 txt=txt,
                                 txt2=txt2,
                                 gold=gold,
                                 secondary=secondary,
                                 n=n,
                                 literal=literal,
                                 random_state=random_state
                                )
    if len(cr):
        comparison_rows.append(cr)
    
comparison_df = pd.concat(comparison_rows)
comparison_df.sample(frac=1, random_state=random_state).to_csv(f"hit/input/comparison_balanced_{TASK}.csv", index=False)

2 out of position
3 out of position
3 out of position
2 out of position
3 out of position
3 out of position
2 out of position
3 out of position
2 out of position
3 out of position
2 out of position
3 out of position
2 out of position
3 out of position
2 out of position
3 out of position
2 out of position
3 out of position
2 out of position
3 out of position
3 out of position
3 out of position
3 out of position
3 out of position
3 out of position
3 out of position
3 out of position
3 out of position
2 out of position
3 out of position
3 out of position
2 out of position
3 out of position
3 out of position
2 out of position
3 out of position
3 out of position
2 out of position
3 out of position
2 out of position
3 out of position
2 out of position
3 out of position
3 out of position
2 out of position
3 out of position
2 out of position
3 out of position
3 out of position
3 out of position
2 out of position
3 out of position
2 out of position
3 out of position
2 out of position
3 out of p

In [4]:
comparison_df

Unnamed: 0,Generated_1,Generated_2,Generated_3,Generated_4,Model_1,Model_2,Model_3,Model_4,Task,additional_labels,hypothesis,labels,premise
0,The good samaritan is someone who does not hel...,The figurative language in the hypothesis is a...,"The phrase ""good samaritan"" in the hypothesis ...",A good Samaritan is a person who selflessly he...,baseline,text-davinci-003,text-davinci-003,human,flute,A good Samaritan is a person who selflessly he...,And we shouldnt forget how jesus explained tha...,Contradiction,And we shouldn't forget how Jesus explained th...
0,The weather is often very depressing and so it...,The figurative language in the hypothesis is s...,Bad weather is not a nice thing because it can...,The hypothesis is using figurative language to...,baseline,text-davinci-003,human,text-davinci-003,flute,Bad weather is not a nice thing because it can...,Isn't it nice when the bad weather is so depre...,Contradiction,The weather is terrible here today and its has...
0,The hypothesis is using figurative language to...,A social anxiety is a very serious problem and...,People with social anxiety often feel overwhel...,The figurative language in the hypothesis is s...,text-davinci-003,baseline,human,text-davinci-003,flute,People with social anxiety often feel overwhel...,I was so happy with myself when my social anxi...,Contradiction,My social anxiety acted up and I failed at doi...
0,"The figurative language in the hypothesis is ""...",Airing one's opinion means to speak out about ...,To aire one's opinion on welfare means to give...,"The phrase ""aired her opinions"" is used figura...",text-davinci-003,human,baseline,text-davinci-003,flute,Airing one's opinion means to speak out about ...,She aired her opinions on welfare.,Entailment,She openly give her point of view on the less ...
0,The hypothesis states that the Senator was hos...,The Senator attacked the administration in his...,There is no figurative language in the hypothesis,The Senator was critical of the administration...,text-davinci-003,baseline,text-davinci-003,human,flute,The Senator was critical of the administration...,The Senator attacked the administration in his...,Entailment,The Senator criticized the administration in h...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,A photo of someone from 4 years ago is often a...,The hypothesis is figuratively expressing that...,Someone gaining weight over a period of 4 year...,There is no figurative language in the hypothe...,baseline,text-davinci-003,human,text-davinci-003,flute,Someone gaining weight over a period of 4 year...,After seeing a photo of myself from 4 years ag...,Contradiction,I'm feeling pretty bummed after seeing a photo...
0,The hypothesis implies that no matter how ofte...,The figurative language in the hypothesis is h...,Most people would think that it is not okay to...,,text-davinci-003,text-davinci-003,human,,flute,Most people would think that it is not okay to...,It's totally acceptable how often I have to te...,Contradiction,I am so tired of telling my teenage daughter t...
0,To be quickly absorbed into society means to b...,cast out typically connotes something that is ...,"The phrase ""quickly absorbed"" in the hypothesi...","The figurative language in the hypothesis is ""...",baseline,human,text-davinci-003,text-davinci-003,flute,cast out typically connotes something that is ...,The immigrants were quickly absorbed into soci...,Contradiction,The immigrants were slowly cast out of society.
0,The hypothesis is using figurative language to...,The constant barking of a dog can be really ir...,The figurative language in the hypothesis is h...,Bobbing at noises is not a peaceful thing to d...,text-davinci-003,human,text-davinci-003,baseline,flute,The constant barking of a dog can be really ir...,My dog barks at every noise he hears and it's ...,Contradiction,My dog wont stop barking at every little noise.


In [6]:
TASK = 'flute'
pd.read_csv(f"hit/input/comparison_{TASK}.csv")

Unnamed: 0,Generated_1,Generated_2,Generated_3,Generated_4,Model_1,Model_2,Model_3,Model_4,Task,additional_labels,hypothesis,labels,premise
0,The hypothesis is that the less fortunate have...,The hypothesis is an entailment of the premise...,Airing one's opinion means to speak out about ...,The relationship between the hypothesis and th...,text-babbage-001,text-davinci-002,human,text-ada-001,flute,Airing one's opinion means to speak out about ...,She aired her opinions on welfare.,Entailment,She openly give her point of view on the less ...
1,The hypothesis is that the pain is coming from...,The hypothesis contradicts the premise because...,Experiencing back pain is never a good thing a...,The relationship between the premise and the h...,text-babbage-001,text-davinci-002,human,text-ada-001,flute,Experiencing back pain is never a good thing a...,I've been having this back pain for a while no...,Contradiction,My back has been having sharp random pains thr...
2,The hypothesis is that Mella orders while bobb...,The figurative language in the hypothesis is t...,"No sweat means not a problem or difficulty, bu...",The relationship between the premise and the h...,text-babbage-001,text-davinci-002,human,text-ada-001,flute,"No sweat means not a problem or difficulty, bu...","Mella orders, while bobbing her skull up and d...",Contradiction,"Mella orders, while bobbing her skull up and d..."
3,The hypothesis suggests that the smile was a s...,The figurative language in the hypothesis is t...,"An angel is a being of divine nature, and they...",The relationship between the premise and the h...,text-babbage-001,text-davinci-002,human,text-ada-001,flute,"An angel is a being of divine nature, and they...","I looked down at him, and he smiled at me like...",Contradiction,He smiled treacherously up at me.
4,The hypothesis suggests that the protagonist f...,The hypothesis states that the person feels te...,It is natural to feel bad when you accidentall...,The relationship between the premise and the h...,text-babbage-001,text-davinci-002,human,text-ada-001,flute,It is natural to feel bad when you accidentall...,I felt terrible when I accidentally bumped int...,Entailment,I accidentally bumped into someone at the mall...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
493,The hypothesis implies that the flowers stoppe...,The hypothesis describes the flowers as if the...,"To dance means to move nimbly or gracefully, w...",The flowers dance in the gentle breeze,text-babbage-001,text-davinci-002,human,text-ada-001,flute,"To dance means to move nimbly or gracefully, w...",The flowers danced in the gentle breeze.,Contradiction,The flowers stopped in the gentle breeze.
494,The hypothesis suggests that the gaze is like ...,The hypothesis describes the premise using fig...,A strong flame is usually associated with bein...,The hypothesis uses a figurative language to d...,text-babbage-001,text-davinci-002,human,text-ada-001,flute,A strong flame is usually associated with bein...,"His gaze was like a steady, strong flame .",Contradiction,His gaze was neither blazing nor compassionate.
495,The hypothesis suggests that the best friend w...,The hypothesis is hyperbolic and is not meant ...,The beauty of Tahiti is often mentioned and th...,"The hypothesis uses a metaphor when it says ""M...",text-babbage-001,text-davinci-002,human,text-ada-001,flute,The beauty of Tahiti is often mentioned and th...,Tahiti is the most beautiful place on Earth an...,Contradiction,My best friend is going to Tahiti with his fam...
496,The hypothesis is that people around the apart...,The hypothesis is a metaphor for how the peopl...,Cigarette butts are often unsightly and produc...,The relationship between the premise and the h...,text-babbage-001,text-davinci-002,human,text-ada-001,flute,Cigarette butts are often unsightly and produc...,It's not only beautiful but also cool how peop...,Contradiction,I'm so mad at people around my apartment just ...


In [7]:
TASK = 'sbic'
pd.read_csv(f"hit/input/comparison_{TASK}.csv")

Unnamed: 0,post,targetMinority,Generated_1,Model_1,Task,Generated_2,Model_2,Generated_3,Model_3,Generated_4,Model_4,targetStereotype
0,What was the hardest part of the holocaust? My...,"[""Holocaust victims"", ""jewish folks""]",a group of people,flan-t5-small,sbic,The stereotype communicated by this sentence i...,text-davinci-001,"I'm sorry, I cannot provide an explanation for...",chatgpt,Jews are often viewed as victims of the Holocaust,text-davinci-003,"[""jewish people are inferior"", ""jewish people ..."
1,What was the hardest part of the holocaust? My...,"[""Holocaust victims"", ""jewish folks""]",This sentence communicates the stereotype that...,text-davinci-002,sbic,trivializes harm to victims,human,jewish people are unwanted,human,jewish people are inferior,human,"[""jewish people are inferior"", ""jewish people ..."
2,What was the hardest part of the holocaust? My...,"[""Holocaust victims"", ""jewish folks""]",The sentence communicates that the Holocaust w...,text-curie-001,sbic,The hardest part of the holocaust was the geno...,text-babbage-001,The hardest part of the holocaust was the lack...,text-ada-001,[GROUP] are sexist.,flan-ul2,"[""jewish people are inferior"", ""jewish people ..."
3,What was the hardest part of the holocaust? My...,"[""Holocaust victims"", ""jewish folks""]",Nazi,flan-t5-large,sbic,a dick,flan-t5-base,are dick,baseline,Jews are a hateful people,flan-t5-xxl,"[""jewish people are inferior"", ""jewish people ..."
4,"A Cop Told A Suspect To Put His Hands Up, And ...","[""black folks""]","A cop told a suspect to put his hands up, and ...",flan-t5-small,sbic,This sentence communicates the stereotype that...,text-davinci-001,[GROUP] are a group of people,flan-t5-xl,"The sentence ""Cop told suspect to put his hand...",text-ada-001,"[""black folks are inferior"", ""black folks dese..."
...,...,...,...,...,...,...,...,...,...,...,...,...
1912,we should not help poor african children\nthey...,"[""African folks""]",The stereotype communicated by this sentence i...,text-curie-001,sbic,This sentence communicates the stereotype that...,text-davinci-003,[GROUP] are funny.,flan-ul2,poor,flan-t5-xxl,"[""african folks don't deserve to eat"", ""africa..."
1913,What do you call a woman who speaks their mind...,"[""women""]",shouldn't speak,human,sbic,women are c*nts,human,a woman who speaks her mind,flan-t5-large,This sentence communicates the stereotype that...,chatgpt,"[""end up dead"", ""shouldn't speak"", ""women are ..."
1914,What do you call a woman who speaks their mind...,"[""women""]",end up dead,human,sbic,[GROUP] are a group of people who are buried i...,flan-t5-xl,a woman who speaks their mind is a cunt,flan-t5-xxl,Women are often seen as being too outspoken or...,text-davinci-003,"[""end up dead"", ""shouldn't speak"", ""women are ..."
1915,What do you call a woman who speaks their mind...,"[""women""]",women are sex slaves,baseline,sbic,women,flan-t5-base,The stereotype that this sentence communicates...,text-davinci-001,The speaker is an individual who is passionate...,text-ada-001,"[""end up dead"", ""shouldn't speak"", ""women are ..."
