In [1]:
import pandas as pd
from pathlib import Path
from tqdm import tqdm

In [2]:
# Pick the dataset and load its test split from the shared data directory.
dataset = "unhealthy"
dirpath = Path("../data")
filepath = dirpath.joinpath(dataset, "test.csv")
df = pd.read_csv(filepath)
df.shape

(21406, 12)

In [3]:
def get_columns(dataset):
    """Look up the column schema for a named dataset.

    Args:
        dataset: Dataset identifier; "go_emo", "unhealthy" and "docanno"
            are recognised.

    Returns:
        A 3-tuple ``(label_columns, annotator_column, text_column)`` where
        ``label_columns`` is an alphabetically sorted list of label column
        names, or ``None`` when the dataset name is not recognised.
    """
    # One entry per dataset: (name, label columns, annotator id column, text column).
    schemas = (
        (
            "go_emo",
            [
                "admiration", "amusement", "anger", "annoyance", "approval",
                "caring", "confusion", "curiosity", "desire", "disappointment",
                "disapproval", "disgust", "embarrassment", "excitement", "fear",
                "gratitude", "grief", "joy", "love", "nervousness", "optimism",
                "pride", "realization", "relief", "remorse", "sadness",
                "surprise", "neutral",
            ],
            "rater_id",
            "text",
        ),
        (
            "unhealthy",
            [
                "antagonize", "condescending", "dismissive", "generalisation",
                "generalisation_unfair", "healthy", "hostile", "sarcastic",
            ],
            "_worker_id",
            "comment",
        ),
        (
            "docanno",
            [
                "inspiring", "interesting", "offensive_to_someone", "negative",
                "offensive_to_me", "political", "positive", "sadness", "calm",
                "fear", "compassion", "disgust", "vulgar", "surprise",
                "embarrasing", "anger", "understandable", "ironic",
                "need_more_information", "happiness", "delight",
                "funny_to_someone", "funny_to_me",
            ],
            "user_id",
            "text",
        ),
    )
    # Compare with == (not a dict lookup) so any input type is tolerated.
    for name, labels, annotator_col, text_col in schemas:
        if dataset == name:
            return sorted(labels), annotator_col, text_col
    return None

In [4]:
# get_columns returns None for an unrecognised dataset name; fail with a clear
# message instead of an opaque "cannot unpack NoneType" TypeError.
_schema = get_columns(dataset)
if _schema is None:
    raise ValueError(f"Unknown dataset: {dataset!r}")
label_columns, annotator_column, text_column = _schema
# Number of (example text, example response) pairs attached to every row.
num_shots = 2

In [5]:
def parse_annotation_to_text(sample):
    """Render one annotation row as a comma-separated list of its active labels.

    Args:
        sample: Mapping-like row indexable by every name in the module-level
            ``label_columns``.

    Returns:
        The names of the labels whose value equals 1, joined by ", " in the
        order of ``label_columns``; an empty string when none are set.
    """
    active = []
    for name in label_columns:
        if sample[name] == 1:
            active.append(name)
    return ", ".join(active)

In [6]:
def get_examples(df, sample, num_shots=2, random_state=None):
    """Draw few-shot examples from the other rows of the same annotator.

    Args:
        df: Frame holding all annotations. It is filtered on the module-level
            ``annotator_column`` and read through ``text_column`` and the
            label columns.
        sample: The row being processed. Must provide the annotator id and an
            "index" entry holding the row's label in ``df`` (as produced by
            ``df.reset_index()``), so the row itself can be excluded.
        num_shots: Number of (text, response) pairs to return.
        random_state: Optional seed or random state forwarded to
            ``DataFrame.sample`` for reproducible draws. ``None`` (default)
            keeps the previous unseeded behaviour.

    Returns:
        A flat list of length ``2 * num_shots`` laid out as
        ``[text1, response1, text2, response2, ...]``. When the annotator has
        fewer than ``num_shots`` other rows, the tail is padded with ``None``.
    """
    same_annotator = df[df[annotator_column] == sample[annotator_column]]
    # Never offer the row itself as its own example.
    pool = same_annotator.drop(sample["index"])
    # Drawing min(num_shots, len(pool)) rows covers both the "enough rows" and
    # the "too few rows" case (the latter just shuffles everything available).
    n_draw = min(num_shots, len(pool))
    examples = pool.sample(n=n_draw, random_state=random_state)

    flat = []
    for _, ex in examples.iterrows():
        flat.append(ex[text_column])
        flat.append(parse_annotation_to_text(ex))
    # Pad so every call yields the same number of values for column expansion.
    flat.extend([None] * (2 * num_shots - len(flat)))
    return flat

In [7]:
# Register DataFrame.progress_apply so the row-wise pass shows a progress bar.
tqdm.pandas()
# Column names come in (text, response) pairs, in the same order in which
# get_examples lays out its returned list.
new_columns = [
    name
    for i in range(num_shots)
    for name in (f"example{i+1}", f"example{i+1}_response")
]
# reset_index() exposes each row's label as an "index" entry, which
# get_examples uses to exclude the row from its own example pool.
# num_shots is passed through (rather than a literal 2) so the number of
# returned values always matches len(new_columns).
df[new_columns] = df.reset_index().progress_apply(
    lambda sample: get_examples(df, sample, num_shots),
    axis=1,
    result_type="expand",
)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21406/21406 [00:11<00:00, 1920.61it/s]


In [8]:
# Preview the first rows to check the example columns that were just added.
df.head()

Unnamed: 0,_unit_id,comment,_trust,_worker_id,antagonize,condescending,dismissive,generalisation,generalisation_unfair,healthy,hostile,sarcastic,example1,example1_response,example2,example2_response
0,2327213000.0,"No, Olivia Chow has nothing to do with David M...",0.9667,18960682,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,Maybe Canada should offer to bury this terrori...,healthy,Look at the facts. What was the debt pre PM Ha...,healthy
1,1739445000.0,"Comonsense, he disqualified himself when he fl...",0.9929,44022448,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,Where do you think the money comes from? From ...,healthy,I think this writer read too much in Redford's...,healthy
2,1800633000.0,"Thanks for your considered opinion, so tempera...",0.9919,44083576,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...attacking the spouse or family of a politic...,healthy,Jordan - impressive breakdown of the problem b...,healthy
3,1739443000.0,When you start worrying about the 'human right...,0.95,44556582,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,Hi Broom Closet. I thought you were going to g...,healthy,where did I apologize? Conservatives always si...,healthy
4,1739442000.0,Like watching a class c Hollywood production t...,0.9929,44022448,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,Absolutely spot on. But don't expect the flogg...,healthy,I think this writer read too much in Redford's...,healthy


In [9]:
# NOTE: filepath is the same CSV the frame was loaded from, so this overwrites
# the input file with the augmented frame. index=False omits the row index.
df.to_csv(filepath, index=False)