In [1]:
import pandas as pd
from pathlib import Path
from tqdm import tqdm

In [2]:
# Build the path to the selected dataset's test.csv under the data directory.
dirpath = Path("../data")
dataset = "aggression"
filepath = dirpath.joinpath(dataset, "test.csv")
# Load the CSV and show its (rows, columns) shape as the cell output.
df = pd.read_csv(filepath)
df.shape

(325339, 10)

In [3]:
def get_columns(dataset):
    """Look up the column layout for a named dataset.

    Parameters
    ----------
    dataset : str
        Dataset name; the recognised values are "go_emo", "unhealthy",
        "docanno" and "aggression".

    Returns
    -------
    tuple or None
        ``(label_columns, annotator_column, text_column)``, where
        ``label_columns`` is a freshly built, alphabetically sorted list of
        label column names. ``None`` is returned for any other ``dataset``.
    """
    # Each entry: (dataset name, label columns, annotator id column, text column).
    layouts = (
        (
            "go_emo",
            (
                'admiration', 'amusement', 'anger', 'annoyance', 'approval',
                'caring', 'confusion', 'curiosity', 'desire', 'disappointment',
                'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear',
                'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism',
                'pride', 'realization', 'relief', 'remorse', 'sadness',
                'surprise', 'neutral',
            ),
            "rater_id",
            "text",
        ),
        (
            "unhealthy",
            (
                "antagonize", "condescending", "dismissive", "generalisation",
                "generalisation_unfair", "healthy", "hostile", "sarcastic",
            ),
            "_worker_id",
            "comment",
        ),
        (
            "docanno",
            (
                'inspiring', 'interesting', 'offensive_to_someone', 'negative',
                'offensive_to_me', 'political', 'positive', 'sadness', 'calm',
                'fear', 'compassion', 'disgust', 'vulgar', 'surprise',
                'embarrasing', 'anger', 'understandable', 'ironic',
                'need_more_information', 'happiness', 'delight',
                'funny_to_someone', 'funny_to_me',
            ),
            "user_id",
            "text",
        ),
        (
            "aggression",
            ("aggression",),
            "worker_id",
            "comment",
        ),
    )
    for name, labels, annotator, text in layouts:
        if dataset == name:
            return sorted(labels), annotator, text
    # Unknown dataset name: no layout available.
    return None

In [4]:
# get_columns returns None for a dataset name it does not know; check for that
# explicitly so the failure names the offending dataset instead of surfacing as
# an opaque "cannot unpack non-iterable NoneType" error.
columns = get_columns(dataset)
if columns is None:
    raise ValueError(f"Unsupported dataset: {dataset!r}")
label_columns, annotator_column, text_column = columns
# Number of (text, response) example pairs to attach to every row.
num_shots = 2

In [8]:
def parse_annotation_to_text(sample):
    """Render one annotated row as a response string.

    Reads the module-level ``dataset`` and ``label_columns``.

    Parameters
    ----------
    sample : mapping
        Row-like object indexable by label column name.

    Returns
    -------
    str
        For the "aggression" dataset: "true" when the first label column
        equals 1, otherwise "false". For any other dataset: the names of all
        label columns whose value equals 1, joined with ", " (an empty string
        when none do).
    """
    if dataset != "aggression":
        active_labels = []
        for name in label_columns:
            if sample[name] == 1:
                active_labels.append(name)
        return ", ".join(active_labels)

    # Single binary label: report it as a lowercase boolean word.
    return "true" if sample[label_columns[0]] == 1 else "false"

In [9]:
def get_examples(df, sample, num_shots=2, random_state=None):
    """Pick few-shot examples written by the same annotator as ``sample``.

    Reads the module-level ``annotator_column`` and ``text_column`` and calls
    ``parse_annotation_to_text`` on every chosen row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to draw examples from.
    sample : pandas.Series
        Row to find examples for. It must carry an ``"index"`` field holding
        the row's index label in ``df`` (e.g. produced by ``df.reset_index()``)
        so the row itself can be excluded from its own examples.
    num_shots : int, default 2
        Number of examples requested.
    random_state : optional
        Forwarded to ``DataFrame.sample``; pass a seed for reproducible
        draws. The default ``None`` keeps the draw unseeded.

    Returns
    -------
    list
        A flat list of length ``2 * num_shots``:
        ``[text_1, response_1, text_2, response_2, ...]``. When the annotator
        has fewer than ``num_shots`` other rows, the tail is padded with
        ``None``.
    """
    same_annotator = df[df[annotator_column] == sample[annotator_column]]
    # A missing (NaN) annotator id never compares equal, not even to itself,
    # so the current row may be absent from the selection. errors="ignore"
    # keeps that case from raising KeyError; the row then gets all-None padding.
    candidates = same_annotator.drop(sample["index"], errors="ignore")

    # Take num_shots rows, or every available row (shuffled) if there are fewer.
    n_take = min(num_shots, candidates.shape[0])
    chosen = candidates.sample(n=n_take, random_state=random_state)

    flat = []
    for _, example in chosen.iterrows():
        flat.append(example[text_column])
        flat.append(parse_annotation_to_text(example))

    # Pad so every row yields the same number of values.
    missing = 2 * num_shots - len(flat)
    if missing > 0:
        flat.extend([None] * missing)
    return flat

In [10]:
# Register progress_apply on pandas objects.
tqdm.pandas()
# Column names in the order get_examples emits its values:
# example1, example1_response, example2, example2_response, ...
new_columns = [
    name
    for i in range(num_shots)
    for name in (f"example{i+1}", f"example{i+1}_response")
]
# reset_index() exposes each row's index label as an "index" field, which
# get_examples uses to exclude the row from its own examples. num_shots is
# passed through (rather than a literal) so the number of returned values
# always matches len(new_columns).
df[new_columns] = df.reset_index().progress_apply(
    lambda sample: get_examples(df, sample, num_shots),
    axis=1,
    result_type="expand",
)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 325339/325339 [03:55<00:00, 1378.60it/s]


In [11]:
# Display the first rows of the frame as the cell output.
df.head()

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split,worker_id,aggression,aggression_score,example1,example1_response,example2,example2_response
0,155243,:If I may butt in I've spent the last 1/4 hou...,2002,True,user,random,test,144,0.0,0.0,RFA thanks Thanks for supporting me in my RFA.,False,a quick questionHi Jacob- As you know I am no ...,True
1,155243,:If I may butt in I've spent the last 1/4 hou...,2002,True,user,random,test,202,0.0,0.0,SirI am not a vandal. I am only trying to make...,False,:No problem. It was kinda fun once I got the h...,False
2,155243,:If I may butt in I've spent the last 1/4 hou...,2002,True,user,random,test,214,1.0,-1.0,Very good. You have finally replaced the black...,False,:::Please answer my questions. Please don't co...,False
3,155243,:If I may butt in I've spent the last 1/4 hou...,2002,True,user,random,test,240,0.0,0.0,::Not really this is my home and the other was...,False,I used to like Wikipedia but its assholes li...,True
4,155243,:If I may butt in I've spent the last 1/4 hou...,2002,True,user,random,test,297,0.0,0.0,"`, 19 December 2007 (UTC):Alex, it's these kin...",False,X-Men: The Last Stand Please don't alter the ...,False


In [12]:
# NOTE: filepath is the same path the frame was read from, so this overwrites
# the input CSV in place. index=False leaves the row index out of the file.
df.to_csv(filepath, index=False)