In [1]:
import os


def read_text(file_id, base_path='data/EN/raw-documents'):
    with open(os.path.join(base_path, f'{file_id}'), 'r', encoding='utf-8') as f:
        return f.read()

In [2]:
import pandas as pd
# method to prepare a label dataset for training
def prepare_dataset(file, subnarrative):
    df = pd.read_csv(file, sep='\t', header=None)
    df.columns = ['file_id', 'narratives', 'subnarratives']
    
    df['narratives'] = df['narratives'].apply(lambda x: x.split(';'))
    df['subnarratives'] = df['subnarratives'].apply(lambda x: x.split(';'))

    # get only unique values
    df['narratives'] = df['narratives'].apply(lambda x: list(set(x)))
    df['subnarratives'] = df['subnarratives'].apply(lambda x: list(set(x)))
    
    df['label'] = df['subnarratives'].apply(lambda x: 1 if subnarrative in x else 0)
    df['text'] = df['file_id'].apply(lambda x: read_text(x))
    
    df = df[['text', 'label']]
    
    return df

print(os.getcwd())

# method to extract the list of all unique subnarratives
def extract_subnarratives(file):
    df = pd.read_csv(file, sep='\t', header=None)
    df.columns = ['file_id', 'narratives', 'subnarratives']
    
    df['subnarratives'] = df['subnarratives'].apply(lambda x: x.split(';'))
    
    subnarratives = set()
    for i in df['subnarratives']:
        subnarratives.update(i)

    return list(subnarratives)


/home/twoface/PHD-Track


In [3]:
subnarratives = extract_subnarratives('data/EN/subtask-2-annotations.txt')

# prepare dataset for each subnarrative
datasets = {}

for subnarrative in subnarratives:
    df = prepare_dataset('data/EN/subtask-2-annotations.txt', subnarrative)
    datasets[subnarrative] = df

# print the number of labels at true for each subnarrative
for subnarrative, df in datasets.items():
    print(f'{subnarrative}: {df["label"].sum()}')

CC: Hidden plots by secret schemes of powerful groups: Other: 8
CC: Amplifying Climate Fears: Other: 1
CC: Questioning the measurements and science: Data shows no temperature increase: 2
URW: Discrediting Ukraine: Rewriting Ukraine’s history: 1
CC: Criticism of climate movement: Ad hominem attacks on key activists: 16
Other: 169
CC: Downplaying climate change: Humans and nature will adapt to the changes: 1
URW: Amplifying war-related fears: Russia will also attack other countries: 12
URW: Discrediting the West, Diplomacy: Other: 26
URW: Negative Consequences for the West: The conflict will increase the Ukrainian refugee flows to Europe: 2
URW: Distrust towards Media: Western media is an instrument of propaganda: 12
URW: Praise of Russia: Russia is a guarantor of peace and prosperity: 3
CC: Downplaying climate change: Ice is not melting: 2
URW: Praise of Russia: Other: 2
URW: Praise of Russia: Praise of Russian military might: 9
CC: Criticism of institutions and authorities: Criticism o

In [8]:
# function that prepares the jsonl file to finetune one agent for classification
import json


def prepare_finetuning_file(df, subnarrative, definition, example):
    system_prompt = ("You are a model trained to classify whether a text contains a specific subnarrative."
                     "You are given a text and you have to predict whether the text contains the subnarrative or not." 
                     "If the text contains the subnarrative, you should predict 1, otherwise you should predict 0. You are ONLY allowed to answer 1 or 0.")
    
    user_prompt = ("You are asked to predict whether the following text contains the subnarrative: {} "
                     "Here is a definition of the subnarrative: {} "
                     "Here are some examples that contain the subnarrative: {} "
                     "Here is the text that you have to classify: {} "
                     "Answer 1 if the text contains the subnarrative {}, otherwise answer 0.")
    
    lines = []

    for index, row in df.iterrows():
        text = row['text']
        label = row['label']
        
        line = {
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt.format(subnarrative, definition, example, text, subnarrative)},
                {"role": "assistant", "content": str(label)}
            ]
        }
        lines.append(line)

    with open(f'data/EN/finetuning/{subnarrative}.jsonl', 'w') as f:
        for line in lines:
            json_line = json.dumps(line)
            f.write(json_line + '\n')


In [9]:

test_df = datasets['CC: Criticism of climate movement: Ad hominem attacks on key activists']
subnarrative = 'CC: Criticism of climate movement: Ad hominem attacks on key activists'
definition = 'Statements attacking the reputation of key figures (such as scientists, activists, politicians or public figures).'
example = 'Greta Thunberg has not worked half a day of her life, but is making claims why I need to change my diet to plant-based.'
prepare_finetuning_file(test_df, subnarrative, definition, example=example)