In [2]:
import os


def read_text(file_id, base_path='data/EN/raw-documents'):
    """Return the full UTF-8 text of one raw document.

    Args:
        file_id: Name of the document file inside ``base_path``; it is
            converted to a string before being joined to the directory.
        base_path: Directory that holds the raw documents.

    Returns:
        The entire file content as a single string.
    """
    document_path = os.path.join(base_path, str(file_id))
    with open(document_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

In [3]:
import pandas as pd
# method to prepare a label dataset for training
def prepare_dataset(file, subnarrative):
    """Build a binary-classification dataset for one subnarrative.

    Args:
        file: Path to a headerless, tab-separated annotation file with exactly
            three columns: document file id, ';'-separated narratives and
            ';'-separated subnarratives.
        subnarrative: Label to detect. It must match one of the ';'-separated
            subnarrative entries exactly (no trimming or case folding is done).

    Returns:
        A DataFrame with one row per annotated document and two columns:
        ``text`` (the document content loaded with ``read_text``) and
        ``label`` (1 if the document is annotated with ``subnarrative``,
        otherwise 0).
    """
    df = pd.read_csv(file, sep='\t', header=None)
    df.columns = ['file_id', 'narratives', 'subnarratives']

    # Only the subnarratives column decides the label. The narratives column
    # is not part of the result, so it is not parsed, and duplicates need no
    # removal because a membership test ignores them.
    label = df['subnarratives'].apply(
        lambda cell: int(subnarrative in cell.split(';'))
    )
    text = df['file_id'].apply(read_text)

    return pd.DataFrame({'text': text, 'label': label})

# Show the kernel's working directory; the data paths used in this notebook are relative to it.
print(os.getcwd())

# method to extract the list of all unique subnarratives
def extract_subnarratives(file):
    """Return every distinct subnarrative label found in an annotation file.

    Args:
        file: Path to a headerless, tab-separated annotation file with exactly
            three columns: document file id, ';'-separated narratives and
            ';'-separated subnarratives.

    Returns:
        A list of the unique subnarrative labels in ascending string order.
    """
    df = pd.read_csv(file, sep='\t', header=None)
    df.columns = ['file_id', 'narratives', 'subnarratives']

    unique_labels = set()
    for cell in df['subnarratives']:
        unique_labels.update(cell.split(';'))

    # Set iteration order depends on string hashing and changes between
    # interpreter sessions; sorting makes the result reproducible across runs.
    return sorted(unique_labels)


/home/twoface/PHD-Track


In [8]:
# Load the narrative definitions and preview them. An expression in the middle
# of a cell is not rendered, so this preview is displayed explicitly.
narratives = pd.read_csv('data/narratives definition.csv', sep=',')
display(narratives.head())

# Load the subnarrative definitions; the preview is the cell's last expression.
# NOTE: the name `subnarratives` is also assigned a list of labels by the cell
# that calls extract_subnarratives, so whichever of the two cells ran last
# determines what the name refers to.
subnarratives = pd.read_csv('data/subnarrative definitions.csv', sep=',')
subnarratives.head()

Unnamed: 0,subnarrative,definition,examples,instruction for annotators
0,Ukraine is the aggressor,Statements that shift the responsibility of th...,Ukraine secretly provoked the war because it w...,
1,The West are the aggressors,Statements that shift the responsibility for t...,The real perpetrators were US/EU. They sabotag...,Look for direct or implied statements that men...
2,Rewriting Ukraine’s history,Statements that aim to reestablish history of ...,"Ukraine is not a real nation, it was a fabrica...",
3,Discrediting Ukrainian nation and society,Statements that aggressively undermine the leg...,The traitorous attitude is not a characteristi...,Use this only in case that the subject of the ...
4,Discrediting Ukrainian military,Statements that aim to undermine the capabilit...,Ukraine’s military lacks basic training and eq...,


In [5]:
# Annotation file shared by label extraction and dataset preparation.
annotations_file = 'data/EN/subtask-2-annotations.txt'

subnarratives = extract_subnarratives(annotations_file)

# Prepare one binary dataset per subnarrative.
# NOTE: prepare_dataset re-reads the annotation file and every raw document on
# each call, so this loop costs (number of labels) x (number of documents) reads.
datasets = {}

for subnarrative in subnarratives:
    df = prepare_dataset(annotations_file, subnarrative)
    datasets[subnarrative] = {
        'df': df,
    }

# Print the number of positive examples for each subnarrative.
for subnarrative, item in datasets.items():
    # Computed outside the f-string: reusing the enclosing quote character
    # inside a replacement field is a SyntaxError before Python 3.12.
    positives = item['df']['label'].sum()
    print(f'{subnarrative}: {positives}')

URW: Distrust towards Media: Western media is an instrument of propaganda: 12
CC: Criticism of climate policies: Climate policies are only for profit: 9
URW: Discrediting Ukraine: Ukraine is a puppet of the West: 13
URW: Praise of Russia: Praise of Russian military might: 9
URW: Discrediting Ukraine: Discrediting Ukrainian military: 6
CC: Downplaying climate change: Temperature increase does not have significant impact: 2
URW: Discrediting Ukraine: Discrediting Ukrainian government and officials and policies: 17
URW: Discrediting Ukraine: Ukraine is a hub for criminal activities: 4
CC: Amplifying Climate Fears: Amplifying existing fears of global warming: 5
URW: Speculating war outcomes: Other: 10
CC: Questioning the measurements and science: Data shows no temperature increase: 2
URW: Blaming the war on others rather than the invader: The West are the aggressors: 27
CC: Hidden plots by secret schemes of powerful groups: Climate agenda has hidden motives: 23
URW: Amplifying war-related 

In [6]:
# function that prepares the jsonl file to finetune one agent for classification
import json


def prepare_finetuning_file(df, subnarrative, definition, example,
                            output_dir='data/EN/finetuning'):
    """Write a chat-format JSONL fine-tuning file for one subnarrative classifier.

    Each row of ``df`` becomes one JSON line holding a ``messages`` list with a
    system prompt, a user prompt (subnarrative, definition, examples and the
    document text) and the assistant answer (the row label as a string).

    Args:
        df: DataFrame with a ``text`` column and a ``label`` column.
        subnarrative: Name of the subnarrative; used in the prompt and as the
            output file name.
        definition: Definition of the subnarrative inserted into the prompt.
        example: Example(s) of the subnarrative inserted into the prompt.
        output_dir: Directory that receives ``<subnarrative>.jsonl``. It is
            created if it does not exist.

    Returns:
        None. The file is written to ``output_dir``; any ``/`` in
        ``subnarrative`` is replaced by ``_`` in the file name.
    """
    # Every sentence ends with an explicit space so that adjacent literals do
    # not run together (e.g. "subnarrative.You are given").
    # NOTE: prompts sent at inference time should use this exact wording.
    system_prompt = ("You are a model trained to classify whether a text contains a specific subnarrative. "
                     "You are given a text and you have to predict whether the text contains the subnarrative or not. "
                     "If the text contains the subnarrative, you should predict 1, otherwise you should predict 0. "
                     "You are ONLY allowed to answer 1 or 0.")

    user_prompt = ("You are asked to predict whether the following text contains the subnarrative: {} "
                   "Here is a definition of the subnarrative: {} "
                   "Here are some examples that contain the subnarrative: {} "
                   "Here is the text that you have to classify: {} "
                   "Answer 1 if the text contains the subnarrative {}, otherwise answer 0.")

    records = [
        {
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt.format(subnarrative, definition, example, text, subnarrative)},
                {"role": "assistant", "content": str(label)},
            ]
        }
        for text, label in zip(df['text'], df['label'])
    ]

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # A '/' inside the label would otherwise be read as a subdirectory.
    safe_name = subnarrative.replace('/', '_')
    output_path = os.path.join(output_dir, f'{safe_name}.jsonl')

    with open(output_path, 'w', encoding='utf-8') as f:
        for record in records:
            f.write(json.dumps(record) + '\n')
