In [13]:
import os


def read_text(file_id, base_path='data/EN/raw-documents'):
    """
    Return the full contents of one raw document as a string.

    Args:
        file_id: Name of the document file inside `base_path`.
        base_path (str): Directory that holds the raw documents.

    Returns:
        str: The file contents, decoded as UTF-8.
    """
    document_path = os.path.join(base_path, f'{file_id}')
    with open(document_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

In [14]:
import json
import pandas as pd
# method to prepare a label dataset for training
def prepare_annotations_df_for_subnarrative(file, subnarrative):
    """
    Build a binary-labelled text dataset for a single subnarrative.

    Args:
        file: Path to a header-less, tab-separated annotation file with three
            columns: file id, ';'-separated narratives, ';'-separated subnarratives.
        subnarrative (str): The label to test each row for.

    Returns:
        pandas.DataFrame: Columns `text` (document contents loaded through
        `read_text`) and `label` (1 when `subnarrative` is among the row's
        subnarratives, otherwise 0).
    """
    annotations = pd.read_csv(file, sep='\t', header=None)
    annotations.columns = ['file_id', 'narratives', 'subnarratives']

    # Both label columns hold ';'-joined strings: split them, then drop duplicates.
    for column in ('narratives', 'subnarratives'):
        annotations[column] = annotations[column].apply(
            lambda raw: list(set(raw.split(';')))
        )

    annotations['label'] = annotations['subnarratives'].apply(
        lambda labels: 1 if subnarrative in labels else 0
    )
    annotations['text'] = annotations['file_id'].apply(read_text)

    return annotations[['text', 'label']]


# method to extract the list of all unique subnarratives
def get_subnarratives_list(file):
    """
    Build the flat list of fully-qualified subnarrative labels from a taxonomy file.

    The JSON file is read as a two-level mapping
    ``{category: {narrative: [subnarrative, ...]}}``. Every narrative
    contributes one label per subnarrative, plus an "Other" label when the
    narrative does not already list one.

    Args:
        file (str): Path to the taxonomy JSON file.

    Returns:
        list: ``"Other"`` followed by labels of the form
        ``"<category>: <narrative>: <subnarrative>"``, in file order.
    """
    # Read explicitly as UTF-8: the taxonomy contains non-ASCII characters
    # that a non-UTF-8 locale default would mis-decode.
    with open(file, 'r', encoding='utf-8') as f:
        taxonomy = json.load(f)

    subnarratives = ["Other"]
    for main_category, subcategories in taxonomy.items():
        for subcategory, narratives in subcategories.items():
            # Add the catch-all entry on a copy so the loaded data is not mutated.
            names = narratives if "Other" in narratives else [*narratives, "Other"]
            subnarratives.extend(
                f"{main_category}: {subcategory}: {name}" for name in names
            )

    return subnarratives

# Parse the taxonomy once, keep the result for later cells, then show it.
subnarratives_list = get_subnarratives_list('data/taxonomy.json')
print(subnarratives_list)
    

['Other', 'URW: Blaming the war on others rather than the invader: Ukraine is the aggressor', 'URW: Blaming the war on others rather than the invader: The West are the aggressors', 'URW: Blaming the war on others rather than the invader: Other', 'URW: Discrediting Ukraine: Rewriting Ukraine’s history', 'URW: Discrediting Ukraine: Discrediting Ukrainian nation and society', 'URW: Discrediting Ukraine: Discrediting Ukrainian military', 'URW: Discrediting Ukraine: Discrediting Ukrainian government and officials and policies', 'URW: Discrediting Ukraine: Ukraine is a puppet of the West', 'URW: Discrediting Ukraine: Ukraine is a hub for criminal activities', 'URW: Discrediting Ukraine: Ukraine is associated with nazism', 'URW: Discrediting Ukraine: Situation in Ukraine is hopeless', 'URW: Discrediting Ukraine: Other', 'URW: Russia is the Victim: The West is russophobic', 'URW: Russia is the Victim: Russia actions in Ukraine are only self-defence', 'URW: Russia is the Victim: UA is anti-RU e

In [43]:
def prepare_sampled_annotations_df(file, narrative, subnarrative, size=50, min_positives=4):
    """
    Build a small labelled sample that favours rows from one narrative.

    Rows annotated with `narrative` are preferred; when that subset is too
    small, rows from the rest of the table with the same label fill the gap.
    Every random draw uses `random_state=42`.

    Args:
        file: Path to a header-less, tab-separated annotation file with three
            columns: file id, ';'-separated narratives, ';'-separated subnarratives.
        narrative (str): Narrative whose rows are sampled first.
        subnarrative (str): Label that defines a positive row.
        size (int): Upper bound on the number of returned rows.
        min_positives (int): Number of positive rows to aim for.

    Returns:
        pandas.DataFrame: Columns `text`, `label` and `narratives`, positives
        first, with a fresh 0..n-1 index. It holds fewer than `size` rows when
        the table does not contain enough rows to fill the quotas.
    """
    table = pd.read_csv(file, sep='\t', header=None)
    table.columns = ['file_id', 'narratives', 'subnarratives']

    # Turn the ';'-joined label strings into de-duplicated lists.
    for column in ('narratives', 'subnarratives'):
        table[column] = table[column].apply(lambda raw: list(set(raw.split(';'))))

    table['label'] = table['subnarratives'].apply(
        lambda labels: 1 if subnarrative in labels else 0
    )
    table['text'] = table['file_id'].apply(read_text)
    table = table[['text', 'label', 'narratives']]

    def _draw(primary_pool, label_value, quota):
        # Take `quota` rows from `primary_pool`; when it is too small, keep all
        # of it and top up with same-label rows from the rest of the table.
        if len(primary_pool) >= quota:
            return primary_pool.sample(n=quota, random_state=42)
        backup_pool = table[
            (table['label'] == label_value) & (~table.index.isin(primary_pool.index))
        ]
        shortfall = quota - len(primary_pool)
        if len(backup_pool) >= shortfall:
            backup_pool = backup_pool.sample(n=shortfall, random_state=42)
        return pd.concat([primary_pool, backup_pool])

    in_narrative = table[table['narratives'].apply(lambda names: narrative in names)]

    # Positives first, then fill the remaining budget with negatives.
    positives = _draw(in_narrative[in_narrative['label'] == 1], 1, min_positives)
    negatives_quota = max(size - len(positives), 0)
    negatives = _draw(in_narrative[in_narrative['label'] == 0], 0, negatives_quota)

    sampled = pd.concat([positives, negatives])
    if len(sampled) > size:
        # Only reachable when more than `size` positives were selected.
        sampled = sampled.sample(n=size, random_state=42)

    return sampled.reset_index(drop=True)

# Draw a 50-row sample for one climate-change subnarrative, aiming for 3 positives.
sample_df = prepare_sampled_annotations_df(
    'data/EN/subtask-2-annotations.txt',
    'CC: Controversy about green technologies',
    'CC: Controversy about green technologies: Nuclear energy is not climate friendly',
    size=50,
    min_positives=3,
)

In [30]:
import pandas as pd

def load_subnarrative_details(file_path):
    """
    Load per-subnarrative descriptions from a CSV file.

    The CSV is decoded as 'utf-8-sig' and must provide the columns
    `subnarrative`, `definition`, `examples` and `instruction for annotators`.

    Args:
        file_path: Path to the CSV file.

    Returns:
        dict: Maps each `subnarrative` value to a dict with the keys
        'definition', 'examples' and 'instructions'. When a subnarrative
        appears on several rows, the last row wins.
    """
    details = pd.read_csv(file_path, encoding='utf-8-sig')
    return {
        record['subnarrative']: {
            'definition': record['definition'],
            'examples': record['examples'],
            'instructions': record['instruction for annotators'],
        }
        for _, record in details.iterrows()
    }


# Map each subnarrative name to its definition, examples and annotator instructions from the CSV.
subnarratives = load_subnarrative_details('data/subnarrative definitions.csv')

In [37]:
import json

def build_subnarratives_dfs(subnarrative_list, subnarratives_infos):
    """
    Assemble, for every subnarrative label, its training dataframe and prompt metadata.

    Reads 'data/taxonomy.json', 'data/narratives definition.csv' and
    'data/EN/subtask-2-annotations.txt' from fixed relative paths.

    Args:
        subnarrative_list (list): Labels of the form
            ``"<category>: <narrative>: <subnarrative>"``, or the bare ``"Other"``.
        subnarratives_infos (dict): Maps a subnarrative's short name (the last
            ': '-separated part of the label) to a dict with the keys
            'definition', 'examples' and 'instructions'.

    Returns:
        dict: Maps each label (with '/' replaced by a space) to a dict with
        the keys 'df', 'short_name', 'definition', 'examples' and 'instructions'.

    Raises:
        KeyError: If a non-"Other" short name is missing from
            `subnarratives_infos`, or a category/narrative is missing from the
            taxonomy.
    """
    annotations_path = 'data/EN/subtask-2-annotations.txt'

    # Context manager closes the taxonomy file; UTF-8 is explicit because the
    # taxonomy holds non-ASCII characters.
    with open('data/taxonomy.json', encoding='utf-8') as taxonomy_file:
        taxonomy = json.load(taxonomy_file)

    narratives = pd.read_csv('data/narratives definition.csv', encoding='utf-8')
    narratives = list(narratives['narrative'])

    subnarratives_dfs = {}
    for subnarrative in subnarrative_list:
        # NOTE: each call re-reads the annotation file and every document text.
        df = prepare_annotations_df_for_subnarrative(annotations_path, subnarrative)

        # The catch-all labels have no curated examples or instructions.
        examples = None
        instructions = None

        if subnarrative == 'Other':
            short_name = 'Other'
            definition = 'Anything that is not related to any of the narratives in the taxonomy : {}'.format(narratives)
        else:
            parts = subnarrative.split(': ')
            category = parts[0]
            narrative = parts[1]
            sibling_subnarratives = taxonomy[category][narrative]
            print('Analysing subnarrative:', subnarrative)
            print('Siblings: ', sibling_subnarratives)

            short_name = parts[-1]

            if short_name == 'Other':
                definition = 'Anything that is not related to any of the subnarratives belonging to the narrative: {}. Here are the subnarratives: {}'.format(narrative, sibling_subnarratives)
            else:
                info = subnarratives_infos[short_name]
                definition = info['definition']
                examples = info['examples']
                instructions = info['instructions']

        # '/' is replaced so the label can later be used as a file name.
        short_name = short_name.replace('/', ' ')
        subnarrative = subnarrative.replace('/', ' ')
        subnarratives_dfs[subnarrative] = {
            'df': df,
            'short_name': short_name,
            'definition': definition,
            'examples': examples,
            'instructions': instructions
        }
    return subnarratives_dfs

# Build, for every label in `subnarratives_list`, its labelled dataframe plus prompt metadata.
subnarratives_dfs = build_subnarratives_dfs(subnarratives_list, subnarratives)



Analysing subnarrative: URW: Blaming the war on others rather than the invader: Ukraine is the aggressor
Siblings:  ['Ukraine is the aggressor', 'The West are the aggressors']
Analysing subnarrative: URW: Blaming the war on others rather than the invader: The West are the aggressors
Siblings:  ['Ukraine is the aggressor', 'The West are the aggressors']
Analysing subnarrative: URW: Blaming the war on others rather than the invader: Other
Siblings:  ['Ukraine is the aggressor', 'The West are the aggressors']
Analysing subnarrative: URW: Discrediting Ukraine: Rewriting Ukraine’s history
Siblings:  ['Rewriting Ukraine’s history', 'Discrediting Ukrainian nation and society', 'Discrediting Ukrainian military', 'Discrediting Ukrainian government and officials and policies', 'Ukraine is a puppet of the West', 'Ukraine is a hub for criminal activities', 'Ukraine is associated with nazism', 'Situation in Ukraine is hopeless']
Analysing subnarrative: URW: Discrediting Ukraine: Discrediting Ukrain

In [38]:
# function that prepares the jsonl file to finetune one agent for classification
import json


def prepare_finetuning_file(df, subnarrative, definition, example, instructions,
                            output_dir='data/EN/finetuning'):
    """
    Write a chat-format JSONL fine-tuning file for one binary subnarrative classifier.

    Each row of `df` becomes one JSON line holding a system message, a user
    message (prompt built from the subnarrative, its definition, the examples
    and the row's text) and an assistant message containing the label.

    Args:
        df (pandas.DataFrame): Must provide the columns `text` and `label`.
        subnarrative (str): Label name; also used as the output file name.
        definition: Definition inserted into the user prompt.
        example: Examples inserted into the user prompt.
        instructions: Optional extra instructions appended to the user prompt.
            None, NaN and empty values are ignored.
        output_dir (str): Directory that receives `<subnarrative>.jsonl`;
            created when it does not exist.

    Returns:
        None. The file `<output_dir>/<subnarrative>.jsonl` is (over)written.
    """
    import os  # local import so this cell does not rely on another cell's imports

    def _is_missing(value):
        # pandas yields float NaN for empty CSV cells; NaN is truthy, so it
        # has to be detected explicitly (NaN is the only float != itself).
        return value is None or (isinstance(value, float) and value != value)

    # NOTE(review): the adjacent literals are joined without separating spaces
    # ("...subnarrative.You are given..."). Kept byte-identical on purpose, since
    # prompts used at inference time presumably must match the training prompts.
    system_prompt = ("You are a model trained to classify whether a text contains a specific subnarrative."
                     "You are given a text and you have to predict whether the text contains the subnarrative or not." 
                     "If the text contains the subnarrative, you should predict 1, otherwise you should predict 0. You are ONLY allowed to answer 1 or 0.")

    user_prompt = ("You are asked to predict whether the following text contains the subnarrative: {} "
                     "Here is a definition of the subnarrative: {} "
                     "Here are some examples that contain the subnarrative: {} "
                     "Here is the text that you have to classify: {} "
                     "Answer 1 if the text contains the subnarrative {}, otherwise answer 0.")

    # The instructions are appended AFTER formatting, so braces inside them
    # are never mistaken for template placeholders.
    if not _is_missing(instructions) and instructions:
        instructions_suffix = f" {instructions}"
    else:
        instructions_suffix = ""

    lines = []
    for _, row in df.iterrows():
        user_content = user_prompt.format(
            subnarrative, definition, example, row['text'], subnarrative
        ) + instructions_suffix

        lines.append({
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_content},
                {"role": "assistant", "content": str(row['label'])}
            ]
        })

    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f'{subnarrative}.jsonl')
    with open(output_path, 'w', encoding='utf-8') as f:
        for line in lines:
            f.write(json.dumps(line) + '\n')

In [39]:
# Write one fine-tuning JSONL file per entry of `subnarratives_dfs`.
# The loop variables stay bound as notebook globals after the last iteration.
for subnarrative, data in subnarratives_dfs.items():
    df = data['df']
    definition = data['definition']
    examples = data['examples']
    instructions = data['instructions']
    prepare_finetuning_file(df, subnarrative, definition, examples, instructions)

In [48]:
from openai import OpenAI

client = OpenAI()

# Upload the training data for one subnarrative. The context manager closes
# the local file handle as soon as the upload call returns.
with open("data/EN/finetuning/CC: Criticism of climate movement: Climate movement is alarmist.jsonl", "rb") as training_data:
    file = client.files.create(
        file=training_data,
        purpose="fine-tune"
    )

# Start a fine-tuning job on the uploaded file.
job = client.fine_tuning.jobs.create(
    training_file=file.id,
    model='gpt-4o-mini-2024-07-18',
)

In [49]:
# List the fine-tuning jobs visible to this client and print the returned page object.
jobs = client.fine_tuning.jobs.list()
print(jobs)

SyncCursorPage[FineTuningJob](data=[FineTuningJob(id='ftjob-b0Kwcy0m52EgNcR5ejIVsKHe', created_at=1737726804, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size=1, learning_rate_multiplier=1.8, n_epochs=3), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-lx9QrY1DHi9AbhRjJo7F2vID', result_files=[], seed=1022822895, status='validating_files', trained_tokens=None, training_file='file-XHYQxHy6fx9unw4sSNftFH', validation_file=None, estimated_finish=None, integrations=[], method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size=1, learning_rate_multiplier=1.8, n_epochs=3)), type='supervised'), user_provided_suffix=None)], object='list', has_more=False)
