# Util functions

In [75]:
import pandas as pd
import json
import dotenv
import os


dotenv.load_dotenv()

True

In [6]:
# method to extract the list of all unique subnarratives
def get_subnarratives_list(file):
    """
    Build the flat list of subnarrative labels found in a taxonomy file.

    The file is parsed as JSON and walked as a two-level mapping whose
    innermost values are lists of subnarrative names. Every label is rendered
    as "<main category>: <narrative>: <subnarrative>", and each narrative
    always gets an "Other" subnarrative (added when the file does not list it).

    Args:
        file (str): Path to the JSON taxonomy file.

    Returns:
        list: The global "Other" label followed by one hierarchical label per
            subnarrative, in file order.
    """
    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # default encoding, and release the handle before processing.
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    subnarratives = ["Other"]
    for main_category, subcategories in data.items():
        for subcategory, narratives in subcategories.items():
            # Work on a copy so the parsed taxonomy is not mutated.
            labels = list(narratives)
            if "Other" not in labels:
                labels.append("Other")
            subnarratives.extend(
                f"{main_category}: {subcategory}: {label}" for label in labels
            )

    return subnarratives

def get_narratives_list(file):
    """
    Build the flat list of narrative labels found in a taxonomy file.

    The file is parsed as JSON and walked as a mapping of main categories to
    mappings keyed by narrative name. Every label is rendered as
    "<main category>: <narrative>".

    Args:
        file (str): Path to the JSON taxonomy file.

    Returns:
        list: The global "Other" label followed by one hierarchical label per
            narrative, in file order.
    """
    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # default encoding, and release the handle before processing.
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    narratives = ["Other"]
    for main_category, subcategories in data.items():
        # Only the narrative names (the keys) are needed here.
        narratives.extend(
            f"{main_category}: {subcategory}" for subcategory in subcategories
        )

    return narratives

import os
def read_text(file_id, base_path='data/EN/raw-documents'):
    """
    Return the whole content of a document, decoded as UTF-8.

    Args:
        file_id: File name of the document, joined onto ``base_path``.
        base_path (str): Directory that contains the document.

    Returns:
        str: The text of the file.
    """
    document_path = os.path.join(base_path, f'{file_id}')
    with open(document_path, 'r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents
    
def get_sibling_subnarratives(subnarrative, taxonomy):
    """
    Look up the taxonomy entry that sits under the narrative of a subnarrative label.

    The label is split on ': '; its first two components are used as the
    main-category key and the narrative key into ``taxonomy``.

    Args:
        subnarrative (str): Hierarchical label, "<main category>: <narrative>: <subnarrative>".
        taxonomy (dict): Nested mapping indexed as ``taxonomy[main_category][narrative]``.

    Returns:
        The value stored in ``taxonomy`` for that narrative (the list of its
        subnarratives in the taxonomy files used here).
    """
    label_parts = subnarrative.split(': ')
    category_entry = taxonomy[label_parts[0]]
    return category_entry[label_parts[1]]

def get_narrative_definition(narrative, narratives_list):
    """
    Return the textual definition of a narrative.

    For the special label 'Other' the definition is generated: it enumerates the
    short names of every other entry of ``narratives_list``. Any other label is
    looked up by its short name (the part after the last ':') in the
    'narrative' column of 'data/narratives definition.csv', and the matching
    'definition' cell is returned.

    Args:
        narrative (str): Narrative label, hierarchical or short.
        narratives_list (list): All narrative labels; only used for 'Other'.

    Returns:
        str: The definition text.

    Raises:
        IndexError: If no CSV row matches the short name.
    """
    if narrative != 'Other':
        # The CSV is re-read on every call.
        definitions_table = pd.read_csv('data/narratives definition.csv')
        wanted = narrative.split(':')[-1].strip()
        matching_rows = definitions_table[definitions_table['narrative'] == wanted]
        return matching_rows['definition'].values[0]

    other_topics = [
        get_narrative_short_name(candidate)
        for candidate in narratives_list
        if candidate != 'Other'
    ]
    return 'Statements that are NOT related to anyone of these topics : {}'.format(', '.join(other_topics))

def get_narrative_examples(narrative):
    """
    Return the example text stored for a narrative, or None for 'Other'.

    The narrative is looked up by its short name (the part after the last ':')
    in the 'narrative' column of 'data/narratives definition.csv'; the matching
    'example' cell is returned.

    Args:
        narrative (str): Narrative label, hierarchical or short.

    Returns:
        The 'example' cell of the first matching row, or None when
        ``narrative`` is 'Other'.

    Raises:
        IndexError: If no CSV row matches the short name.
    """
    if narrative != 'Other':
        definitions_table = pd.read_csv('data/narratives definition.csv')
        wanted = narrative.split(':')[-1].strip()
        matching_rows = definitions_table[definitions_table['narrative'] == wanted]
        return matching_rows['example'].values[0]
    return None

def get_narrative_short_name(narrative):
    """Return the part of a narrative label after its last ':', stripped of surrounding whitespace."""
    _, _, last_segment = narrative.rpartition(':')
    return last_segment.strip()

def get_subnarrative_definition(subnarrative, narratives_list, taxonomy):
    """
    Return the textual definition of a subnarrative.

    Three cases are handled:
      * the global label 'Other': a generated sentence listing every entry of
        ``narratives_list`` except 'Other';
      * a label whose last component is 'Other' ("<...>: <narrative>: Other"):
        a generated sentence built from the parent narrative's definition and
        the sibling subnarratives found in ``taxonomy``;
      * any other label: the 'definition' cell of the row of
        'data/subnarrative definitions.csv' whose 'subnarrative' column equals
        the label's short name (the part after the last ':').

    Args:
        subnarrative (str): Subnarrative label, usually hierarchical.
        narratives_list (list): All narrative labels.
        taxonomy (dict): Nested taxonomy, used to list sibling subnarratives.

    Returns:
        str: The definition text.

    Raises:
        IndexError: If a CSV lookup finds no matching row.
    """
    if subnarrative == 'Other':
        return 'Statements that are NOT related to anyone of these narratives : {}'.format(
            ', '.join([narrative for narrative in narratives_list if narrative != 'Other'])
        )

    short_name = subnarrative.split(':')[-1].strip()
    if short_name == 'Other':
        # The parent narrative's definition is only needed for this branch, so it
        # is resolved here rather than for every subnarrative: a plain
        # subnarrative lookup no longer reads the narratives CSV or fails
        # because the parent narrative has no definition row.
        narrative = subnarrative.split(':')[-2].strip()
        narrative_definition = get_narrative_definition(narrative, narratives_list)
        return 'Statement that are related to the narrative "{}", defined as {} but are not related to anyone of these subnarratives : {}'.format(get_narrative_short_name(narrative), narrative_definition ,get_sibling_subnarratives(subnarrative, taxonomy))

    subnarrative_definitions = pd.read_csv('data/subnarrative definitions.csv')
    return subnarrative_definitions[subnarrative_definitions['subnarrative'] == short_name]['definition'].values[0]


def get_subnarrative_examples(subnarrative):
    """
    Return the examples stored for a subnarrative, or None for any 'Other' label.

    None is returned both for the global 'Other' label and for labels whose last
    component is 'Other'. Any other label is looked up by its short name (the
    part after the last ':') in the 'subnarrative' column of
    'data/subnarrative definitions.csv'; the matching 'examples' cell is returned.

    Args:
        subnarrative (str): Subnarrative label, usually hierarchical.

    Returns:
        The 'examples' cell of the first matching row, or None.

    Raises:
        IndexError: If no CSV row matches the short name.
    """
    if subnarrative == 'Other':
        return None

    wanted = subnarrative.split(':')[-1].strip()
    if wanted != 'Other':
        definitions_table = pd.read_csv('data/subnarrative definitions.csv')
        matching_rows = definitions_table[definitions_table['subnarrative'] == wanted]
        return matching_rows['examples'].values[0]
    return None

def get_subnarrative_short_name(subnarrative):
    """Return the part of a subnarrative label after its last ':', stripped of surrounding whitespace."""
    _, _, last_segment = subnarrative.rpartition(':')
    return last_segment.strip()
        

In [83]:
def build_narratives_with_definitions_and_examples(taxonomy_file='data/taxonomy.json'):
    """
    Map every narrative of the taxonomy to its definition and examples.

    Args:
        taxonomy_file (str): Path to the JSON taxonomy file

    Returns:
        dict: ``{narrative_label: {"definition": ..., "examples": ...}}`` with one
            entry per label returned by ``get_narratives_list`` (this includes
            the global 'Other' label).
    """
    narratives_list = get_narratives_list(taxonomy_file)

    # One entry per label; the definition is resolved before the examples.
    return {
        label: {
            "definition": get_narrative_definition(label, narratives_list),
            "examples": get_narrative_examples(label),
        }
        for label in narratives_list
    }

def build_subnarratives_with_definitions_and_examples(taxonomy_file='data/taxonomy.json'):
    """
    Map every subnarrative of the taxonomy to its definition and examples.

    Args:
        taxonomy_file (str): Path to the JSON taxonomy file

    Returns:
        dict: ``{subnarrative_label: {"definition": ..., "examples": ...}}`` with
            one entry per label returned by ``get_subnarratives_list`` (this
            includes the global 'Other' label and each "<narrative>: Other").
    """
    # Get the list of narratives and subnarratives
    narratives_list = get_narratives_list(taxonomy_file)
    subnarratives_list = get_subnarratives_list(taxonomy_file)

    # Load the raw taxonomy too: the "<narrative>: Other" definitions list the
    # sibling subnarratives. Read as UTF-8 explicitly so parsing does not depend
    # on the platform's default encoding.
    with open(taxonomy_file, 'r', encoding='utf-8') as f:
        taxonomy = json.load(f)

    # Create an entry for each subnarrative with its definition and examples
    return {
        subnarrative: {
            "definition": get_subnarrative_definition(subnarrative, narratives_list, taxonomy),
            "examples": get_subnarrative_examples(subnarrative),
        }
        for subnarrative in subnarratives_list
    }

In [96]:
# Build the label -> {"definition", "examples"} lookup tables for narratives and
# subnarratives from the default taxonomy file ('data/taxonomy.json').
narratives = build_narratives_with_definitions_and_examples()
subnarratives = build_subnarratives_with_definitions_and_examples()

# Prompts

In [9]:
# System prompt template for one narrative classifier agent.
# Positional placeholders, in order: narrative label, definition, examples
# (filled with str.format in create_narrative_agents).
narrative_system_prompt = (
    "<instruction>"
    "You are a highly precise binary classification model trained to determine whether a given text explicitly relates to the narrative: '{}'. "
    "This narrative is defined as follows: ```{}```. "
    "Here are clear and representative examples of statements that are related to this narrative: ```{}```. "
    "Your task is to classify the given text strictly based on whether it contains explicit, unambiguous references to this narrative. "
    "If the text directly aligns with the narrative's definition and examples, you MUST respond with '1'. "
    "If there is any uncertainty, indirect reference, or ambiguity, you MUST respond with '0'. "
    "Output only with a single character. Your answer MUST be strictly '1' or '0'—no explanations, no justifications, and no additional text. "
    "If the narrative is only weakly implied or suggested, you MUST classify it as '0'. "
    "</instruction>"
)

# User message template with two positional placeholders: narrative label, text.
# NOTE(review): not referenced by any code visible in this file — confirm whether it is still used.
narrative_user_prompt = (
    "Please classify the following text as related to the narrative '{}' or not. "
    "Text\n"
    "```{}```"
)

# System prompt template for one subnarrative classifier agent.
# Positional placeholders, in order: subnarrative label, definition, examples
# (filled with str.format in create_subnarratives_agents).
subnarrative_system_prompt = (
    "<instruction>"
    "You are a highly precise binary classification model trained to determine whether a given text explicitly relates to the subnarrative: '{}'. "
    "This subnarrative is defined as follows: ```{}```. "
    "Here are clear and representative examples of statements that are related to this subnarrative: ```{}```. "
    "Your task is to classify the given text strictly based on whether it contains explicit, unambiguous references to this subnarrative. "
    "Output only with a single character. If the text directly aligns with the subnarrative's definition and examples, respond with '1'. Otherwise, respond with '0'. "
    "You MUST classify as '0' if the connection is weak, indirect, implicit, or uncertain. "
    "You are STRICTLY FORBIDDEN from providing any explanation, justification, or additional text. "
    "ONLY respond with '1' or '0' based on clear and explicit evidence in the text."
    "</instruction>"
)

# User message template with two positional placeholders: subnarrative label, text.
# NOTE(review): not referenced by any code visible in this file — confirm whether it is still used.
subnarrative_user_prompt = (
    "Please classify the following text as related to the subnarrative '{}' or not. "
    "Text\n"
    "```{}```"
)

# Creating group chats

In [53]:
import autogen

def create_narrative_agents(narratives_dict):
    """
    Create one AssistantAgent per narrative.

    Each agent is named "agent_" followed by the first 6 hex characters of the MD5
    of its narrative label. Its system message is ``narrative_system_prompt``
    filled with the label, definition and examples from ``narratives_dict``.

    Args:
        narratives_dict (dict): ``{narrative_label: {"definition": ..., "examples": ...}}``.

    Returns:
        dict: ``{agent_name: {"agent": AssistantAgent, "narrative": narrative_label}}``.
    """
    import hashlib

    registry = {}
    narratives_list = get_narratives_list('data/taxonomy.json')

    # A single configuration object is shared by every agent.
    model_entry = {
        "model": "gpt-4o",
        "api_key": os.environ.get("OPENAI_API_KEY")
    }
    llm_config = {"config_list": [model_entry], 'temperature': 0}

    description_template = (
        "I am a classification model trained to do classify whether a given text is related to the following narrative: {}. "
        "I will be looking for {}"
    )

    for narrative, data in narratives_dict.items():
        digest = hashlib.md5(narrative.encode()).hexdigest()
        key = f"agent_{digest[:6]}"

        system_message = narrative_system_prompt.format(narrative, data['definition'], data['examples'])
        agent = autogen.AssistantAgent(
            name=key,
            system_message=system_message,
            llm_config=llm_config
        )

        # The description is rebuilt from the definitions CSV rather than taken from `data`.
        short_name = get_narrative_short_name(narrative)
        definition = get_narrative_definition(narrative, narratives_list)
        agent.description = description_template.format(short_name, definition)

        registry[key] = {"agent": agent, "narrative": narrative}

    return registry
    
def create_narratives_group_chat(narrative_agents, max_round=6):
    """
    Assemble the group chat in which the narrative classifier agents take part.

    Args:
        narrative_agents (dict): ``{agent_name: {"agent": ..., "narrative": ...}}``
            as returned by ``create_narrative_agents``.
        max_round (int): Value passed as ``max_round`` to the group chat.
            Defaults to 6, the value that used to be hard-coded.

    Returns:
        tuple: ``(group_chat, manager, user_proxy_agent)``.
    """
    narratives_user_proxy_agent = autogen.UserProxyAgent(
        name="user",
        code_execution_config=False,
        llm_config={
            "config_list": [
                {
                    "model": "gpt-4o-mini",
                    "api_key": os.environ.get("OPENAI_API_KEY")
                }
            ]
        },
        human_input_mode='NEVER'
    )

    # Extract just the agent objects from the dictionary values
    narrative_agent_objects = [agent_data["agent"] for agent_data in narrative_agents.values()]

    # The map is handed over with speaker_transitions_type="disallowed", so it
    # lists the hand-offs that are forbidden: classifier -> user proxy.
    disallowed_transitions = {
        agent: [narratives_user_proxy_agent] for agent in narrative_agent_objects
    }

    narratives_group_chat = autogen.GroupChat(
        agents=[narratives_user_proxy_agent] + narrative_agent_objects,
        messages=[],
        max_round=max_round,
        send_introductions=True,
        allowed_or_disallowed_speaker_transitions=disallowed_transitions,
        speaker_transitions_type="disallowed",
    )

    narratives_manager = autogen.GroupChatManager(
        groupchat=narratives_group_chat,
        llm_config={
            "config_list": [
                {
                    "model": "gpt-4o",
                    "api_key": os.environ.get("OPENAI_API_KEY")
                }
            ]
        },
    )

    return narratives_group_chat, narratives_manager, narratives_user_proxy_agent


In [129]:
def create_subnarratives_agents(subnarratives_dict):
    """
    Create one AssistantAgent per subnarrative.

    Each agent is named "agent_" followed by the first 6 hex characters of the MD5
    of its subnarrative label. Its system message is ``subnarrative_system_prompt``
    filled with the label, definition and examples from ``subnarratives_dict``.

    Args:
        subnarratives_dict (dict): ``{subnarrative_label: {"definition": ..., "examples": ...}}``.

    Returns:
        dict: ``{agent_name: {"agent": AssistantAgent, "subnarrative": subnarrative_label}}``.
    """
    import hashlib

    registry = {}
    narratives_list = get_narratives_list('data/taxonomy.json')
    with open('data/taxonomy.json', 'r') as taxonomy_handle:
        taxonomy = json.load(taxonomy_handle)

    # A single configuration object is shared by every agent.
    model_entry = {
        "model": "gpt-4o",
        "api_key": os.environ.get("OPENAI_API_KEY")
    }
    llm_config = {"config_list": [model_entry], 'temperature': 0}

    description_template = (
        "I am a classification model trained to do classify whether a given text is related to the following subnarrative: {}. "
        "I will be looking for {}"
    )

    for subnarrative, data in subnarratives_dict.items():
        digest = hashlib.md5(subnarrative.encode()).hexdigest()
        key = f"agent_{digest[:6]}"

        system_message = subnarrative_system_prompt.format(subnarrative, data['definition'], data['examples'])
        agent = autogen.AssistantAgent(
            name=key,
            system_message=system_message,
            llm_config=llm_config
        )

        # The description is rebuilt from the definition helpers rather than taken from `data`.
        short_name = get_subnarrative_short_name(subnarrative)
        definition = get_subnarrative_definition(subnarrative, narratives_list, taxonomy)
        agent.description = description_template.format(short_name, definition)

        registry[key] = {"agent": agent, "subnarrative": subnarrative}

    return registry

def create_subnarratives_group_chat(subnarrative_agents):
    """
    Assemble a group chat for a set of subnarrative classifier agents.

    Args:
        subnarrative_agents (list | dict): Agent entries of the form
            ``{"agent": ..., "subnarrative": ...}``. Either a list of entries
            (one value of ``group_subnarrative_agents_by_narrative``) or a dict
            keyed by agent name (the return value of
            ``create_subnarratives_agents``).

    Returns:
        tuple: ``(group_chat, manager, user_proxy_agent)``.
    """
    subnarratives_user_proxy_agent = autogen.UserProxyAgent(
        name="user",
        code_execution_config=False,
        llm_config={
            "config_list": [
                {
                    "model": "gpt-4o-mini",
                    "api_key": os.environ.get("OPENAI_API_KEY")
                }
            ]
        },
        human_input_mode='NEVER'
    )

    # Iterating a dict yields its keys (agent names), which used to break the
    # entry["agent"] lookup below; use the values in that case.
    if isinstance(subnarrative_agents, dict):
        entries = list(subnarrative_agents.values())
    else:
        entries = list(subnarrative_agents)
    subnarrative_agents_objects = [agent_data["agent"] for agent_data in entries]

    # The map is handed over with speaker_transitions_type='disallowed', so it
    # lists the hand-offs that are forbidden: classifier -> user proxy.
    disallowed_transitions = {
        agent: [subnarratives_user_proxy_agent] for agent in subnarrative_agents_objects
    }

    subnarrative_group_chat = autogen.GroupChat(
        agents=[subnarratives_user_proxy_agent] + subnarrative_agents_objects,
        messages=[],
        # One round per classifier plus one for the initial user message.
        max_round=len(entries) + 1,
        send_introductions=True,
        allowed_or_disallowed_speaker_transitions=disallowed_transitions,
        speaker_transitions_type='disallowed',
    )

    subnarrative_manager = autogen.GroupChatManager(
        groupchat=subnarrative_group_chat,
        llm_config={
            "config_list": [
                {
                    "model": "gpt-4o",
                    "api_key": os.environ.get("OPENAI_API_KEY")
                }
            ]
        },
        system_message="You are a group chat manager. You are asked to give the classification task to the agents that look relevant to the topic"
    )

    return subnarrative_group_chat, subnarrative_manager, subnarratives_user_proxy_agent


In [130]:
def group_subnarrative_agents_by_narrative(subnarrative_agents):
    """
    Bucket subnarrative agent entries under their parent narrative label.

    The parent label is made of the first two ': '-separated components of the
    entry's subnarrative label. The entry for the global 'Other' label is left out.

    Args:
        subnarrative_agents (dict): ``{agent_name: {"agent": ..., "subnarrative": ...}}``.

    Returns:
        dict: ``{narrative_label: [entry, ...]}``; the entries are the same
            objects as the values of ``subnarrative_agents``.
    """
    buckets = {}

    for entry in subnarrative_agents.values():
        label = entry["subnarrative"]
        if label != 'Other':
            label_parts = label.split(': ')
            parent = label_parts[0] + ': ' + label_parts[1]
            buckets.setdefault(parent, []).append(entry)

    return buckets

In [131]:
def create_groupchat_for_narrative(narrative, grouped_agents):
    """
    Build the subnarrative group chat for one narrative.

    Args:
        narrative (str): Narrative label; must be a key of ``grouped_agents``.
        grouped_agents (dict): ``{narrative_label: [agent entry, ...]}``.

    Returns:
        tuple: Whatever ``create_subnarratives_group_chat`` returns for the
            narrative's agent entries.
    """
    agents_for_narrative = grouped_agents[narrative]
    return create_subnarratives_group_chat(agents_for_narrative)

In [132]:
def reset_agents(agent_dict):
    """Call ``reset()`` on the "agent" object of every entry of ``agent_dict``."""
    for entry in agent_dict.values():
        entry["agent"].reset()

# Narrative extraction

In [154]:
def extract_recognized_narratives(chat_history, narrative_agents):
    """
    Collect the narratives whose classifier agent answered '1'.

    Messages sent by 'user' or 'chat_manager' are ignored. The reply is compared
    after stripping surrounding whitespace, so answers such as "1\\n" count.
    Each narrative is reported once even if its agent answered several times.

    Args:
        chat_history (list): Messages as dicts with at least a 'name' key and,
            normally, a 'content' key.
        narrative_agents (dict): ``{agent_name: {"narrative": label, ...}}``.

    Returns:
        list: Recognized narrative labels in order of first appearance;
            ``['Other']`` when none was recognized. 'Other' is dropped when at
            least one real narrative was recognized as well.

    Raises:
        KeyError: If a message comes from a name that is not in ``narrative_agents``.
    """
    recognized_narratives = []
    for message in chat_history:
        sender = message['name']
        if sender in ('user', 'chat_manager'):
            continue
        narrative = narrative_agents[sender]['narrative']

        # Content may be missing or None; only strings are normalised.
        content = message.get('content')
        verdict = content.strip() if isinstance(content, str) else content
        if verdict == '1' and narrative not in recognized_narratives:
            recognized_narratives.append(narrative)

    if not recognized_narratives:
        recognized_narratives.append('Other')

    if len(recognized_narratives) > 1 and 'Other' in recognized_narratives:
        recognized_narratives.remove('Other')
    return recognized_narratives

def extract_recognized_subnarratives(chat_history, subnarrative_agents):
    """
    Collect the subnarratives whose classifier agent answered '1'.

    Messages sent by 'user' or 'chat_manager' are ignored. The reply is compared
    after stripping surrounding whitespace, and each subnarrative is reported
    once even if its agent answered several times.

    Args:
        chat_history (list): Messages as dicts with at least a 'name' key and,
            normally, a 'content' key.
        subnarrative_agents (dict): ``{agent_name: {"subnarrative": label, ...}}``.

    Returns:
        list: Recognized subnarrative labels in order of first appearance. When
            none was recognized the result is ``["<narrative>: Other"]``, where
            the narrative is taken from the last replying agent; if no agent
            tied to a narrative replied at all, the result is ``['Other']``.

    Raises:
        KeyError: If a message comes from a name that is not in ``subnarrative_agents``.
    """
    recognized_subnarratives = []
    # Stays None when no narrative-scoped agent spoke; previously the name was
    # left unbound in that case and the fallback below raised UnboundLocalError.
    narrative = None
    for message in chat_history:
        sender = message['name']
        if sender in ('user', 'chat_manager'):
            continue
        subnarrative = subnarrative_agents[sender]['subnarrative']
        if subnarrative != 'Other':
            label_parts = subnarrative.split(': ')
            narrative = label_parts[0] + ': ' + label_parts[1]

        # Content may be missing or None; only strings are normalised.
        content = message.get('content')
        verdict = content.strip() if isinstance(content, str) else content
        if verdict == '1' and subnarrative not in recognized_subnarratives:
            recognized_subnarratives.append(subnarrative)

    if not recognized_subnarratives:
        fallback = 'Other' if narrative is None else f"{narrative}: Other"
        recognized_subnarratives.append(fallback)

    if len(recognized_subnarratives) > 1 and 'Other' in recognized_subnarratives:
        recognized_subnarratives.remove('Other')
    return recognized_subnarratives

In [152]:
def extract_subnarratives_for_one_narrative(narrative, text, grouped_agents, subnarrative_agents):
    """
    Run the subnarrative classifiers of one narrative on a text.

    A new group chat is created for the narrative's agents on every call; the
    user proxy then starts the chat with the text to classify.

    Args:
        narrative (str): Narrative label; must be a key of ``grouped_agents``.
        text (str): Text to classify.
        grouped_agents (dict): ``{narrative_label: [agent entry, ...]}``.
        subnarrative_agents (dict): ``{agent_name: {"subnarrative": label, ...}}``,
            used to map replies back to subnarrative labels.

    Returns:
        list: Result of ``extract_recognized_subnarratives`` on the chat history.
    """
    # Only the manager and the user proxy are needed; the group chat object is unused here.
    chat_parts = create_groupchat_for_narrative(narrative, grouped_agents)
    manager = chat_parts[1]
    user_proxy_agent = chat_parts[2]

    request = "Here is the text that needs to be classified: \n```{}```\nYou are ONLY allowed to reply with '0' or '1'".format(text)
    chat_result = user_proxy_agent.initiate_chat(
        manager,
        message=request,
        summary_method='reflection_with_llm'
    )
    history = chat_result.chat_history
    return extract_recognized_subnarratives(history, subnarrative_agents)

def extract_subnarratives_for_narratives(narratives_list, text, grouped_agents, subnarrative_agents):
    """
    Classify a text against the subnarratives of each recognized narrative.

    Args:
        narratives_list (list): Recognized narrative labels.
        text (str): Text to classify.
        grouped_agents (dict): ``{narrative_label: [agent entry, ...]}``.
        subnarrative_agents (dict): ``{agent_name: {"subnarrative": label, ...}}``.

    Returns:
        list: ``['Other']`` (a flat list) when the only narrative is 'Other';
            otherwise one list of subnarrative labels per narrative, in order.
    """
    only_other = len(narratives_list) == 1 and narratives_list[0] == 'Other'
    if only_other:
        # Nothing to classify further: there are no subnarrative agents for 'Other'.
        return ['Other']

    return [
        extract_subnarratives_for_one_narrative(label, text, grouped_agents, subnarrative_agents)
        for label in narratives_list
    ]

In [143]:
# Instantiate one classifier agent per narrative, then the group chat, its
# manager and the user proxy used to start a classification run.
narrative_agents = create_narrative_agents(narratives)
narratives_group_chat, narratives_manager, narratives_user_proxy_agent = create_narratives_group_chat(narrative_agents)

In [144]:
# Instantiate one classifier agent per subnarrative, keyed by agent name.
subnarrative_agents = create_subnarratives_agents(subnarratives)

In [145]:
# Bucket the subnarrative agents by parent narrative so a per-narrative group chat can be built.
grouped_agents = group_subnarrative_agents_by_narrative(subnarrative_agents)

In [None]:
def classify_text_with_narratives(text, narrative_agents, narratives_manager, narratives_user_proxy_agent):
    """
    Classifies the text using narrative agents.

    After the chat, the narrative agents, the manager's group chat and the user
    proxy are reset so the next text starts from a clean state.

    Args:
        text (str): The text to classify.
        narrative_agents (dict): ``{agent_name: {"agent": ..., "narrative": ...}}``.
        narratives_manager (GroupChatManager): The manager for the group chat.
        narratives_user_proxy_agent (UserProxyAgent): The user proxy agent for the group chat.

    Returns:
        list: List of recognized narratives.
    """
    chat_result = narratives_user_proxy_agent.initiate_chat(
        narratives_manager,
        message="Here is the text that needs to be classified: \n```{}```\n".format(text),
        summary_method='reflection_with_llm'
    )

    try:
        recognized = extract_recognized_narratives(chat_result.chat_history, narrative_agents)
    finally:
        reset_agents(narrative_agents)
        # Reset the chat owned by the manager that was passed in. The previous
        # code reset a module-level `narratives_group_chat`, which is a different
        # object when the caller built its own chat (as classify_all_texts does).
        narratives_manager.groupchat.reset()
        narratives_user_proxy_agent.reset()

    return recognized


In [147]:
def read_text(file_id, base_path='data/EN/raw-documents'):
    """
    Return the whole content of a document, decoded as UTF-8.

    Args:
        file_id: File name of the document, joined onto ``base_path``.
        base_path (str): Directory that contains the document.

    Returns:
        str: The text of the file.
    """
    document_path = os.path.join(base_path, f'{file_id}')
    with open(document_path, 'r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

In [148]:
# Load one document from the English test set as a smoke-test input.
text = read_text('CC_TEST_00000.txt', base_path='testset/EN/subtask-2-documents')

In [149]:
# First stage: ask the narrative agents which narratives the document relates to.
recognized_narratives = classify_text_with_narratives(text, narrative_agents, narratives_manager, narratives_user_proxy_agent)

[33muser[0m (to chat_manager):

Here is the text that needs to be classified: 
```‘Absolute Genius’: How Three Alarmist Billionaires Bankrolled The Fake Climate Catastrophe 

 Do you think that the constant catastrophizing of weather and climate in the mainstream media, politics, and science has just appeared by accident? [emphasis, links added]

Over the last few years, the BBC and the Guardian, [working] as of one mind, decided to float improbable ‘tipping point’ scares under cover of ‘scientists say’, while UN officials concluded that we had two years to save a ‘boiling’ planet and the ubiquitous ‘Jim’ Dale has been given free rein to make it up as he goes along on Talk TV and GB News.

Of course, all this didn’t suddenly happen.

Each of these examples is a testament to an extraordinary corruption of the true scientific process – an “amazing tale” according to political science writer Roger Pielke Jr., “a story of how wealth and power sought to shape climate science in pursuit of

In [155]:
# Second stage: classify the document against the subnarratives of each recognized narrative.
# NOTE: this rebinds the module-level name `subnarratives`, which earlier held the
# definitions table returned by build_subnarratives_with_definitions_and_examples().
subnarratives = extract_subnarratives_for_narratives(recognized_narratives, text, grouped_agents, subnarrative_agents)


[33muser[0m (to chat_manager):

Here is the text that needs to be classified: 
```‘Absolute Genius’: How Three Alarmist Billionaires Bankrolled The Fake Climate Catastrophe 

 Do you think that the constant catastrophizing of weather and climate in the mainstream media, politics, and science has just appeared by accident? [emphasis, links added]

Over the last few years, the BBC and the Guardian, [working] as of one mind, decided to float improbable ‘tipping point’ scares under cover of ‘scientists say’, while UN officials concluded that we had two years to save a ‘boiling’ planet and the ubiquitous ‘Jim’ Dale has been given free rein to make it up as he goes along on Talk TV and GB News.

Of course, all this didn’t suddenly happen.

Each of these examples is a testament to an extraordinary corruption of the true scientific process – an “amazing tale” according to political science writer Roger Pielke Jr., “a story of how wealth and power sought to shape climate science in pursuit of

In [160]:
def classify_text(text, narrative_agents, narrative_manager, narrative_proxy_agent, grouped_agents, subnarrative_agents):
    """
    Run the two-stage classification (narratives, then subnarratives) on a text.

    Args:
        text (str): The text to classify.
        narrative_agents (dict): Dictionary of narrative agents.
        narrative_manager (GroupChatManager): The manager for the group chat.
        narrative_proxy_agent (UserProxyAgent): The user proxy agent for the group chat.
        grouped_agents (dict): Dictionary of grouped subnarrative agents.
        subnarrative_agents (dict): Dictionary of subnarrative agents.

    Returns:
        tuple: ``(recognized_narratives, subnarratives)``. ``subnarratives`` is
            a flat list of labels, except that a single-element result is
            returned as that element itself (the string 'Other' when the only
            narrative is 'Other').
    """
    def _merge_results(per_narrative):
        size = len(per_narrative)
        if size == 0:
            return []
        if size == 1:
            # A lone entry is unwrapped rather than flattened.
            return per_narrative[0]
        merged = []
        for group in per_narrative:
            merged.extend(group)
        return merged

    recognized_narratives = classify_text_with_narratives(
        text, narrative_agents, narrative_manager, narrative_proxy_agent
    )
    per_narrative = extract_subnarratives_for_narratives(
        recognized_narratives, text, grouped_agents, subnarrative_agents
    )
    return recognized_narratives, _merge_results(per_narrative)

In [179]:
def classify_all_texts(folder = 'testset/PT/subtask-2-documents/translated', n = 10):
    """
    Classifies the '.txt' files of a folder, stopping after ``n`` files.

    Files are processed in sorted name order so the subset selected by ``n`` is
    the same on every run (``os.listdir`` returns entries in arbitrary order).

    Args:
        folder (str): Path to the folder containing text files.
        n (int): Maximum number of files to classify; values <= 0 classify nothing.

    Returns:
        DataFrame: One row per file with columns 'id' (file name), 'narratives'
            and 'subnarratives'.
    """
    import pandas as pd

    # List the folder first so a wrong path fails before any agent is created.
    text_files = [name for name in sorted(os.listdir(folder)) if name.endswith('.txt')]
    text_files = text_files[:max(n, 0)]
    total = len(text_files)

    narratives = build_narratives_with_definitions_and_examples()
    subnarratives = build_subnarratives_with_definitions_and_examples()
    narrative_agents = create_narrative_agents(narratives)
    narratives_group_chat, narratives_manager, narratives_user_proxy_agent = create_narratives_group_chat(narrative_agents)
    subnarrative_agents = create_subnarratives_agents(subnarratives)
    grouped_agents = group_subnarrative_agents_by_narrative(subnarrative_agents)

    results = []
    for count, file_name in enumerate(text_files, start=1):
        text = read_text(file_name, base_path=folder)
        # Distinct names so the definitions table above is not overwritten by per-file results.
        recognized_narratives, recognized_subnarratives = classify_text(text, narrative_agents, narratives_manager, narratives_user_proxy_agent, grouped_agents, subnarrative_agents)
        results.append({'id': file_name, 'narratives': recognized_narratives, 'subnarratives': recognized_subnarratives})

        print(f"Processed {count}/{total} files")

    return pd.DataFrame(results)

In [168]:
# Classify up to 100 translated Portuguese test documents.
df = classify_all_texts(folder='testset/PT/subtask-2-documents/translated', n=100)

[33muser[0m (to chat_manager):

Here is the text that needs to be classified: 
```Russian ambassador to Portugal responds to Cravinho and denies Moscow's ambition to destroy the EU

"Russia has never sought to 'destroy the European project', as EU officials have been wrongly convinced," wrote the Russian ambassador to Portugal in a post on the Facebook page of the Russian Embassy in Portugal.

In response to an interview with the Portuguese Minister of Foreign Affairs, the Russian ambassador in Lisbon, Mikhail Kamynin, denied on Thursday that Moscow intends to destroy the European Union and accused European leaders of "wasting a billion euros" to militarize Kiev. Kamynin also accused the West of being responsible for the current situation in Ukraine.

The comment comes in response to statements made by João Gomes Cravinho to the Lusa news agency on Tuesday, who accused Putin of "not only wanting a piece of Ukrainian territory", but also "destroying the European project".

"The histor

In [None]:
def remove_duplicates_from_column(df, column_name):
    """
    Return a copy of ``df`` in which list cells of one column hold no repeated items.

    Cells that are lists are de-duplicated with first occurrences kept in order.
    Every other cell, including the string "Other", is left untouched. The
    input DataFrame is not modified.

    Args:
        df (DataFrame): DataFrame containing the data
        column_name (str): Name of the column to process

    Returns:
        DataFrame: A copy of the input DataFrame with duplicates removed from the specified column
    """
    def clean_cell(cell):
        if isinstance(cell, list):
            # dict keys keep insertion order, which preserves the first occurrence of each item.
            return list(dict.fromkeys(cell))
        return cell

    result = df.copy()
    result[column_name] = result[column_name].apply(clean_cell)
    return result

def export_df_to_txt(df, output_file='classified_texts.txt'):
    """
    Export DataFrame to a text file without headers, with list elements joined by semicolons.

    Rows are written sorted by 'id', one per line, as
    ``<id>\\t<narratives>\\t<subnarratives>``. The input DataFrame is not modified.

    Args:
        df (DataFrame): DataFrame containing classification results, with
            columns 'id', 'narratives' and 'subnarratives'. The label columns
            hold a list of labels, a nested list of labels, or a plain string
            such as "Other".
        output_file (str): Path to the output file
    """
    def _join_labels(labels):
        # Render one cell: strings pass through unchanged; lists are flattened one level and joined.
        if isinstance(labels, str):
            return labels
        flat_labels = []
        for item in labels:
            if isinstance(item, list):
                flat_labels.extend(item)
            else:
                flat_labels.append(item)
        return ";".join(flat_labels)

    # sort_values returns a new DataFrame; the result must be kept, otherwise
    # the rows are written in their original order.
    sorted_df = df.sort_values(by="id").reset_index(drop=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        for _, row in sorted_df.iterrows():
            narratives_str = _join_labels(row['narratives'])
            subnarratives_str = _join_labels(row['subnarratives'])

            # Write the row to file (tab-separated)
            f.write(f"{row['id']}\t{narratives_str}\t{subnarratives_str}\n")

    print(f"DataFrame exported to {output_file}")

In [184]:
# De-duplicate the subnarrative labels of each row, then order the rows by file id.
clean_df = remove_duplicates_from_column(df, 'subnarratives')
clean_df = clean_df.sort_values(by='id').reset_index(drop=True)


In [185]:

# Write the Portuguese predictions as a tab-separated annotation file.
export_df_to_txt(clean_df, output_file='PT-subtask-2-annotation.txt')

DataFrame exported to PT-subtask-2-annotation.txt


In [180]:
# Classify up to 60 translated Russian test documents.
ru_df = classify_all_texts(folder='testset/RU/subtask-2-documents/translated', n=60)

[33muser[0m (to chat_manager):

Here is the text that needs to be classified: 
```"Not his level". Arestovich: Ukraine will not be able to defeat Russia

Ukraine with its current government has no chance to win an armed conflict with Russia. Moscow, unlike Kiev, has brought its army to a new level. Only the rejection of the state ideology of nationalism will help to save itself, said Alexei Arestovich, former advisor to the head of the Office (administration) of the President of Ukraine, * in an interview with journalist Yulia Latynina. "What remains inexplicable is that in two and a half years [since the start of the conflict] the governability and efficiency of the state has declined. The same happened with the army - we started fighting worse than at the beginning of the war. Yes, we have more equipment and shells, but this is only a small compensation for our organizational failures to hold the front line," the politician criticized the country's leadership.

"During this time we

In [186]:
# Same post-processing as for Portuguese: de-duplicate the subnarrative labels,
# order the rows by file id, then write the tab-separated annotation file.
clean_ru_df = remove_duplicates_from_column(ru_df, 'subnarratives')
clean_ru_df = clean_ru_df.sort_values(by='id').reset_index(drop=True)
export_df_to_txt(clean_ru_df, output_file='RU-subtask-2-annotation.txt')

DataFrame exported to RU-subtask-2-annotation.txt


In [79]:
import deepl

# Read the DeepL key from the environment; os.environ.get yields None when DEEPL_API_KEY is unset.
deepl_key = os.environ.get("DEEPL_API_KEY")
# Module-level DeepL client used by translate_files().
translator = deepl.Translator(deepl_key)

In [173]:
def translate_files(source_folder, target_folder, source_lang="PT", target_lang="EN-US"):
    """
    Translate every regular file of a folder with the module-level DeepL ``translator``.

    Each file is read as UTF-8, translated, and written under the same name in
    ``target_folder`` (created if missing). Sub-directories of ``source_folder``
    are skipped. A failure on one file is printed and does not stop the run.

    Args:
        source_folder (str): Path to folder containing source files
        target_folder (str): Path to save translated files
        source_lang (str): Source language code (e.g., "PT" for Portuguese)
        target_lang (str): Target language code (e.g., "EN-US" for English)

    Returns:
        tuple: (success_count, total_files) - Number of successfully translated files and total files processed
    """
    os.makedirs(target_folder, exist_ok=True)

    # Keep regular files only, so a target folder nested inside the source folder is ignored.
    files = []
    for entry in os.listdir(source_folder):
        if os.path.isfile(os.path.join(source_folder, entry)):
            files.append(entry)
    total = len(files)

    print(f"Found {total} files to translate from {source_lang} to {target_lang}.")
    count = 0

    for file_name in files:
        try:
            with open(os.path.join(source_folder, file_name), 'r', encoding='utf-8') as source_handle:
                source_text = source_handle.read()

            translation = translator.translate_text(
                source_text,
                source_lang=source_lang,
                target_lang=target_lang
            )
            translated_text = translation.text

            with open(os.path.join(target_folder, file_name), 'w', encoding='utf-8') as target_handle:
                target_handle.write(translated_text)

            count += 1
            if count % 10 == 0:
                print(f"Processed {count}/{total} files")

        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(f"Error translating {file_name}: {str(e)}")

    print(f"Translation completed. Successfully processed {count}/{total} files.")
    return count, total

In [174]:
# Translate the Russian test documents to English. The target folder is nested
# inside the source folder; translate_files only picks up regular files, so the
# 'translated' directory itself is not treated as an input.
count, total_files = translate_files(
    source_folder='testset/RU/subtask-2-documents',
    target_folder='testset/RU/subtask-2-documents/translated',
    source_lang="RU",
    target_lang="EN-US"
)

Found 60 files to translate from RU to EN-US.
Processed 10/60 files
Processed 20/60 files
Processed 30/60 files
Processed 40/60 files
Processed 50/60 files
Processed 60/60 files
Translation completed. Successfully processed 60/60 files.
