# Running inference on the devset

In [None]:
import pandas as pd
from tqdm.auto import tqdm
import os
import sys


# Make the project root (the parent of the initial working directory) both
# the current working directory and an import root, so that the `src.*`
# imports and the relative model/data paths in later cells resolve.
# The globals() guard keeps this cell idempotent: without it, every re-run
# in the same kernel would climb one more directory level.
if 'ROOT_DIR' not in globals():
    ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))
    os.chdir(ROOT_DIR)
# Avoid stacking duplicate entries on sys.path when the cell is re-run.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

In [None]:
from src.inference.narrative_predictor import NarrativePredictor

# Model weights file and tokenizer name handed to the predictor.
MODEL_PATH = "models/phase0_xlmr_best_model.bin"
TOKENIZER_NAME = "xlm-roberta-base"

# Folder of input articles and the file the predictions are written to
# (both relative to the current working directory).
TEST_ARTICLES_PATH = "devset/EN/subtask-2-documents/"
OUTPUT_FILE = "devset/en_predictions.txt"

# Threshold passed to the predictor's set_threshold().
OPTIMAL_THRESHOLD = 0.87

In [None]:
def load_articles(folder_path):
    """Load every ``.txt`` file in a folder into a DataFrame.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A DataFrame with one row per ``.txt`` file and two columns:
        ``article_id`` (the file name, extension included) and ``text``
        (the whole file decoded as UTF-8). Rows are ordered by file name.
        If the folder holds no ``.txt`` files the frame is empty but still
        has both columns.
    """
    articles = []
    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order (and therefore any file written from it) reproducible.
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith(".txt"):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as f:
                articles.append({'article_id': filename, 'text': f.read()})
    # Explicit columns keep df['text'] valid even when no article was found.
    return pd.DataFrame(articles, columns=['article_id', 'text'])

In [None]:
from src.data_management.label_parser import get_label_mappings

label_to_id, id_to_label, narrative_to_subnarrative_ids = get_label_mappings()

# Invert the narrative -> [sub-narrative ids] mapping so that every
# sub-narrative id points at its parent narrative id. If a sub-narrative id
# is listed under several narratives, the last one iterated wins.
sub_to_narr_id_map = {
    child_id: parent_id
    for parent_id, child_ids in narrative_to_subnarrative_ids.items()
    for child_id in child_ids
}

# NOTE(review): each tuple is (sub-narrative id, narrative id), i.e. child
# first despite the variable name -- confirm this is the order the predictor
# expects.
parent_child_pairs = list(sub_to_narr_id_map.items())

In [None]:
# Bundle the label lookup tables under the keys passed to the predictor.
label_maps = {
    "id2label": id_to_label,
    "label2id": label_to_id,
    "parent_child_pairs": parent_child_pairs,
}

# Build a single predictor instance that is reused for all predictions below
# (presumably this is where the model and tokenizer get loaded).
predictor = NarrativePredictor(MODEL_PATH, TOKENIZER_NAME, label_maps)

# Configure the predictor with the single threshold defined above.
predictor.set_threshold(OPTIMAL_THRESHOLD)

In [None]:
# Read all articles from the input folder and pull their texts out as a
# plain list for the predictor.
print(f"Loading articles from {TEST_ARTICLES_PATH}...")
df_test = load_articles(TEST_ARTICLES_PATH)
texts_to_predict = df_test["text"].tolist()

In [None]:
# Run the predictor over all texts; the result is stored row-by-row, so it
# is expected to hold one dict per article.
predictions = predictor.predict_batch(texts_to_predict)
df_test['prediction'] = predictions

# Turn each prediction's label list into a ';'-separated string; a value
# that is not a list becomes an empty string.
for column in ('narratives', 'subnarratives'):
    df_test[column] = df_test['prediction'].apply(
        lambda pred, key=column: (
            ';'.join(str(label) for label in pred[key])
            if isinstance(pred[key], list)
            else ''
        )
    )

# NOTE(review): this preview's result is discarded unless it is the last
# expression of the cell -- move it to the end if the preview is wanted.
df_test.head()

# Write a header-less, tab-separated file: article id, narratives, subnarratives.
submission_columns = ['article_id', 'narratives', 'subnarratives']
df_test[submission_columns].to_csv(OUTPUT_FILE, sep='\t', index=False, header=False)

# Running inference on the testset

In [None]:
import pandas as pd
from tqdm.auto import tqdm
import os
import sys


# Make the project root (the parent of the initial working directory) both
# the current working directory and an import root, so that the `src.*`
# imports and the relative model/data paths in later cells resolve.
# The globals() guard keeps this cell idempotent: if ROOT_DIR was already
# set earlier in this kernel (or this cell is re-run), changing directory
# again would climb one level above the project root.
if 'ROOT_DIR' not in globals():
    ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))
    os.chdir(ROOT_DIR)
# Avoid stacking duplicate entries on sys.path when the cell is re-run.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

In [None]:
from src.inference.narrative_predictor import NarrativePredictor

# Model weights file and tokenizer name handed to the predictor.
MODEL_PATH = "models/phase0_xlmr_continual_learning_model.bin"
TOKENIZER_NAME = "xlm-roberta-base"

# English article folders (test and dev) and the prediction output file,
# all relative to the current working directory.
TEST_ARTICLES_PATH_EN = "testset/EN/subtask-2-documents/"
DEV_ARTICLES_PATH_EN = "devset/EN/subtask-2-documents/"
OUTPUT_FILE = "testset/en_predictions.txt"

# Separate thresholds passed to the predictor's set_thresholds().
NARRATIVE_THRESHOLD = 0.88
SUBNARRATIVE_THRESHOLD = 0.80

In [None]:
def load_articles(folder_path):
    """Load every ``.txt`` file in a folder into a DataFrame.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A DataFrame with one row per ``.txt`` file and two columns:
        ``article_id`` (the file name, extension included) and ``text``
        (the whole file decoded as UTF-8). Rows are ordered by file name.
        If the folder holds no ``.txt`` files the frame is empty but still
        has both columns.
    """
    articles = []
    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order (and therefore any file written from it) reproducible.
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith(".txt"):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as f:
                articles.append({'article_id': filename, 'text': f.read()})
    # Explicit columns keep df['text'] valid even when no article was found.
    return pd.DataFrame(articles, columns=['article_id', 'text'])

In [None]:
from src.data_management.label_parser import get_label_mappings

label_to_id, id_to_label, narrative_to_subnarrative_ids = get_label_mappings()

# Invert the narrative -> [sub-narrative ids] mapping so that every
# sub-narrative id points at its parent narrative id. If a sub-narrative id
# is listed under several narratives, the last one iterated wins.
sub_to_narr_id_map = {
    child_id: parent_id
    for parent_id, child_ids in narrative_to_subnarrative_ids.items()
    for child_id in child_ids
}

# NOTE(review): each tuple is (sub-narrative id, narrative id), i.e. child
# first despite the variable name -- confirm this is the order the predictor
# expects.
parent_child_pairs = list(sub_to_narr_id_map.items())

In [None]:
# Bundle the label lookup tables under the keys passed to the predictor.
label_maps = {
    "id2label": id_to_label,
    "label2id": label_to_id,
    "parent_child_pairs": parent_child_pairs,
}

# Build a single predictor instance that is reused for all predictions below
# (presumably this is where the model and tokenizer get loaded).
predictor = NarrativePredictor(MODEL_PATH, TOKENIZER_NAME, label_maps)


In [None]:

# --- 3. Set the decision thresholds ---
# Passes two separate values, one configured for narratives and one for
# sub-narratives. NOTE(review): presumably these are the score cut-offs
# applied to each label level -- confirm against
# NarrativePredictor.set_thresholds.
predictor.set_thresholds(NARRATIVE_THRESHOLD, SUBNARRATIVE_THRESHOLD)

In [None]:
# Read all English test articles and pull their texts out as a plain list
# for the predictor.
print(f"Loading articles from {TEST_ARTICLES_PATH_EN}...")
df_test = load_articles(TEST_ARTICLES_PATH_EN)
texts_to_predict = df_test["text"].tolist()

In [None]:
# Run the predictor over all texts; the result is stored row-by-row, so it
# is expected to hold one dict per article.
predictions = predictor.predict_batch(texts_to_predict)
df_test['prediction'] = predictions

# Turn each prediction's label list into a ';'-separated string; a value
# that is not a list becomes an empty string.
for column in ('narratives', 'subnarratives'):
    df_test[column] = df_test['prediction'].apply(
        lambda pred, key=column: (
            ';'.join(str(label) for label in pred[key])
            if isinstance(pred[key], list)
            else ''
        )
    )

# NOTE(review): this preview's result is discarded unless it is the last
# expression of the cell -- move it to the end if the preview is wanted.
df_test.head()

# Write a header-less, tab-separated file: article id, narratives, subnarratives.
submission_columns = ['article_id', 'narratives', 'subnarratives']
df_test[submission_columns].to_csv(OUTPUT_FILE, sep='\t', index=False, header=False)

In [None]:
# Russian article folders (test and dev), relative to the current working
# directory.
TEST_ARTICLES_PATH_RU = "testset/RU/subtask-2-documents/"
DEV_ARTICLES_PATH_RU = "devset/RU/subtask-2-documents/"

In [None]:
# Read all Russian test articles and pull their texts out as a plain list
# for the predictor.
df_test_ru = load_articles(TEST_ARTICLES_PATH_RU)
texts_to_predict_ru = df_test_ru["text"].tolist()

In [None]:
# Run the predictor over the Russian texts; the result is stored row-by-row,
# so it is expected to hold one dict per article.
predictions_ru = predictor.predict_batch(texts_to_predict_ru)
df_test_ru['prediction'] = predictions_ru

# Turn each prediction's label list into a ';'-separated string; a value
# that is not a list becomes an empty string.
for column in ('narratives', 'subnarratives'):
    df_test_ru[column] = df_test_ru['prediction'].apply(
        lambda pred, key=column: (
            ';'.join(str(label) for label in pred[key])
            if isinstance(pred[key], list)
            else ''
        )
    )

# NOTE(review): this preview's result is discarded unless it is the last
# expression of the cell -- move it to the end if the preview is wanted.
df_test_ru.head()

# Write a header-less, tab-separated file: article id, narratives, subnarratives.
submission_columns = ['article_id', 'narratives', 'subnarratives']
df_test_ru[submission_columns].to_csv('testset/ru_predictions.txt', sep='\t', index=False, header=False)