# Loading the data

In [1]:
import pandas as pd
import os
import sys
import numpy as np

# Make the parent of the current working directory (the project root) both
# importable and the working directory, so that `src.*` imports and relative
# `data/...` paths used further down resolve.
#
# The guard keeps this cell idempotent: after the first os.chdir() the value of
# os.getcwd() has changed, so an unguarded re-run in the same kernel would climb
# one more directory level and add yet another entry to sys.path.
if 'ROOT_DIR' not in globals():
    ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))
    if ROOT_DIR not in sys.path:
        sys.path.append(ROOT_DIR)
    # Set the current working directory to the project root
    os.chdir(ROOT_DIR)

In [None]:
from src.data_management.label_parser import parse_json_for_narratives_subnarratives, create_label_mappings

# Inputs, relative to the current working directory: the taxonomy JSON and the
# processed phase-0 baseline frame.
taxonomy_path = 'data/taxonomy.json'
df = pd.read_parquet('data/processed/phase0_baseline.parquet')

# Parse the taxonomy file and derive the label mappings from the parsed lists
# (both helpers come from src.data_management.label_parser).
all_narratives, all_subnarratives = parse_json_for_narratives_subnarratives(taxonomy_path)
(
    label_to_id,
    id_to_label,
    narrative_to_subnarrative_ids,
) = create_label_mappings(all_narratives, all_subnarratives)

# In these columns, any cell that is a numpy array becomes a plain Python list;
# every other value is left untouched.
for list_column in ('narratives', 'subnarratives', 'narrative_ids', 'subnarrative_ids'):
    df[list_column] = df[list_column].apply(
        lambda cell: cell.tolist() if isinstance(cell, np.ndarray) else cell
    )

# Adding the bit vector labels to the data

In [3]:
from src.data_management.preprocessor import binarize_labels

# Class set passed to binarize_labels. It is computed once here instead of
# inside the per-row lambda, where the whole `narrative_ids` column was
# exploded and de-duplicated again for every single row.
# NOTE(review): each row's id list below holds narrative AND subnarrative ids,
# but this class set is built from `narrative_ids` only — confirm against
# binarize_labels that subnarrative ids are not silently dropped or rejected.
label_classes = df['narrative_ids'].explode().unique()

# concat the narratives and subnarratives
df['labels'] = df.apply(lambda row: row['narrative_ids'] + row['subnarrative_ids'], axis=1)
# binarize the labels
df['labels'] = df['labels'].apply(lambda x: binarize_labels(x, label_classes))
# show the first 5 rows
print(df.head())



                   id                                               text  \
0          BG_670.txt  Опитът на колективния Запад да „обезкърви Руси...   
1  A7_URW_BG_4793.txt  Цончо Ганев, “Възраждане”: Обещали сме на Укра...   
2         BG_3245.txt  Подкрепата за Киев от страна на Запада вече не...   
3      A9_BG_5190.txt  Дмитрий Медведев: НПО-та, спонсорирани от Соро...   
4      A9_BG_3379.txt  Британски дипломат обвини Запада за украинския...   

                                          narratives  \
0  [URW: Blaming the war on others rather than th...   
1                        [URW: Discrediting Ukraine]   
2  [URW: Discrediting the West, Diplomacy, URW: D...   
3  [URW: Discrediting the West, Diplomacy, URW: D...   
4  [URW: Discrediting the West, Diplomacy, URW: P...   

                                       subnarratives language narrative_ids  \
0  [URW: Blaming the war on others rather than th...       BG  [11, 12, 14]   
1  [URW: Discrediting Ukraine: Situation in Ukra