# Loading the data

In [1]:
import pandas as pd
import os
import sys
import numpy as np

# Resolve the project root (the parent of the directory the kernel starts in),
# make it the working directory, and make it importable, so that `src.*`
# imports and relative `data/...` paths used by later cells resolve.
#
# The guard keeps this cell idempotent. `..` is evaluated relative to the
# *current* working directory, which this very cell changes; without the guard
# every re-run would climb one directory higher and break all later cells.
if 'ROOT_DIR' not in globals():
    ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))
    os.chdir(ROOT_DIR)

# Do not stack duplicate entries on sys.path when the cell is executed again.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

In [2]:
from src.data_management.label_parser import parse_json_for_narratives_subnarratives, create_label_mappings

# Inputs: the label taxonomy (JSON) and the phase-0 baseline dataset (Parquet).
taxonomy_path = 'data/taxonomy.json'
df = pd.read_parquet('data/processed/phase0_baseline.parquet')

# Parse the taxonomy and build the label <-> id lookup tables from it.
all_narratives, all_subnarratives = parse_json_for_narratives_subnarratives(taxonomy_path)
label_to_id, id_to_label, narrative_to_subnarrative_ids = create_label_mappings(
    all_narratives, all_subnarratives
)
all_ids = [*id_to_label.keys()]

# Cells of these columns may come back as numpy arrays; turn those into plain
# Python lists and leave every other value untouched.
for list_column in ('narratives', 'subnarratives', 'narrative_ids', 'subnarrative_ids'):
    df[list_column] = df[list_column].apply(
        lambda cell: cell.tolist() if isinstance(cell, np.ndarray) else cell
    )

# Adding the bit vector labels to the data

In [3]:
from src.data_management.preprocessor import binarize_labels

# Per row, join the narrative ids and the subnarrative ids into one id list.
combined_ids = pd.Series(
    [
        narrative_ids + subnarrative_ids
        for narrative_ids, subnarrative_ids in zip(df['narrative_ids'], df['subnarrative_ids'])
    ],
    index=df.index,
)
# Turn every id list into its binarized form over the full id vocabulary.
df['labels'] = combined_ids.apply(lambda ids: binarize_labels(ids, all_ids))
# Preview the first 5 rows.
print(df.head())

                   id                                               text  \
0          BG_670.txt  Опитът на колективния Запад да „обезкърви Руси...   
1  A7_URW_BG_4793.txt  Цончо Ганев, “Възраждане”: Обещали сме на Укра...   
2         BG_3245.txt  Подкрепата за Киев от страна на Запада вече не...   
3      A9_BG_5190.txt  Дмитрий Медведев: НПО-та, спонсорирани от Соро...   
4      A9_BG_3379.txt  Британски дипломат обвини Запада за украинския...   

                                          narratives  \
0  [URW: Blaming the war on others rather than th...   
1                        [URW: Discrediting Ukraine]   
2  [URW: Discrediting the West, Diplomacy, URW: D...   
3  [URW: Discrediting the West, Diplomacy, URW: D...   
4  [URW: Discrediting the West, Diplomacy, URW: P...   

                                       subnarratives language narrative_ids  \
0  [URW: Blaming the war on others rather than th...       BG  [11, 12, 14]   
1  [URW: Discrediting Ukraine: Situation in Ukra

In [4]:
# Write the labelled DataFrame to Parquet (row index omitted) and report
# the destination path.
output_parquet_path = 'data/processed/phase0_baseline_labeled.parquet'
df.to_parquet(path=output_parquet_path, index=False)
print(f"DataFrame exported to: {output_parquet_path}")

DataFrame exported to: data/processed/phase0_baseline_labeled.parquet


In [4]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint whose tokenizer is loaded below.
# Note: despite its name, `model` holds the checkpoint name string, not a model object.
model = 'xlm-roberta-base'
# Load the tokenizer for that checkpoint, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model, use_fast=True)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# XLM-RoBERTa checkpoints accept at most 512 tokens per sequence (special
# tokens included). The earlier max_length=1024 padded/truncated every document
# to 1024 tokens, producing inputs longer than the model's position embeddings
# can index. Cap the length at the tokenizer's own limit, never above 512.
MAX_SEQ_LENGTH = min(tokenizer.model_max_length, 512)

# Tokenize each document on its own: truncate to MAX_SEQ_LENGTH and pad up to
# it, so every row has the same length. return_tensors='pt' makes each row's
# fields PyTorch tensors with a leading batch dimension of size 1.
df['tokenized_text'] = df['text'].apply(
    lambda x: tokenizer(
        x,
        truncation=True,
        padding='max_length',
        max_length=MAX_SEQ_LENGTH,
        return_tensors='pt',
    )
)

# Saving Tokenized Data in a Reloadable Format

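The `tokenized_text` column currently holds one tokenizer output per row, each wrapping PyTorch tensors with a leading batch dimension of size 1. To save this to Parquet in a way that's easy to reload, we'll extract the `input_ids` and `attention_mask`, squeeze away the batch dimension, and convert them to plain Python lists.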

In [7]:
# Output column -> tokenizer field it is extracted from.
list_columns = {
    'input_ids_list': 'input_ids',
    'attention_mask_list': 'attention_mask',
}
for list_column, field in list_columns.items():
    # squeeze() removes the size-1 dimensions of each tensor; tolist() turns
    # the result into plain Python values that can be written to Parquet.
    df[list_column] = df['tokenized_text'].apply(
        lambda encoding, field=field: encoding[field].squeeze().tolist()
    )

# Keep only the columns that are written out; copy so the export frame is
# independent of `df`.
columns_to_save = ['text', 'labels', 'input_ids_list', 'attention_mask_list']
df_to_save = df.loc[:, columns_to_save].copy()

output_parquet_path_tokenized = 'data/processed/phase0_baseline_tokenized_and_labeled.parquet'
df_to_save.to_parquet(path=output_parquet_path_tokenized, index=False)

print(f"DataFrame with tokenized data (as lists) saved to: {output_parquet_path_tokenized}")
print("Preview of the saved DataFrame structure:")
print(df_to_save.head())

DataFrame with tokenized data (as lists) saved to: data/processed/phase0_baseline_tokenized_and_labeled.parquet
Preview of the saved DataFrame structure:
                                                text  \
0  Опитът на колективния Запад да „обезкърви Руси...   
1  Цончо Ганев, “Възраждане”: Обещали сме на Укра...   
2  Подкрепата за Киев от страна на Запада вече не...   
3  Дмитрий Медведев: НПО-та, спонсорирани от Соро...   
4  Британски дипломат обвини Запада за украинския...   

                                              labels  \
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, ...   
1  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...   
2  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...   
3  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...   
4  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...   

                                      input_ids_list  \
0  [0, 1089, 22617, 1669, 29, 47829, 2097, 32275,...   
1  [0, 160480, 108723, 45653, 3407, 4, 52, 2354, ...   
2  [

# Reloading Tokenized Data and Reconstructing Tensors

This cell demonstrates how to load the Parquet file saved above and convert the `input_ids_list`, `attention_mask_list`, and `labels` back into PyTorch tensors.

In [8]:
import pandas as pd
import torch

# Read back the Parquet file written by the tokenization step.
input_parquet_path_tokenized = 'data/processed/phase0_baseline_tokenized_and_labeled.parquet'
loaded_df = pd.read_parquet(input_parquet_path_tokenized)

# Tensor column -> (source column, torch dtype). Token ids and the attention
# mask become integer tensors; the label vector becomes a float tensor.
tensor_specs = {
    'input_ids_pt': ('input_ids_list', torch.long),
    'attention_mask_pt': ('attention_mask_list', torch.long),
    'labels_pt': ('labels', torch.float),
}
for tensor_column, (source_column, tensor_dtype) in tensor_specs.items():
    loaded_df[tensor_column] = loaded_df[source_column].apply(
        lambda values, tensor_dtype=tensor_dtype: torch.tensor(values, dtype=tensor_dtype)
    )

print("DataFrame reloaded and PyTorch tensors reconstructed:")
print(loaded_df[['text', 'labels_pt', 'input_ids_pt', 'attention_mask_pt']].head())

# Inspect the first row's tensors, unless there are no rows at all.
if loaded_df.empty:
    print("\nLoaded DataFrame is empty.")
else:
    first_row = loaded_df.iloc[0]
    sample_input_ids = first_row['input_ids_pt']
    sample_attention_mask = first_row['attention_mask_pt']
    sample_labels = first_row['labels_pt']
    print("\nFirst sample's reconstructed tensors:")
    for caption, sample in (
        ("Input IDs shape:", sample_input_ids),
        ("Attention Mask shape:", sample_attention_mask),
        ("Labels shape:", sample_labels),
    ):
        print(caption, sample.shape, "dtype:", sample.dtype)

DataFrame reloaded and PyTorch tensors reconstructed:
                                                text  \
0  Опитът на колективния Запад да „обезкърви Руси...   
1  Цончо Ганев, “Възраждане”: Обещали сме на Укра...   
2  Подкрепата за Киев от страна на Запада вече не...   
3  Дмитрий Медведев: НПО-та, спонсорирани от Соро...   
4  Британски дипломат обвини Запада за украинския...   

                                           labels_pt  \
0  [tensor(0.), tensor(0.), tensor(0.), tensor(0....   
1  [tensor(0.), tensor(0.), tensor(0.), tensor(0....   
2  [tensor(0.), tensor(0.), tensor(0.), tensor(0....   
3  [tensor(0.), tensor(0.), tensor(0.), tensor(0....   
4  [tensor(0.), tensor(0.), tensor(0.), tensor(0....   

                                        input_ids_pt  \
0  [tensor(0), tensor(1089), tensor(22617), tenso...   
1  [tensor(0), tensor(160480), tensor(108723), te...   
2  [tensor(0), tensor(10405), tensor(24724), tens...   
3  [tensor(0), tensor(154888), tensor(189322), t