# Data Loading and Label Mapping

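This notebook demonstrates how to load the annotation data and map narrative and subnarrative labels to their corresponding IDs using the provided taxonomy and helper functions. Along the way it draws a small graph of two narratives with their subnarratives, and it finishes by saving the loaded annotations to `data/processed/phase0_baseline.parquet`.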

In [None]:
# Make the project root importable (so `src` can be imported) and use it as the
# working directory for every relative path in the notebook.
import sys
import os

# Locate the project root. The default assumption is that the notebook is started
# from a direct child of the root, so the root is the parent directory. If `src` is
# already visible from the current directory (this cell was re-run, or the kernel
# was launched at the root) stay where we are: unconditionally stepping up one
# level on every execution would climb out of the project on the second run.
if os.path.isdir(os.path.join(os.getcwd(), 'src')):
    ROOT_DIR = os.getcwd()
else:
    ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Register the root only once so repeated runs do not pile up duplicate entries
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

# Set the current working directory to the project root
os.chdir(ROOT_DIR)

In [None]:
import os
from src.data_management.label_parser import parse_json_for_narratives_subnarratives, create_label_mappings
from src.data_management.loaders import load_all_annotations_to_df, load_ids_to_df

In [None]:
# Sanity check: show the directory that relative paths such as 'data/...' resolve against
current_dir = os.getcwd()
print(current_dir)

In [None]:
# Taxonomy JSON file, addressed relative to the current working directory
taxonomy_path = os.path.join('data', 'taxonomy.json')

# Read the narrative and subnarrative collections out of the taxonomy file
parsed_taxonomy = parse_json_for_narratives_subnarratives(taxonomy_path)
narratives, subnarratives = parsed_taxonomy

# Build the three lookup structures used by the rest of the notebook
mappings = create_label_mappings(narratives, subnarratives)
label_to_id, id_to_label, narrative_to_subnarrative_ids = mappings

In [None]:
import networkx as nx
import matplotlib.pyplot as plt


def _short_label(label):
    """Return the stripped text after the last ':' in ``label``, or ``label`` unchanged if it has no ':'."""
    return label.split(":")[-1].strip() if ":" in label else label


# Create a directed graph for only two narratives and their subnarratives
G = nx.DiGraph()

# Nodes are keyed by label ID and the shortened text is used for display only.
# Keying nodes by the shortened text would merge distinct labels that happen to
# share the same suffix (e.g. an "Other" entry under each narrative) into a
# single node with several parents.
node_labels = {}

# Select only the first two narratives
selected_narrative_ids = list(narrative_to_subnarrative_ids.keys())[:2]
for narrative_id in selected_narrative_ids:
    G.add_node(narrative_id)
    node_labels[narrative_id] = _short_label(id_to_label[narrative_id])
    for sub_id in narrative_to_subnarrative_ids[narrative_id]:
        G.add_node(sub_id)
        node_labels[sub_id] = _short_label(id_to_label[sub_id])
        G.add_edge(narrative_id, sub_id)

# Draw the graph using a hierarchical layout (graphviz 'dot' via pygraphviz)
plt.figure(figsize=(12, 8))
pos = nx.nx_agraph.graphviz_layout(G, prog='dot')
nx.draw(
    G,
    pos,
    labels=node_labels,  # display the short text instead of the raw node IDs
    with_labels=True,
    arrows=False,
    node_size=2000,
    font_size=8,
    node_color='lightblue',
)
plt.title("Snapshot: Two Narratives and Their Subnarratives")
plt.tight_layout()
plt.show()

In [None]:
# Read every annotation into one DataFrame via the project loader
annotations_df = load_all_annotations_to_df()

# Preview the top five rows as the cell output
annotations_df.head(n=5)

In [None]:
# Translate the narrative / subnarrative labels into IDs using the label_to_id lookup
annotations_with_ids = load_ids_to_df(annotations_df, label_to_id)

# Preview the top five rows of the ID-enriched result as the cell output
annotations_with_ids.head(n=5)

In [None]:
# Save the annotations_df DataFrame to a parquet file
output_path = os.path.join('data', 'processed', 'phase0_baseline.parquet')

# to_parquet does not create missing parent directories, so make sure the
# target folder exists before writing (a no-op when it is already there)
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# NOTE(review): this writes annotations_df rather than annotations_with_ids -
# confirm that the baseline file is meant to hold the frame passed to load_ids_to_df
annotations_df.to_parquet(output_path, index=False)