# Label Distribution Analysis

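In this notebook, we analyze and visualize how labels are distributed in the dataset: overall narrative and subnarrative frequencies, the number of labels per document, per-language breakdowns, label co-occurrence, and document length.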

In [None]:
import pandas as pd
import os
import sys

# Resolve the project root.
# The notebook normally runs from a direct sub-directory of the root, so the
# root is the parent of the working directory. Re-running this cell (or
# starting the kernel at the root) must not climb another level, so the
# working directory is kept as-is when the processed-data folder that the
# later cells read from is already visible from it.
_cwd = os.getcwd()
if os.path.isdir(os.path.join(_cwd, 'data', 'processed')):
    ROOT_DIR = _cwd
else:
    ROOT_DIR = os.path.abspath(os.path.join(_cwd, '..'))

# Make the root importable without piling up duplicate entries on re-runs
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

# Set the current working directory to the project root
os.chdir(ROOT_DIR)

In [None]:
# Read the baseline dataset; the path is relative to the current working directory
baseline_path = 'data/processed/phase0_baseline.parquet'
df = pd.read_parquet(baseline_path)

# Show the first rows as a quick sanity check
df.head()

In [None]:
import numpy as np

# Peek at the two label columns: the first rows plus the Python type of the first entry
for header, column in (
    ('narratives column:', 'narratives'),
    ('\nsubnarratives column:', 'subnarratives'),
):
    print(header)
    print(df[column].head())
    print('Type of first entry:', type(df[column].iloc[0]))


def _ndarray_to_list(value):
    """Return ``value.tolist()`` for NumPy arrays; pass any other value through unchanged."""
    if isinstance(value, np.ndarray):
        return value.tolist()
    return value


# Normalise every label / id column so that array-valued entries become plain lists
for column in ('narratives', 'subnarratives', 'narrative_ids', 'subnarrative_ids'):
    df[column] = df[column].apply(_ndarray_to_list)

# Narrative and subnarrative distribution

In [None]:
from collections import Counter
import matplotlib.pyplot as plt

def _as_plain_list(entry):
    """Return ``entry`` as a list, using ``tolist()`` when the object provides it."""
    if hasattr(entry, 'tolist'):
        return entry.tolist()
    return list(entry)


def _bar_plot_counts(counts, figsize, title, xlabel):
    """Draw one bar per key of ``counts`` (in insertion order) and show the figure."""
    plt.figure(figsize=figsize)
    plt.bar(list(counts.keys()), list(counts.values()))
    plt.xticks(rotation=90)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Count')
    plt.tight_layout()
    plt.show()


# Narratives: one flat list over all documents, then occurrences per label
narratives_flat = []
for doc_narratives in df['narratives']:
    narratives_flat.extend(_as_plain_list(doc_narratives))
narrative_counts = Counter(narratives_flat)

_bar_plot_counts(narrative_counts, (12, 6), 'Narrative Distribution', 'Narrative')

# Subnarratives: same procedure on the finer-grained label column
subnarratives_flat = []
for doc_subnarratives in df['subnarratives']:
    subnarratives_flat.extend(_as_plain_list(doc_subnarratives))
subnarrative_counts = Counter(subnarratives_flat)

_bar_plot_counts(subnarrative_counts, (14, 6), 'Subnarrative Distribution', 'Subnarrative')

In [None]:
from matplotlib import cm
import numpy as np

# Occurrences of every narrative label across all documents
narrative_counts = Counter(narr for sublist in df['narratives'] for narr in sublist)
labels, sizes = list(narrative_counts.keys()), list(narrative_counts.values())

# Custom color list (prettier and more distinct)
custom_colors = [
    '#4E79A7', '#F28E2B', '#E15759', '#76B7B2', '#59A14F',
    '#EDC948', '#B07AA1', '#FF9DA7', '#9C755F', '#BAB0AC',
    '#A0CBE8', '#FFBE7D', '#FF9D9A', '#86BCB6', '#8CD17D',
    '#F1CE63', '#D4A6C8', '#FABFD2', '#D7B5A6', '#DCDBC1'
]
# Walk the palette cyclically so every label gets a color even when labels outnumber colors
colors = [custom_colors[i % len(custom_colors)] for i in range(len(labels))]

plt.figure(figsize=(8, 8))
wedges, texts = plt.pie(sizes, colors=colors, startangle=140)
plt.title('Narrative Proportions')

# The wedges carry no text; the label names are listed in a legend to the right of the pie
plt.legend(
    wedges,
    labels,
    title="Narratives",
    loc="center left",
    bbox_to_anchor=(1, 0.5),
)
plt.tight_layout()
plt.show()

# Number of documents with a certain number of labels

In [None]:
def _label_count(value):
    """Return how many labels a cell holds.

    Lists, tuples and NumPy arrays are counted by length. Anything else
    (missing values, scalars, strings) counts as 0 labels.

    Arrays are accepted on purpose: a cell that still holds an array (for
    example because the list-conversion cell was skipped) would otherwise be
    reported as having 0 labels without any error.
    """
    if isinstance(value, (list, tuple, np.ndarray)):
        return len(value)
    return 0


# Number of narrative / subnarrative labels attached to each document
df['num_narratives'] = df['narratives'].apply(_label_count)
df['num_subnarratives'] = df['subnarratives'].apply(_label_count)


In [None]:
# One bar chart per label level: how many documents carry 0, 1, 2, ... labels
for count_column, axis_label, plot_title in (
    ('num_narratives', 'Number of Narratives', 'Documents per Number of Narratives'),
    ('num_subnarratives', 'Number of Subnarratives', 'Documents per Number of Subnarratives'),
):
    fig, ax = plt.subplots(figsize=(8, 4))
    # sort_index() orders the bars by label count rather than by frequency
    df[count_column].value_counts().sort_index().plot(kind='bar', ax=ax)
    ax.set_xlabel(axis_label)
    ax.set_ylabel('Number of Documents')
    ax.set_title(plot_title)
    fig.tight_layout()
    plt.show()

# Number of documents per language

In [None]:
# Documents per language, most frequent language first (value_counts default order)
languge_documents_counts = df['language'].value_counts()

fig, ax = plt.subplots(figsize=(8, 4))
languge_documents_counts.plot(kind='bar', ax=ax)
ax.set_xlabel('Language')
ax.set_ylabel('Number of Documents')
ax.set_title('Number of Documents per Language')
fig.tight_layout()
plt.show()

In [None]:
def _mean_per_language(count_column):
    """Mean of ``count_column`` per language, sorted from highest to lowest."""
    per_language = df.groupby('language')[count_column].mean()
    return per_language.sort_values(ascending=False)


def _plot_language_means(means, title, ylabel):
    """Bar chart of per-language means with slanted language tick labels."""
    plt.figure(figsize=(10, 6))
    means.plot(kind='bar')
    plt.title(title)
    plt.xlabel('Language')
    plt.ylabel(ylabel)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


# Narrative level
avg_narr_labels_by_lang = _mean_per_language('num_narratives')
_plot_language_means(
    avg_narr_labels_by_lang,
    'Average Number of Narrative Labels per Document by Language',
    'Average Narrative Labels',
)

# Subnarrative level
avg_subnarr_labels_by_lang = _mean_per_language('num_subnarratives')
_plot_language_means(
    avg_subnarr_labels_by_lang,
    'Average Number of Subnarrative Labels per Document by Language',
    'Average Subnarrative Labels',
)

# Labels distribution per language



In [None]:

import seaborn as sns
import matplotlib.pyplot as plt

def _language_heatmap(table, figsize, title, xlabel, xtick_kwargs):
    """Annotated heatmap of a language-by-label count table."""
    plt.figure(figsize=figsize)
    sns.heatmap(table, annot=True, fmt="d", cmap="viridis")
    plt.title(title)
    plt.ylabel('Language')
    plt.xlabel(xlabel)
    plt.xticks(**xtick_kwargs)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()


# Long format: one row per (document, label) pair for each label level
df_exploded_narratives = df.explode('narratives')
df_exploded_subnarratives = df.explode('subnarratives')

# Count tables with languages as rows and labels as columns
narrative_lang_dist = pd.crosstab(
    index=df_exploded_narratives['language'],
    columns=df_exploded_narratives['narratives'],
)
subnarrative_lang_dist = pd.crosstab(
    index=df_exploded_subnarratives['language'],
    columns=df_exploded_subnarratives['subnarratives'],
)

# Narratives heatmap
_language_heatmap(
    narrative_lang_dist,
    (15, 8),
    'Narrative Label Distribution per Language',
    'Narrative',
    {'rotation': 45, 'ha': 'right'},
)

# Subnarratives heatmap (more columns, so a larger figure and vertical tick labels)
_language_heatmap(
    subnarrative_lang_dist,
    (18, 10),
    'Subnarrative Label Distribution per Language',
    'Subnarrative',
    {'rotation': 90},
)


# Label cooccurrence

In [None]:
# Compute the cooccurrence matrix for narrative labels
from itertools import combinations
import numpy as np

# All distinct narrative labels, sorted so rows and columns have a stable order
narratives_list = sorted(set(narr for sublist in df['narratives'] for narr in sublist))
label_position = {label: pos for pos, label in enumerate(narratives_list)}

# Accumulate the counts in a plain NumPy array and wrap it in a DataFrame afterwards.
# Writing through `DataFrame.values` is avoided because that array can be read-only
# (pandas Copy-on-Write), and per-cell `.loc` updates are slow inside a Python loop.
pair_counts = np.zeros((len(narratives_list), len(narratives_list)), dtype=float)

# Count cooccurrences: every unordered pair of distinct labels within a document counts once
for doc_labels in df['narratives']:
    for a, b in combinations(sorted(set(doc_labels)), 2):
        i, j = label_position[a], label_position[b]
        pair_counts[i, j] += 1
        pair_counts[j, i] += 1

# A label paired with itself is not a cooccurrence; NaN marks the diagonal so it can be masked
np.fill_diagonal(pair_counts, np.nan)

cooccurrence_matrix = pd.DataFrame(pair_counts, index=narratives_list, columns=narratives_list)

plt.figure(figsize=(14, 12))
ax = sns.heatmap(
    cooccurrence_matrix,
    annot=True,  # Show numbers
    fmt='.0f',   # No decimals
    cmap="viridis",  # Different color scheme
    square=True,
    mask=cooccurrence_matrix.isna(),
    cbar_kws={'label': 'Cooccurrence Count'},
    linewidths=0.5,
    linecolor='gray',
)
# Masked cells are not drawn, so the axes background shows through on the diagonal;
# painting it gray makes the figure match what the title promises.
ax.set_facecolor('lightgray')
plt.title('Narrative Label Cooccurrence Heatmap (Diagonal = Gray)')
plt.xlabel('Narrative Label')
plt.ylabel('Narrative Label')
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

In [None]:
# Calculate word counts for each document and plot histogram by bins

def count_words(text):
    """Return the number of whitespace-separated tokens in ``text``.

    Any value that is not a ``str`` (for example ``None`` or a number) yields 0.
    """
    return len(text.split()) if isinstance(text, str) else 0

# If your dataframe has a column like 'text' or 'document', adjust the column name below:
df['word_count'] = df['text'].apply(count_words)

# Define bins for word counts (adjust as needed)
bins = [0, 100, 250, 500, 1000, 2000, 5000, 10000, float('inf')]
labels = ['0-100', '101-250', '251-500', '501-1000', '1001-2000', '2001-5000', '5001-10000', '10000+']
# Intervals are right-closed, so by default the first one would be (0, 100] and a
# document with 0 words (empty or non-string text) would land in no bin at all.
# include_lowest=True closes the first interval on the left to match its '0-100' label.
df['word_count_bin'] = pd.cut(
    df['word_count'],
    bins=bins,
    labels=labels,
    right=True,
    include_lowest=True,
)

# Plot histogram of document counts per bin (sort=False keeps the bins in ascending order)
plt.figure(figsize=(10, 6))
df['word_count_bin'].value_counts(sort=False).plot(kind='bar', color='skyblue', edgecolor='black')
plt.xlabel('Word Count Bin')
plt.ylabel('Number of Documents')
plt.title('Document Count per Word Count Bin')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Distribution of per-document word counts as a single vertical boxplot
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(y=df['word_count'], color='lightgreen', ax=ax)
ax.set_ylabel('Word Count')
ax.set_title('Boxplot of Document Word Counts')
fig.tight_layout()
plt.show()

In [None]:
# Persist the DataFrame, including the columns added in this notebook, without the index.
# NOTE(review): this overwrites the same file the notebook loads at the start; confirm that is intended.
df.to_parquet(path='data/processed/phase0_baseline.parquet', index=False)