In [None]:
import pandas as pd

In [None]:
import ast
import json
import os

# Load results from Data Category Annotation from File

annotated_folder = "../dummy_data/gdpr_annotated_data"


def _parse_annotation(raw_text, source_path):
    """Parse the text of one annotation file into a Python object without executing it.

    The text is first read as a Python literal (dict/list/str/number/bool/None)
    and, if that fails, as JSON.

    Args:
        raw_text: Full text content of the annotation file.
        source_path: Path of the file; only used in the error message.

    Returns:
        The parsed object.

    Raises:
        ValueError: If the text is neither a valid Python literal nor valid JSON.
    """
    stripped = raw_text.strip()
    try:
        # SECURITY: the original used eval() here, which runs any code stored in the
        # file. literal_eval only accepts literal syntax and never executes code.
        return ast.literal_eval(stripped)
    except (ValueError, TypeError, SyntaxError):
        pass
    try:
        return json.loads(stripped)
    except json.JSONDecodeError as exc:
        raise ValueError(
            f"Cannot parse annotation file {source_path!r} as a Python literal or JSON"
        ) from exc


all_files_info = []

# Walk <annotated_folder>/<user folder>/<conversation folder>/<file>.
# Listings are sorted so the record order (and tie-breaking in later top-N plots)
# does not depend on the file system's arbitrary os.listdir order.
for user_folder in sorted(os.listdir(annotated_folder)):
    user_path = os.path.join(annotated_folder, user_folder)
    if not os.path.isdir(user_path):
        continue
    for conv_folder in sorted(os.listdir(user_path)):
        conv_path = os.path.join(user_path, conv_folder)
        if not os.path.isdir(conv_path):
            continue
        for file_name in sorted(os.listdir(conv_path)):
            file_path = os.path.join(conv_path, file_name)
            if not os.path.isfile(file_path):
                continue
            with open(file_path, "r", encoding="utf-8") as f:
                file_content = f.read()
            all_files_info.append({
                "user_id": user_folder,
                "conversation_id": conv_folder,
                "file_name": file_name,
                "file_path": file_path,
                "content": _parse_annotation(file_content, file_path)
            })


In [None]:
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

# Seaborn look shared by all figures: white grid, "paper" context, colour-blind-safe palette
sns.set_theme(style="whitegrid", context="paper", font_scale=1.2)
sns.set_palette("colorblind")

# Matplotlib defaults applied in one go (mpl is already imported at the top of this cell)
mpl.rcParams.update({
    'axes.labelweight': 'bold',
    'axes.titlesize': 'x-large',
    'xtick.labelsize': 'large',
    'ytick.labelsize': 'large',
    'legend.fontsize': 'large',
    'figure.dpi': 100,    # on-screen resolution
    'savefig.dpi': 300,   # resolution used by savefig
    'axes.labelsize': 'x-large',
    'font.family': 'sans-serif',
    'font.sans-serif': ['Arial', 'DejaVu Sans', 'Liberation Sans', 'sans-serif'],
})


In [None]:
### From here on, entries whose verification answer is "C", "D" or "E" (per the original
### note: the model did not "agree" or "strongly agree") lose their psychological information.
for entry in all_files_info:
    content = entry.get("content", {})
    answer = content.get("verification", {}).get("answer", "")
    if answer not in {"C", "D", "E"}:
        continue
    # Blank out the psychological part of the categorization, if there is a categorization
    if "categorization" in content:
        content["categorization"]["psychological_information"] = []
    # The attribute mapping is always reset for these entries
    content["attribute_mapping"] = {}

In [None]:
# Flatten every annotated file into one row of identifiers plus 0/1 indicator flags.
# The column list is passed to the DataFrame explicitly so that the frame still has
# these columns when there are no records at all.
RECORD_COLUMNS = [
    "file_name",
    "user_id",
    "conversation_id",
    "contains_factual",
    "contains_psychological",
    "contains_personal_data",
    "contains_special_category_data",
]

records = []

for file_info in all_files_info:
    content = file_info.get("content", {})

    # A categorization field counts as present only when it is a non-empty list
    categorization = content.get("categorization", {})
    factual = categorization.get("factual_information", None)
    psych = categorization.get("psychological_information", None)
    contains_factual = int(isinstance(factual, list) and len(factual) > 0)
    contains_psychological = int(isinstance(psych, list) and len(psych) > 0)

    # "gdpr_analysis" and each item's "result" may be a single dict or a list of dicts;
    # both are normalised to lists before the categories are collected
    gdpr_analysis = content.get("gdpr_analysis", [])
    if isinstance(gdpr_analysis, dict):
        gdpr_analysis = [gdpr_analysis]

    gdpr_categories = []
    for item in gdpr_analysis:
        results = item.get("result", [])
        if isinstance(results, dict):
            results = [results]
        gdpr_categories.extend(res.get("category") for res in results)

    records.append({
        "file_name": file_info.get("file_name", ""),
        "user_id": file_info.get("user_id", ""),
        "conversation_id": file_info.get("conversation_id", ""),
        "contains_factual": contains_factual,
        "contains_psychological": contains_psychological,
        "contains_personal_data": int("personal_data" in gdpr_categories),
        "contains_special_category_data": int("special_category_data" in gdpr_categories),
    })

model_df = pd.DataFrame(records, columns=RECORD_COLUMNS)
model_df.head()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Share of samples that contain personal data and special category data
total_samples = len(model_df)

if total_samples == 0:
    # Checked before any column access: a frame built from an empty record list has
    # no columns, so reading them first would raise KeyError instead of reporting this.
    print("No samples found for distribution analysis.")
else:
    count_contains_personal = model_df["contains_personal_data"].sum()
    count_contains_special = model_df["contains_special_category_data"].sum()

    # Percentages rounded to whole numbers
    pct_personal = np.round(count_contains_personal / total_samples * 100, 0)
    pct_special = np.round(count_contains_special / total_samples * 100, 0)

    # NOTE(review): "GDPR 4(9)" looks like it should reference Art. 9(1) (special
    # categories); Art. 4(9) defines "recipient". Label left unchanged - confirm.
    labels = ["Personal\nGDPR 4(1)", "Special\nCategory\nGDPR 4(9)"]
    values = [pct_personal, pct_special]
    counts = [count_contains_personal, count_contains_special]
    bar_colors = ["#9BBB59", "#4F81BD"]  # green, blue

    plt.figure(figsize=(4,3))
    bars = plt.bar(labels, values, color=bar_colors, edgecolor="black")
    plt.ylabel("Percentage", fontsize=12, fontweight="bold")
    plt.xticks(fontsize=11, ha='center', fontweight="bold")
    plt.yticks(fontsize=11, fontweight="bold")

    # Annotate bars with percentage and absolute counts, slightly above the bar top
    label_offset = max(values) * 0.01
    for bar, pct, count in zip(bars, values, counts):
        percent_str = f"{pct:.0f}%"
        label_str = f"{percent_str}\n(n={count})"
        plt.text(
            bar.get_x() + bar.get_width()/2,
            bar.get_height() + label_offset,
            label_str,
            ha='center',
            va='bottom',
            fontsize=13,
            fontweight="bold"
        )

    plt.ylim(0, 100)
    plt.tight_layout()
    plt.savefig("gdpr_contains_personal_special_bar.pdf", bbox_inches="tight")

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# User-level view: how many distinct users *ever* had a row of each type
if len(model_df) == 0:
    # Checked before any column access: a frame built from an empty record list has
    # no columns, so reading "user_id" first would raise KeyError instead of reporting this.
    print("No users found for user-level distribution analysis.")
else:
    users_with_personal = set(model_df.loc[model_df["contains_personal_data"] == 1, "user_id"].unique())
    users_with_special = set(model_df.loc[model_df["contains_special_category_data"] == 1, "user_id"].unique())
    all_users = set(model_df["user_id"].unique())

    n_users_total = len(all_users)
    n_users_personal = len(users_with_personal)
    n_users_special = len(users_with_special)

    labels = ["Personal\nGDPR 4(1)", "Special\nCategory\nGDPR 4(9)"]
    values = [n_users_personal, n_users_special]
    bar_colors = ["#9BBB59", "#4F81BD"]

    plt.figure(figsize=(4, 3))
    bars = plt.bar(labels, values, color=bar_colors, edgecolor="black")
    plt.ylabel("Number of Users", fontsize=12, fontweight="bold")
    plt.xticks(fontsize=11, ha='center', fontweight="bold")
    plt.yticks(fontsize=11, fontweight="bold")

    # Annotate bars with unique user counts and their percentage of all users
    tallest = max(values)
    label_offset = tallest * 0.01
    for bar, count in zip(bars, values):
        percent = count / n_users_total * 100
        label_str = f"{percent:.0f}%\n(n={count})"
        plt.text(
            bar.get_x() + bar.get_width()/2,
            bar.get_height() + label_offset,
            label_str,
            ha='center',
            va='bottom',
            fontsize=13,
            fontweight="bold"
        )

    # 15% headroom for the annotations; fall back to 1 when both bars are zero so the
    # axis limits are not identical (0, 0)
    y_top = tallest * 1.15 if tallest > 0 else 1
    plt.ylim(0, y_top)
    plt.tight_layout()
    plt.savefig("gdpr_unique_users_personal_special_bar.pdf", bbox_inches="tight")

In [None]:
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np

# Collect the "data_type" of every gdpr_analysis result categorised as personal data
data_types = []
for file_entry in all_files_info:
    analysis = file_entry.get("content", {}).get("gdpr_analysis", [])
    # A single dict is treated like a one-element list
    analysis_items = [analysis] if isinstance(analysis, dict) else analysis
    for analysis_item in analysis_items:
        findings = analysis_item.get("result", [])
        if isinstance(findings, dict):
            findings = [findings]
        for finding in findings:
            is_personal = finding.get("category") == "personal_data"
            data_type = finding.get("data_type") if is_personal else None
            # Missing or empty data types are not counted
            if data_type:
                data_types.append(data_type)

# Occurrences per data_type
data_type_counts = Counter(data_types)

def clean_label(label):
    """Turn a raw data_type string into a short axis label.

    A trailing " identity" (matched case-insensitively) is dropped, the result is
    passed through str.capitalize(), and "Identification" is abbreviated to "ID".
    Because capitalize() lower-cases everything after the first character, the
    abbreviation can only apply when the label starts with that word.
    """
    suffix = " identity"
    trimmed = label[: -len(suffix)] if label.lower().endswith(suffix) else label
    return trimmed.capitalize().replace("Identification", "ID")

if not data_type_counts:
    print("No data_type values found in gdpr_analysis.result.")
else:
    # Plot only the 8 most frequent data types (sorted by count, descending)
    sorted_items = sorted(data_type_counts.items(), key=lambda x: -x[1])[:8]
    plot_labels = [clean_label(k) for k, v in sorted_items]
    plot_counts = [v for k, v in sorted_items]
    # Percentages are taken over ALL counted occurrences, not just the plotted ones;
    # dividing by the top-8 total would overstate every share when more types exist.
    total = sum(data_type_counts.values())
    percentages = [np.round(count / total * 100, 0) if total > 0 else 0 for count in plot_counts]

    # Assign colors, cycling through the palette when there are more bars than colors
    base_colors = ["#4F81BD", "#9BBB59", "#C0504D", "#8064A2", "#F79646", "#2C4D75", "#77933C"]
    plot_colors = [base_colors[i % len(base_colors)] for i in range(len(plot_labels))]

    plt.figure(figsize=(4,3))
    bars = plt.bar(plot_labels, percentages, color=plot_colors, edgecolor="black")
    plt.ylabel("Percentage", fontsize=12)
    plt.xticks(fontsize=10, rotation=50, ha='right')
    plt.yticks(fontsize=11)

    # Add value labels on top of bars (as percentages)
    for bar, pct in zip(bars, percentages):
        height = bar.get_height()
        plt.text(
            bar.get_x() + bar.get_width()/2,
            height + 1,
            f"{pct:.0f}%",
            ha='center',
            va='bottom',
            fontsize=9
        )

    plt.ylim(0, 100)
    plt.tight_layout()
    plt.savefig("gdpr_personal_data_type_distribution.pdf", bbox_inches="tight")

In [None]:
import matplotlib.pyplot as plt
from collections import defaultdict
import numpy as np

# For every personal-data data_type, collect the set of users it was found for;
# all_user_ids tracks every non-empty user_id seen, whether or not it has findings
data_type_to_users = defaultdict(set)
all_user_ids = set()

for file_entry in all_files_info:
    user_id = file_entry.get("user_id")
    if user_id:
        all_user_ids.add(user_id)

    analysis = file_entry.get("content", {}).get("gdpr_analysis", [])
    # A single dict is treated like a one-element list
    analysis_items = [analysis] if isinstance(analysis, dict) else analysis
    for analysis_item in analysis_items:
        findings = analysis_item.get("result", [])
        if isinstance(findings, dict):
            findings = [findings]
        for finding in findings:
            is_personal = finding.get("category") == "personal_data"
            data_type = finding.get("data_type") if is_personal else None
            # Only record findings that have both a data type and a user
            if data_type and user_id:
                data_type_to_users[data_type].add(user_id)

def clean_label(label):
    """Shorten a raw data_type string for use as a bar label.

    Drops a trailing " identity" (case-insensitive match), applies str.capitalize()
    and abbreviates "Identification" to "ID". Since capitalize() lower-cases all but
    the first character, the abbreviation only applies to a leading "Identification".
    """
    suffix = " identity"
    has_suffix = label.lower().endswith(suffix)
    core = label[: -len(suffix)] if has_suffix else label
    capitalized = core.capitalize()
    return capitalized.replace("Identification", "ID")

if data_type_to_users:
    # Number of distinct users per data_type
    user_counts = {}
    for data_type, users in data_type_to_users.items():
        user_counts[data_type] = len(users)

    # Keep the 8 data types that reach the most users (stable sort, highest first)
    sorted_items = sorted(user_counts.items(), key=lambda pair: pair[1], reverse=True)[:8]
    plot_labels = [clean_label(name) for name, n_users in sorted_items]
    plot_counts = [n_users for name, n_users in sorted_items]

    # Each bar is the share of ALL known users, rounded to a whole percent
    total_users = len(all_user_ids)
    if total_users > 0:
        percentages = [np.round(n_users / total_users * 100, 0) for n_users in plot_counts]
    else:
        percentages = [0 for n_users in plot_counts]

    # Cycle through the palette when there are more bars than colors
    base_colors = ["#4F81BD", "#9BBB59", "#C0504D", "#8064A2", "#F79646", "#2C4D75", "#77933C"]
    plot_colors = [base_colors[idx % len(base_colors)] for idx, name in enumerate(plot_labels)]

    plt.figure(figsize=(4,3))
    bars = plt.bar(plot_labels, percentages, color=plot_colors, edgecolor="black")
    plt.ylabel("Percentage", fontsize=12)
    plt.xticks(fontsize=10, rotation=50, ha='right')
    plt.yticks(fontsize=11)

    # Write the percentage just above each bar
    for bar, pct in zip(bars, percentages):
        x_center = bar.get_x() + bar.get_width()/2
        plt.text(x_center, bar.get_height() + 1, f"{pct:.0f}%", ha='center', va='bottom', fontsize=8)

    plt.ylim(0, 100)
    plt.tight_layout()
    plt.savefig("gdpr_personal_data_type_user_distribution.pdf", bbox_inches="tight")
else:
    print("No data_type values found in gdpr_analysis.result.")


In [None]:
# Collect the "data_type" of every gdpr_analysis result categorised as special category data
data_types = []
for file_entry in all_files_info:
    analysis = file_entry.get("content", {}).get("gdpr_analysis", [])
    # A single dict is treated like a one-element list
    analysis_items = [analysis] if isinstance(analysis, dict) else analysis
    for analysis_item in analysis_items:
        findings = analysis_item.get("result", [])
        if isinstance(findings, dict):
            findings = [findings]
        for finding in findings:
            is_special = finding.get("category") == "special_category_data"
            data_type = finding.get("data_type") if is_special else None
            # Missing or empty data types are not counted
            if data_type:
                data_types.append(data_type)

# Occurrences per data_type
data_type_counts = Counter(data_types)

def clean_label(label):
    """Return the label as formatted by str.capitalize() for use on the x-axis."""
    display_label = label.capitalize()
    return display_label

if not data_type_counts:
    print("No data_type values found in gdpr_analysis.result.")
else:
    # Sort by count descending and take only the top 10
    sorted_items = sorted(data_type_counts.items(), key=lambda x: -x[1])[:10]
    plot_labels = [clean_label(k) for k, v in sorted_items]
    plot_counts = [v for k, v in sorted_items]
    # Percentages are taken over ALL counted occurrences, not just the plotted ones;
    # dividing by the top-10 total would overstate every share when more types exist.
    total = sum(data_type_counts.values())
    percentages = [np.round(count / total * 100, 0) if total > 0 else 0 for count in plot_counts]

    # Assign colors, cycling through the palette when there are more bars than colors
    base_colors = ["#4F81BD", "#9BBB59", "#C0504D", "#8064A2", "#F79646", "#2C4D75", "#77933C"]
    plot_colors = [base_colors[i % len(base_colors)] for i in range(len(plot_labels))]

    plt.figure(figsize=(4,3))
    bars = plt.bar(plot_labels, percentages, color=plot_colors, edgecolor="black")
    plt.ylabel("Percentage", fontsize=12)
    plt.xticks(fontsize=10, rotation=50, ha='right')
    plt.yticks(fontsize=11)

    # Add value labels on top of bars (as percentage)
    for bar, pct in zip(bars, percentages):
        height = bar.get_height()
        plt.text(
            bar.get_x() + bar.get_width()/2,
            height + 1,
            f"{pct:.0f}%",
            ha='center',
            va='bottom',
            fontsize=9
        )

    plt.ylim(0, 100)
    plt.tight_layout()
    plt.savefig("gdpr_special_data_type_distribution.pdf", bbox_inches="tight")

In [None]:
# Build a mapping: special_data_type -> set of user_ids
from collections import defaultdict

# For every special-category data_type, collect the set of users it was found for;
# all_user_ids tracks every non-empty user_id seen, whether or not it has findings
special_data_type_to_users = defaultdict(set)
all_user_ids = set()

for file_entry in all_files_info:
    user_id = file_entry.get("user_id")
    if user_id:
        all_user_ids.add(user_id)

    analysis = file_entry.get("content", {}).get("gdpr_analysis", [])
    # A single dict is treated like a one-element list
    analysis_items = [analysis] if isinstance(analysis, dict) else analysis
    for analysis_item in analysis_items:
        findings = analysis_item.get("result", [])
        if isinstance(findings, dict):
            findings = [findings]
        for finding in findings:
            is_special = finding.get("category") == "special_category_data"
            data_type = finding.get("data_type") if is_special else None
            # Only record findings that have both a data type and a user
            if data_type and user_id:
                special_data_type_to_users[data_type].add(user_id)

def clean_label(label):
    """Format a raw data_type string as a compact bar label.

    Removes a trailing " identity" (case-insensitive match), applies
    str.capitalize(), then replaces "Identification" with "ID". capitalize()
    lower-cases everything after the first character, so only a leading
    "Identification" is ever abbreviated.
    """
    suffix = " identity"
    if label.lower().endswith(suffix):
        cut = len(label) - len(suffix)
        label = label[:cut]
    return label.capitalize().replace("Identification", "ID")

if special_data_type_to_users:
    # Number of distinct users per special data_type
    user_counts = {}
    for data_type, users in special_data_type_to_users.items():
        user_counts[data_type] = len(users)

    # Keep the 10 data types that reach the most users (stable sort, highest first)
    sorted_items = sorted(user_counts.items(), key=lambda pair: pair[1], reverse=True)[:10]
    plot_labels = [clean_label(name) for name, n_users in sorted_items]
    plot_counts = [n_users for name, n_users in sorted_items]

    # Each bar is the share of ALL known users, rounded to a whole percent
    total_users = len(all_user_ids)
    if total_users > 0:
        percentages = [np.round(n_users / total_users * 100, 0) for n_users in plot_counts]
    else:
        percentages = [0 for n_users in plot_counts]

    # Cycle through the palette when there are more bars than colors
    base_colors = ["#4F81BD", "#9BBB59", "#C0504D", "#8064A2", "#F79646", "#2C4D75", "#77933C"]
    plot_colors = [base_colors[idx % len(base_colors)] for idx, name in enumerate(plot_labels)]

    plt.figure(figsize=(4,3))
    bars = plt.bar(plot_labels, percentages, color=plot_colors, edgecolor="black")
    plt.ylabel("Percentage", fontsize=12)
    plt.xticks(fontsize=10, rotation=50, ha='right')
    plt.yticks(fontsize=11)

    # Write the percentage just above each bar
    for bar, pct in zip(bars, percentages):
        x_center = bar.get_x() + bar.get_width()/2
        plt.text(x_center, bar.get_height() + 1, f"{pct:.0f}%", ha='center', va='bottom', fontsize=8)

    plt.ylim(0, 100)
    plt.tight_layout()
    plt.savefig("gdpr_special_data_type_user_distribution.pdf", bbox_inches="tight")
else:
    print("No data_type values found in gdpr_analysis.result.")