In [1]:
'''
    Creating histograms from all output non-PII variables in baseline from the value_reconciliation files of each section's QC.

    Last updated: 06/11/2025
    Author: Tal Cohen
'''

"\n    Creating histograms from all output non-PII variables in baseline from the value_reconciliation files of each section's QC.\n\n    Last updated: 06/11/2025\n    Author: Tal Cohen\n"

In [2]:
import json
import math
import os
import sys
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.backends.backend_pdf import PdfPages

# Make the project-local `config` module importable. Every backslash is escaped:
# the previous literal contained a bare "\S", which is an invalid escape sequence
# (SyntaxWarning on Python 3.12+). The resulting path string is unchanged.
sys.path.append(os.path.abspath("N:\\CancerEpidem\\BrBreakthrough\\DeliveryProcess\\Schema_and_Derivation_utils"))
from config import validation_path

In [3]:
def _is_numberish(v) -> bool:
    if v is None:
        return False
    if isinstance(v, (int, float)):
        return True
    if isinstance(v, str):
        t = v.strip()
        if t == "":
            return False
        try:
            float(t)
            return True
        except ValueError:
            return False
    return False

In [4]:
def _histogram(ax, values, title: str):
    vals = [v for v in values if v is not None]
    if not vals:
        ax.text(0.5, 0.5, "No data", ha="center", va="center")
        ax.set_title(title)
        return

    is_numeric = all(_is_numberish(v) for v in vals)

    # Common bar width (acts as your minimum width)
    bar_width = 0.8

    if is_numeric:
        # Treat numeric as categories, no binning
        arr = [float(v) for v in vals]
        cnt = Counter(arr)

        # Sort numerically so x-axis is ordered
        items = sorted(cnt.items(), key=lambda x: x[0])
        labels = [str(v) for v, _ in items]
        freqs = [c for _, c in items]
        xpos = np.arange(len(labels))

        bars = ax.bar(xpos, freqs, width=bar_width)

        # Put counts on top of each bar
        for bar in bars:
            height = bar.get_height()
            ax.text(
                bar.get_x() + bar.get_width() / 2,
                height,
                int(height),
                ha="center",
                va="bottom",
            )

        ax.set_xticks(xpos)
        ax.set_xticklabels(labels, rotation=45, ha="right")

    else:
        # Categorical: still limit to top 25
        cnt = Counter(vals)
        top = cnt.most_common(25)
        labels, freqs = zip(*top)
        xpos = np.arange(len(labels))

        bars = ax.bar(xpos, freqs, width=bar_width)

        # Counts on top of bars
        for bar in bars:
            height = bar.get_height()
            ax.text(
                bar.get_x() + bar.get_width() / 2,
                height,
                int(height),
                ha="center",
                va="bottom",
            )

        ax.set_xticks(xpos)
        ax.set_xticklabels(labels, rotation=45, ha="right")

    ax.set_title(title)

In [5]:
def hist_creation(section):
    """Write one bar-chart page per reconciled variable of ``section`` to a PDF.

    Reads ``<validation_path>/<section>_ValidationSummary/value_reconciliation.json``,
    takes ``reconciliation_details`` from its ``<section>_ValueReconciliation``
    entry, and for every variable plots its ``actual_frequencies`` with
    ``_histogram``. The PDF is written next to the JSON file as
    ``<section>_histograms.pdf``.

    Parameters
    ----------
    section : str
        Section name used to build the folder, JSON key and PDF file names.

    Returns
    -------
    str
        Path of the PDF that was written.

    Raises
    ------
    KeyError
        If the JSON lacks the ``<section>_ValueReconciliation`` /
        ``reconciliation_details`` structure.
    """
    summary_dir = os.path.join(validation_path, f'{section}_ValidationSummary')
    recon = os.path.join(summary_dir, 'value_reconciliation.json')

    with open(recon, "r", encoding="utf-8") as f:
        recon_json = json.load(f)

    # Look the keys up explicitly so a malformed file fails with a clear message
    # instead of an AttributeError on None.
    try:
        recon_details = recon_json[f"{section}_ValueReconciliation"]["reconciliation_details"]
    except (KeyError, TypeError) as exc:
        raise KeyError(
            f"'{section}_ValueReconciliation' -> 'reconciliation_details' not found in {recon}"
        ) from exc

    output_pdf_path = os.path.join(summary_dir, f'{section}_histograms.pdf')

    with PdfPages(output_pdf_path) as pdf:
        for question, detail in recon_details.items():
            # A variable without recorded frequencies gets a "No data" page.
            actual_freq = (detail or {}).get("actual_frequencies") or {}

            # Expand {value: count} into one list entry per observation;
            # the "null" key stands for missing values and is passed as None.
            act_vals = []
            for raw_value, count in actual_freq.items():
                value = None if raw_value == "null" else raw_value
                act_vals.extend([value] * int(count))

            fig, ax = plt.subplots(figsize=(11.69, 8.27))  # A4 landscape
            try:
                _histogram(ax, act_vals, question)
                fig.tight_layout()
                pdf.savefig(fig)
            finally:
                # Always release the figure, even if plotting or saving fails.
                plt.close(fig)

    return output_pdf_path

In [6]:
# Build the histogram PDF for the "XRays" section; the returned PDF path is the cell output.
hist_creation("XRays")

'N:\\CancerEpidem\\BrBreakthrough\\DeliveryProcess\\Schema_and_Derivation_utils\\\\Questionnaire\\\\R0\\validation\\XRays_ValidationSummary\\XRays_histograms.pdf'