BI010875 MCA Analysis

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as ss
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, leaves_list
from graphviz import Digraph
from IPython.display import Image, display


Load the data

In [2]:

# ---- Load your data ----
file_path = "/home/king/projects/BI010875_MCA/Data_files/BI010875 MCA Analysis DATA MASTER v0.1.xlsx"

# headers start on row 4 (header=3)
df = pd.read_excel(file_path, sheet_name="Sheet1", header=3)


In [3]:
df.head()

Unnamed: 0,careepisodeID,IJD_IncidentNumber,IncidentDate,ChiefComplaint,MechanismDetail,ImpressionSuspectedDiagnosis,Mental Capacity Assessment Undertaken,Patient Does Have Capacity,Patient Does NOT Have Capacity,"Does the patient have an impairment or, a disturbance in the functioning of, their mind or brain at the moment?",Is the impairment or disturbance suffcient that the person lacks the capaity to make the decision at this time?,Does the patient understand the information relevant to the decision including the likely consequances...?,Can the patient retain that information?,Can the patient use or weigh that information as part of the process of making the decision?,Can the patient communicate that decision by any means?,Pathway_Comment,proposedCarePatientBestInterest,Service Outcome
0,23284264,S0308250927,2025-08-03,Neurological; Confusion,No Subcategory Recorded; No Subcategory Recorded,Infectious disease; Respiratory; Bronchopneumonia,Yes,No,Yes,Yes,Yes,NotRecorded,NotRecorded,NotRecorded,NotRecorded,,NotRecorded,
1,23298020,S0608250566,2025-08-06,Trauma / musculoskeletal; Backache (no recent ...,No Subcategory Recorded; No Subcategory Recorded,Soft tissue inj / wound; Muscle injury; Upper ...,Yes,Yes,,No,NotRecorded,NotRecorded,NotRecorded,NotRecorded,NotRecorded,,Yes,
2,23416083,S1512310313,2025-08-30,Neurological; Confusion,No Subcategory Recorded; No Subcategory Recorded,Infectious disease; GU / GI; Urinary tract inf...,Yes,No,Yes,Yes,Yes,NotRecorded,NotRecorded,NotRecorded,NotRecorded,,NotRecorded,
3,23318894,S1512312053,2025-08-10,Circulation / chest; Cardiac arrest,No Subcategory Recorded; No Subcategory Recorded,Trauma / FB; Complication of injury; Rhabdomyo...,Yes,No,Yes,Yes,Yes,NotRecorded,NotRecorded,NotRecorded,NotRecorded,,NotRecorded,
4,23332163,S2308131281,2025-08-13,Neurological; Falls / unsteady on feet,Fall; Tripping,Env / social / comp / NAD; Care process; No ab...,Yes,Yes,,No,NotRecorded,NotRecorded,NotRecorded,NotRecorded,NotRecorded,,NotRecorded,


Clean it up

In [4]:

# ---- Drop non-analytic fields ----
df = df.drop(columns=[  "careepisodeID", "IncidentDate", "IJD_IncidentNumber", 
                        "ChiefComplaint", "MechanismDetail", 
                        "ImpressionSuspectedDiagnosis"
    ], errors="ignore")



In [5]:
# Replace NaN with 'Unknown' for clarity
df = df.fillna("NotRecorded")

Show all unique values in the data

In [6]:
df.head()

Unnamed: 0,Mental Capacity Assessment Undertaken,Patient Does Have Capacity,Patient Does NOT Have Capacity,"Does the patient have an impairment or, a disturbance in the functioning of, their mind or brain at the moment?",Is the impairment or disturbance suffcient that the person lacks the capaity to make the decision at this time?,Does the patient understand the information relevant to the decision including the likely consequances...?,Can the patient retain that information?,Can the patient use or weigh that information as part of the process of making the decision?,Can the patient communicate that decision by any means?,Pathway_Comment,proposedCarePatientBestInterest,Service Outcome
0,Yes,No,Yes,Yes,Yes,NotRecorded,NotRecorded,NotRecorded,NotRecorded,NotRecorded,NotRecorded,NotRecorded
1,Yes,Yes,NotRecorded,No,NotRecorded,NotRecorded,NotRecorded,NotRecorded,NotRecorded,NotRecorded,Yes,NotRecorded
2,Yes,No,Yes,Yes,Yes,NotRecorded,NotRecorded,NotRecorded,NotRecorded,NotRecorded,NotRecorded,NotRecorded
3,Yes,No,Yes,Yes,Yes,NotRecorded,NotRecorded,NotRecorded,NotRecorded,NotRecorded,NotRecorded,NotRecorded
4,Yes,Yes,NotRecorded,No,NotRecorded,NotRecorded,NotRecorded,NotRecorded,NotRecorded,NotRecorded,NotRecorded,NotRecorded


Create Graphviz Digraph and build the tree

In [10]:
import pandas as pd
import plotly.graph_objects as go

# Example df_clean already defined

# All questions in order
questions = [
    "Mental Capacity Assessment Undertaken",
    "Patient Does Have Capacity",
    "Does the patient have an impairment or, a disturbance in the functioning of, their mind or brain at the moment?",
    "Is the impairment or disturbance suffcient that the person lacks the capaity to make the decision at this time?",
    "Does the patient understand the information relevant to the decision including the likely consequances...?",
    "Can the patient retain that information?",
    "Can the patient use or weigh that information as part of the process of making the decision?",
    "Can the patient communicate that decision by any means?",
    "proposedCarePatientBestInterest",
    "Service Outcome"
]

# Map questions to simple labels
question_labels = {q: f"Q{i+1}" for i, q in enumerate(questions)}

# Colors for answers
answer_colors = {"Yes": "lightgreen", "No": "lightcoral", "Not recorded": "lightgray", "NaN":"lightgray"}

# Create nodes
node_labels = []
node_map = {}
colors = []

for q in questions:
    for ans in df[q].unique():
        label = f"{question_labels[q]}\n{ans}"
        node_map[(q, ans)] = len(node_labels)
        node_labels.append(label)
        colors.append(answer_colors.get(ans, "lightblue"))

# Create edges
sources = []
targets = []
values = []
edge_labels = []

for i in range(len(questions)-1):
    q_from = questions[i]
    q_to = questions[i+1]
    counts = df.groupby([q_from, q_to]).size().reset_index(name='count')
    for _, row in counts.iterrows():
        sources.append(node_map[(q_from, row[q_from])])
        targets.append(node_map[(q_to, row[q_to])])
        values.append(row['count'])
        edge_labels.append(f"{row[q_from]} → {row[q_to]}: {row['count']}")

# Build Sankey diagram with smaller node font
fig = go.Figure(go.Sankey(
    node=dict(
        label=node_labels,
        color=colors,
        pad=15,
        thickness=30,
        line=dict(color='black', width=1),
        font=dict(size=12, color='black')  # node label font
    ),
    link=dict(
        source=sources,
        target=targets,
        value=values,
        color=[colors[s] for s in sources],
        hovertemplate='%{value} records<extra></extra>'
    ),
    domain=dict(x=[0,1], y=[0.33,1.0])
))

# Add legend for answer colors
for ans, color in answer_colors.items():
    fig.add_trace(go.Scatter(
        x=[None], y=[None], mode='markers', marker=dict(size=15, color=color),
        legendgroup=ans, showlegend=True, name=ans
    ))

# Annotation box below chart
annotation_text = "<pre>" + "\n".join([f"{question_labels[q]:<4} = {q}" for q in questions]) + "</pre>"

fig.update_layout(
    title_text="Patient Flow Through Questions to Service Outcome",
    font_size=14,
    width=1800,
    height=1200,
    xaxis_visible=False,
    yaxis_visible=False,
    annotations=[dict(
        text=annotation_text,
        x=0.5,
        y=0.02,
        showarrow=False,
        xref="paper",
        yref="paper",
        align="center",
        bordercolor="black",
        borderwidth=1,
        borderpad=10,
        bgcolor="white",
        font=dict(family="Courier New", size=12)  # smaller font in annotation
    )],
    margin=dict(l=50, r=50, t=50, b=150)
)

fig.show()


ValueError: Invalid property specified for object of type plotly.graph_objs.sankey.Node: 'font'

Did you mean "line"?

    Valid properties:
        align
            Sets the alignment method used to position the nodes
            along the horizontal axis.
        color
            Sets the `node` color. It can be a single value, or an
            array for specifying color for each `node`. If
            `node.color` is omitted, then the default `Plotly`
            color palette will be cycled through to have a variety
            of colors. These defaults are not fully opaque, to
            allow some visibility of what is beneath the node.
        colorsrc
            Sets the source reference on Chart Studio Cloud for
            `color`.
        customdata
            Assigns extra data to each node.
        customdatasrc
            Sets the source reference on Chart Studio Cloud for
            `customdata`.
        groups
            Groups of nodes. Each group is defined by an array with
            the indices of the nodes it contains. Multiple groups
            can be specified.
        hoverinfo
            Determines which trace information appear when hovering
            nodes. If `none` or `skip` are set, no information is
            displayed upon hovering. But, if `none` is set, click
            and hover events are still fired.
        hoverlabel
            :class:`plotly.graph_objects.sankey.node.Hoverlabel`
            instance or dict with compatible properties
        hovertemplate
            Template string used for rendering the information that
            appear on hover box. Note that this will override
            `hoverinfo`. Variables are inserted using %{variable},
            for example "y: %{y}" as well as %{xother}, {%_xother},
            {%_xother_}, {%xother_}. When showing info for several
            points, "xother" will be added to those with different
            x positions from the first point. An underscore before
            or after "(x|y)other" will add a space on that side,
            only when this field is shown. Numbers are formatted
            using d3-format's syntax %{variable:d3-format}, for
            example "Price: %{y:$.2f}".
            https://github.com/d3/d3-format/tree/v1.4.5#d3-format
            for details on the formatting syntax. Dates are
            formatted using d3-time-format's syntax
            %{variable|d3-time-format}, for example "Day:
            %{2019-01-01|%A}". https://github.com/d3/d3-time-
            format/tree/v2.2.3#locale_format for details on the
            date formatting syntax. The variables available in
            `hovertemplate` are the ones emitted as event data
            described at this link
            https://plotly.com/javascript/plotlyjs-events/#event-
            data. Additionally, every attributes that can be
            specified per-point (the ones that are `arrayOk: true`)
            are available.  Variables `sourceLinks` and
            `targetLinks` are arrays of link objects.Finally, the
            template string has access to variables `value` and
            `label`. Anything contained in tag `<extra>` is
            displayed in the secondary box, for example
            `<extra>%{fullData.name}</extra>`. To hide the
            secondary box completely, use an empty tag
            `<extra></extra>`.
        hovertemplatesrc
            Sets the source reference on Chart Studio Cloud for
            `hovertemplate`.
        label
            The shown name of the node.
        labelsrc
            Sets the source reference on Chart Studio Cloud for
            `label`.
        line
            :class:`plotly.graph_objects.sankey.node.Line` instance
            or dict with compatible properties
        pad
            Sets the padding (in px) between the `nodes`.
        thickness
            Sets the thickness (in px) of the `nodes`.
        x
            The normalized horizontal position of the node.
        xsrc
            Sets the source reference on Chart Studio Cloud for
            `x`.
        y
            The normalized vertical position of the node.
        ysrc
            Sets the source reference on Chart Studio Cloud for
            `y`.
        
Did you mean "line"?

Bad property path:
font
^^^^