BI010875 MCA Analysis

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import scipy.stats as ss
import seaborn as sns
from graphviz import Digraph
from IPython.display import Image, display
from scipy.cluster.hierarchy import linkage, leaves_list


Load the data

In [2]:

# ---- Load your data ----
# Location of the master workbook. The default is the original local path; set
# the MCA_DATA_FILE environment variable to run the notebook against a copy
# stored elsewhere without editing this cell.
file_path = os.environ.get(
    "MCA_DATA_FILE",
    "/home/king/projects/BI010875_MCA/Data_files/BI010875 MCA Analysis DATA MASTER v0.1.xlsx",
)

# Column headers sit on the fourth spreadsheet row, i.e. zero-based header=3.
df = pd.read_excel(file_path, sheet_name="Sheet1", header=3)


In [3]:
# Preview the first five rows to check that the header row was parsed correctly.
# NOTE(review): the rendered preview includes episode/incident identifiers and
# clinical text -- confirm the saved output may be shared, or clear it first.
df.head(n=5)

Unnamed: 0,careepisodeID,IJD_IncidentNumber,IncidentDate,ChiefComplaint,MechanismDetail,ImpressionSuspectedDiagnosis,Mental Capacity Assessment Undertaken,Patient Does Have Capacity,Patient Does NOT Have Capacity,"Does the patient have an impairment or, a disturbance in the functioning of, their mind or brain at the moment?",Is the impairment or disturbance suffcient that the person lacks the capaity to make the decision at this time?,Does the patient understand the information relevant to the decision including the likely consequances...?,Can the patient retain that information?,Can the patient use or weigh that information as part of the process of making the decision?,Can the patient communicate that decision by any means?,Pathway_Comment,proposedCarePatientBestInterest,Service Outcome
0,23284264,S0308250927,2025-08-03,Neurological; Confusion,No Subcategory Recorded; No Subcategory Recorded,Infectious disease; Respiratory; Bronchopneumonia,Yes,No,Yes,Yes,Yes,NotRecorded,NotRecorded,NotRecorded,NotRecorded,,NotRecorded,
1,23298020,S0608250566,2025-08-06,Trauma / musculoskeletal; Backache (no recent ...,No Subcategory Recorded; No Subcategory Recorded,Soft tissue inj / wound; Muscle injury; Upper ...,Yes,Yes,,No,NotRecorded,NotRecorded,NotRecorded,NotRecorded,NotRecorded,,Yes,
2,23416083,S1512310313,2025-08-30,Neurological; Confusion,No Subcategory Recorded; No Subcategory Recorded,Infectious disease; GU / GI; Urinary tract inf...,Yes,No,Yes,Yes,Yes,NotRecorded,NotRecorded,NotRecorded,NotRecorded,,NotRecorded,
3,23318894,S1512312053,2025-08-10,Circulation / chest; Cardiac arrest,No Subcategory Recorded; No Subcategory Recorded,Trauma / FB; Complication of injury; Rhabdomyo...,Yes,No,Yes,Yes,Yes,NotRecorded,NotRecorded,NotRecorded,NotRecorded,,NotRecorded,
4,23332163,S2308131281,2025-08-13,Neurological; Falls / unsteady on feet,Fall; Tripping,Env / social / comp / NAD; Care process; No ab...,Yes,Yes,,No,NotRecorded,NotRecorded,NotRecorded,NotRecorded,NotRecorded,,NotRecorded,


Clean it up

In [4]:

# ---- Remove identifier and free-text columns that are not analysed ----
non_analytic_columns = [
    "careepisodeID",
    "IncidentDate",
    "IJD_IncidentNumber",
    "ChiefComplaint",
    "MechanismDetail",
    "ImpressionSuspectedDiagnosis",
]
# errors="ignore" skips any listed column that is not present, so re-running
# this cell after the columns are already gone does not raise.
df = df.drop(columns=non_analytic_columns, errors="ignore")



In [5]:
# Label every missing value explicitly as "NotRecorded" so that gaps show up
# as their own answer category instead of NaN.
df = df.fillna(value="NotRecorded")

Preview the cleaned data

In [6]:
# First five rows after dropping the non-analytic columns and filling gaps.
df.head(n=5)

Unnamed: 0,Mental Capacity Assessment Undertaken,Patient Does Have Capacity,Patient Does NOT Have Capacity,"Does the patient have an impairment or, a disturbance in the functioning of, their mind or brain at the moment?",Is the impairment or disturbance suffcient that the person lacks the capaity to make the decision at this time?,Does the patient understand the information relevant to the decision including the likely consequances...?,Can the patient retain that information?,Can the patient use or weigh that information as part of the process of making the decision?,Can the patient communicate that decision by any means?,Pathway_Comment,proposedCarePatientBestInterest,Service Outcome
0,Yes,No,Yes,Yes,Yes,NotRecorded,NotRecorded,NotRecorded,NotRecorded,NotRecorded,NotRecorded,NotRecorded
1,Yes,Yes,NotRecorded,No,NotRecorded,NotRecorded,NotRecorded,NotRecorded,NotRecorded,NotRecorded,Yes,NotRecorded
2,Yes,No,Yes,Yes,Yes,NotRecorded,NotRecorded,NotRecorded,NotRecorded,NotRecorded,NotRecorded,NotRecorded
3,Yes,No,Yes,Yes,Yes,NotRecorded,NotRecorded,NotRecorded,NotRecorded,NotRecorded,NotRecorded,NotRecorded
4,Yes,Yes,NotRecorded,No,NotRecorded,NotRecorded,NotRecorded,NotRecorded,NotRecorded,NotRecorded,NotRecorded,NotRecorded


Build a Plotly Sankey diagram of patient flow through the assessment questions

In [7]:
# Uses `df` as left by the cleaning cells (non-analytic columns dropped,
# missing values filled with "NotRecorded").

# Columns to chain together, in the order the flow is drawn.
questions = [
    "Mental Capacity Assessment Undertaken",
    "Patient Does Have Capacity",
    "Does the patient have an impairment or, a disturbance in the functioning of, their mind or brain at the moment?",
    "Is the impairment or disturbance suffcient that the person lacks the capaity to make the decision at this time?",
    "Does the patient understand the information relevant to the decision including the likely consequances...?",
    "Can the patient retain that information?",
    "Can the patient use or weigh that information as part of the process of making the decision?",
    "Can the patient communicate that decision by any means?",
    "proposedCarePatientBestInterest",
    "Service Outcome"
]

# Short codes (Q1, Q2, ...) used on node labels in place of the full wording.
question_labels = {q: f"Q{i}" for i, q in enumerate(questions, start=1)}

# Colour per answer. The missing-answer key must match the fill value used when
# cleaning ("NotRecorded"); previously it was spelled "Not recorded", so those
# nodes silently fell back to the default colour.
answer_colors = {"Yes": "lightgreen", "No": "lightcoral", "NotRecorded": "lightgray"}

# Other spellings of "missing" that were coloured gray before; they keep that
# colour without adding extra entries to answer_colors.
missing_aliases = {"Not recorded", "NaN"}

# Create nodes: one per (question, observed answer) pair.
node_labels = []  # text shown on each node
node_map = {}     # (question, answer) -> index into node_labels / colors
colors = []       # node colours, parallel to node_labels

for q in questions:
    for ans in df[q].unique():
        node_map[(q, ans)] = len(node_labels)
        # NOTE(review): Plotly appears to render "\n" in labels as a space;
        # switch to "<br>" if a real line break is wanted -- confirm first.
        node_labels.append(f"{question_labels[q]}\n{ans}")
        color_key = "NotRecorded" if ans in missing_aliases else ans
        # Answers outside the Yes / No / missing set get a neutral colour.
        colors.append(answer_colors.get(color_key, "lightblue"))

# Create edges: one link per observed (answer, next answer) pair between each
# pair of consecutive questions, weighted by the number of records on that path.
sources = []      # node index each link leaves from
targets = []      # node index each link arrives at
values = []       # record count carried by each link
edge_labels = []  # text summary of each link

for q_from, q_to in zip(questions, questions[1:]):
    # size() gives the row count per answer pair, indexed by (from, to);
    # iterating the Series directly avoids a row-wise iterrows() pass.
    pair_counts = df.groupby([q_from, q_to]).size()
    for (ans_from, ans_to), n_records in pair_counts.items():
        sources.append(node_map[(q_from, ans_from)])
        targets.append(node_map[(q_to, ans_to)])
        values.append(int(n_records))
        edge_labels.append(f"{ans_from} → {ans_to}: {n_records}")

# Build Sankey diagram
sankey_nodes = dict(
    label=node_labels,
    color=colors,
    pad=15,
    thickness=30,
    line=dict(color='black', width=1),
)
sankey_links = dict(
    source=sources,
    target=targets,
    value=values,
    # each link takes the colour of the node it leaves from
    color=[colors[src] for src in sources],
    hovertemplate='%{value} records<extra></extra>',
)
# Full width; vertically the diagram spans 0.25-1.0, leaving the bottom
# quarter of the plotting area free.
sankey_domain = dict(x=[0, 1], y=[0.25, 1.0])

fig = go.Figure(
    go.Sankey(node=sankey_nodes, link=sankey_links, domain=sankey_domain)
)

# Add legend for answer colors: each trace has x/y of [None], so it plots no
# points and exists only to contribute a coloured legend entry.
for answer_name, swatch_color in answer_colors.items():
    legend_entry = go.Scatter(
        x=[None],
        y=[None],
        mode='markers',
        marker={"size": 15, "color": swatch_color},
        name=answer_name,
        legendgroup=answer_name,
        showlegend=True,
    )
    fig.add_trace(legend_entry)

# Question key: one "Qn = full question text" line per question.
annotation_text = "<br>".join(f"{question_labels[q]} = {q}" for q in questions)

question_key = dict(
    text=annotation_text,
    x=0.5,
    y=-0.01,  # paper coordinates; the Sankey domain itself starts at y=0.25
    showarrow=False,
    xref="paper",
    yref="paper",
    align="left",
    bordercolor="black",
    borderwidth=1,
    borderpad=10,
    bgcolor="white",
)

# Horizontal legend centred just above the plotting area.
legend_layout = dict(
    orientation="h",
    yanchor="bottom",
    y=1.005,
    xanchor="center",
    x=0.5,
)

fig.update_layout(
    title_text="Patient Flow Through Questions to Service Outcome",
    font_size=12,
    width=1200,
    height=900,
    # presumably hides the axes that the legend-only scatter traces would add
    xaxis=dict(visible=False),
    yaxis=dict(visible=False),
    annotations=[question_key],
    margin=dict(l=50, r=50, t=100, b=50),
    legend=legend_layout,
)

fig.show()
