In [16]:
# ---------------------------------------------------
# 0 – Imports & Raw-Folder Ingestion
# ---------------------------------------------------

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import glob
import re
from IPython.display import display

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)

# --------------------------------------------------------------------------
# ==> IMPORTANT: Point this to the directory containing your CSV files <==
# --------------------------------------------------------------------------
# The path below is a placeholder from your script.
# You MUST update it to the correct location on your computer.
raw_dir   = "/Users/rosstaylor/Downloads/Research Project/Code Folder/nhs-diagnostics-dids-eda/nhs-dids-explorer/data/raw/2024 NHS SW ICBs"

# If you don't want to use a folder, you can hard-code the list of file paths:
# csv_files = ["path/to/your/file1.csv", "path/to/your/file2.csv"]

try:
    csv_files = glob.glob(os.path.join(raw_dir, "*.csv"))
    if not csv_files:
        raise FileNotFoundError("csv_files list is empty – check the 'raw_dir' path.")
except FileNotFoundError as e:
    print(e)
    # As a fallback for demonstration, create a dummy dataframe
    print("Creating a sample DataFrame for demonstration purposes.")
    df = pd.DataFrame({
        'age': np.random.randint(0, 100, 5000),
        'modality': np.random.choice([
            'Computerized axial tomography (procedure)', 'Magnetic resonance imaging (procedure)', 'Endoscopy (procedure)',
            'Plain radiography (procedure)', 'Diagnostic ultrasonography (procedure)', 'Fluoroscopy (procedure)'
        ], size=5000),
        'patient_source': np.random.choice(['accident & emergency department', 'gp direct access service', 'inpatient', 'outpatient', 'elective admission'], size=5000)
    })
else:
    print(f"Detected {len(csv_files)} files to load.")

    # ----------- Canonical 23-column schema from the SQL query ---------------
    expected_cols = [
        'icb_code','icb_name','lsoa_code','nhs_region',
        'site_code','site_name','provider_code','provider_name',
        'activity_month','financial_year','financial_month','test_date',
        'age','sex','modality','sub_modality','procedure_name',
        'referral_type','patient_source','cancer_flag','subcancer_flag',
        'referring_org_code','referring_org_name'
    ]

    dfs, meta = [], []
    for fp in csv_files:
        try:
            peek = pd.read_csv(fp, nrows=5)
            if not set(expected_cols).issubset(peek.columns):
                print(f"'{os.path.basename(fp)}' – no header found, reloading with specified names.")
                df_tmp = pd.read_csv(fp, header=None, names=expected_cols, low_memory=False)
            else:
                df_tmp = pd.read_csv(fp, low_memory=False)

            df_tmp = df_tmp.dropna(axis=1, how='all')
            df_tmp.columns = df_tmp.columns.str.strip().str.lower()
            
            # Align to expected schema
            df_tmp = df_tmp[[c for c in expected_cols if c in df_tmp.columns]]
            for col in (set(expected_cols) - set(df_tmp.columns)):
                df_tmp[col] = pd.NA
            df_tmp = df_tmp[expected_cols]
            dfs.append(df_tmp)

            meta.append({
                "file": os.path.basename(fp),
                "rows": len(df_tmp),
                "cols": df_tmp.shape[1],
                "MB": round(df_tmp.memory_usage(deep=True).sum()/1e6, 2)
            })
        except Exception as e:
            print(f"Error loading file {os.path.basename(fp)}: {e}")


    meta_df = pd.DataFrame(meta)
    display(meta_df.style.set_caption("Loaded files – rows / cols / size"))

    if dfs:
        df = pd.concat(dfs, ignore_index=True)
        print(f"\nCombined shape: {df.shape}")
    else:
        print("\nNo dataframes were loaded. Halting script.")
        # Create a dummy df to prevent further errors
        df = pd.DataFrame(columns=expected_cols)

Detected 7 files to load.
'2024_NHS_SW_Somerset_ICB_11X.csv' – no header found, reloading with specified names.
'2024_NHS_SW_Cornwall_ICB_11N.csv' – no header found, reloading with specified names.
'2024_NHS_SW_Gloucestershire_ICB_11M.csv' – no header found, reloading with specified names.
'2024_NHS_SW_Dorset_ICB_11J.csv' – no header found, reloading with specified names.
'2024_NHS_SW_Devon_ICB_15N.csv' – no header found, reloading with specified names.
'2024_NHS_SW_BSW_ICB_92G.csv' – no header found, reloading with specified names.
'2024_NHS_SW_BNSSG_ICB_15C.csv' – no header found, reloading with specified names.


Unnamed: 0,file,rows,cols,MB
0,2024_NHS_SW_Somerset_ICB_11X.csv,481843,23,695.27
1,2024_NHS_SW_Cornwall_ICB_11N.csv,512857,23,748.35
2,2024_NHS_SW_Gloucestershire_ICB_11M.csv,229186,23,335.49
3,2024_NHS_SW_Dorset_ICB_11J.csv,525091,23,762.79
4,2024_NHS_SW_Devon_ICB_15N.csv,676563,23,991.61
5,2024_NHS_SW_BSW_ICB_92G.csv,741719,23,1106.54
6,2024_NHS_SW_BNSSG_ICB_15C.csv,821993,23,1240.98



Combined shape: (3989252, 23)


In [17]:
# 1 – Schema checks & quick profile
print("Initial dataframe shape:", df.shape)
print("\nColumn summary:")
print(df.dtypes.value_counts())

# confirm the critical columns exist
must_have = ["age", "modality", "patient_source"]
missing   = [c for c in must_have if c not in df.columns]
if missing:
    raise KeyError(f"Missing expected column(s): {missing}")

# peek at the first five rows
display(df.head())


Initial dataframe shape: (3989252, 23)

Column summary:
object     20
int64       2
float64     1
Name: count, dtype: int64


Unnamed: 0,icb_code,icb_name,lsoa_code,nhs_region,site_code,site_name,provider_code,provider_name,activity_month,financial_year,financial_month,test_date,age,sex,modality,sub_modality,procedure_name,referral_type,patient_source,cancer_flag,subcancer_flag,referring_org_code,referring_org_name
0,11X,NHS SOMERSET ICB - 11X,E01029163,Abdominal structure (body structure),RH504,GREENFIELDS DAY CENTRE (RH504),RH5,SOMERSET NHS FOUNDATION TRUST,202405,2024/25,202502,2024-05-11 00:00:00.0000000,56.0,Male,Diagnostic ultrasonography (procedure),,Ultrasound scan of upper abdomen (procedure),GP,GP Direct Access,Suspected Ovarian Cancer,US Upper abdomen,RH5,SOMERSET NHS FOUNDATION TRUST (RH5)
1,11X,NHS SOMERSET ICB - 11X,E01029116,Limb structure (body structure),RA7C2,WESTON GENERAL HOSPITAL (RA7C2),RA7,UNIVERSITY HOSPITALS BRISTOL AND WESTON NHS FO...,202405,2024/25,202502,2024-05-13 00:00:00.0000000,72.0,Female,Plain radiography (procedure),,X-ray of left knee (procedure) (427019001),Consultant,Admitted Patient Care - Inpatient (this Health...,,,RA7,UNIVERSITY HOSPITALS BRISTOL AND WESTON NHS FO...
2,11X,NHS SOMERSET ICB - 11X,E01029276,Thoracic structure (body structure),RH5A8,MUSGROVE PARK HOSPITAL (RH5A8),RH5,SOMERSET NHS FOUNDATION TRUST,202402,2023/24,202411,2024-02-27 00:00:00.0000000,65.0,Male,Plain radiography (procedure),,Plain chest X-ray (procedure) (399208008),GP,GP Direct Access,Suspected Lung Cancer - Chest X-ray,XR Chest,L85023,ST JAMES (L85023)
3,11X,NHS SOMERSET ICB - 11X,E01029052,Limb structure (body structure),RA7C2,WESTON GENERAL HOSPITAL (RA7C2),RA7,UNIVERSITY HOSPITALS BRISTOL AND WESTON NHS FO...,202404,2024/25,202501,2024-04-21 00:00:00.0000000,73.0,Male,Plain radiography (procedure),,Radiography of elbow (procedure) (76913009),Consultant,Accident and Emergency Department (this Health...,,,RA7,UNIVERSITY HOSPITALS BRISTOL AND WESTON NHS FO...
4,11X,NHS SOMERSET ICB - 11X,E01029231,,RH504,GREENFIELDS DAY CENTRE (RH504),RH5,SOMERSET NHS FOUNDATION TRUST,202412,2024/25,202509,2024-12-05 00:00:00.0000000,73.0,Female,,,Plain X-ray of right hip (procedure),GP,GP Direct Access,,,RH5,SOMERSET NHS FOUNDATION TRUST (RH5)


In [33]:
# 2 – Apply ONS-style age bands (matching predefined 'bands' structure)
import numpy as np

# Define cut points and labels
age_bins = [0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,np.inf]
age_labels = [
    "0-4","5-9","10-14","15-19","20-24","25-29","30-34","35-39",
    "40-44","45-49","50-54","55-59","60-64","65-69","70-74",
    "75-79","80-84","85+"
]

# Ensure age is clean
df["age"] = df["age"].astype("Int64")

# Apply banding
df["age_band"] = pd.cut(df["age"], bins=age_bins, labels=age_labels, right=True)

# Confirm result
print("\nAfter applying ONS-style age bands:")
print(df[["age", "age_band"]].head())



After applying ONS-style age bands:
   age age_band
0   56    55-59
1   72    70-74
2   65    60-64
3   73    70-74
4   73    70-74


In [34]:
# 3 – Modality mapping
MODALITY_PATTERNS = {
    "computerized axial tomography": "CT",
    "magnetic resonance imaging":    "MRI",
    "endoscopy":                     "Endoscopy",
}

def map_mod(text):
    text = str(text).lower()
    for pat, short in MODALITY_PATTERNS.items():
        if pat in text:
            return short
    return np.nan            # mark other modalities as NaN

df["modality_clean"] = df["modality"].apply(map_mod)

print("\nModality distribution:")
display(df["modality_clean"].value_counts(dropna=False))



Modality distribution:


modality_clean
NaN          3009709
CT            647476
MRI           324893
Endoscopy       7174
Name: count, dtype: int64

In [35]:
# 4 – Patient-source bucket
PAT_SRC_PATTERNS = {
    r"acc?ident.*emerg|^emergency.*|999|ambulance": "Emergency/A&E",
    r"gp .*access|gp direct|elective|outpatient":   "Elective/GP",
    r"inpatient":                                   "Inpatient"
}

def bucket_ps(text):
    text = str(text).lower()
    for pat, lab in PAT_SRC_PATTERNS.items():
        if re.search(pat, text):
            return lab
    return "Other"

df["ps_bucket"] = df["patient_source"].apply(bucket_ps)

print("\nCross-tab – modality_clean × ps_bucket")
display(pd.crosstab(df["modality_clean"], df["ps_bucket"]))



Cross-tab – modality_clean × ps_bucket


ps_bucket,Elective/GP,Emergency/A&E,Inpatient,Other
modality_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CT,315392,186058,139260,6766
Endoscopy,1069,23,2730,3352
MRI,264362,10952,43532,6047


In [36]:
# 5 – Boolean flags
df["cancer_flag"] = df["cancer_flag"].fillna(False).astype(bool)
df["is_ct"]       = (df["modality_clean"] == "CT")
df["is_mri"]      = (df["modality_clean"] == "MRI")
df["is_endo"]     = (df["modality_clean"] == "Endoscopy")

print("\nHead with new flags:")
display(df[["modality_clean", "cancer_flag", "ps_bucket"]].head())



Head with new flags:


Unnamed: 0,modality_clean,cancer_flag,ps_bucket
0,,True,Elective/GP
1,,False,Inpatient
2,,True,Elective/GP
3,,False,Emergency/A&E
4,,False,Elective/GP


In [37]:
# 6 – Split by modality
ct_df   = df[df["is_ct"]].copy()
mri_df  = df[df["is_mri"]].copy()
endo_df = df[df["is_endo"]].copy()

for name, d in [("CT", ct_df), ("MRI", mri_df), ("Endoscopy", endo_df)]:
    print(f"{name:9s}  rows: {d.shape[0]:,}")
    display(d.head(3))


CT         rows: 647,476


Unnamed: 0,icb_code,icb_name,lsoa_code,nhs_region,site_code,site_name,provider_code,provider_name,activity_month,financial_year,financial_month,test_date,age,sex,modality,sub_modality,procedure_name,referral_type,patient_source,cancer_flag,subcancer_flag,referring_org_code,referring_org_name,age_band,age_mid,modality_clean,ps_bucket,is_ct,is_mri,is_endo
15,11X,NHS SOMERSET ICB - 11X,E01029112,Head structure (body structure),RH5A8,MUSGROVE PARK HOSPITAL (RH5A8),RH5,SOMERSET NHS FOUNDATION TRUST,202403,2023/24,202412,2024-03-15 00:00:00.0000000,41,Male,Computerized axial tomography (procedure),,Computed tomography of entire head (procedure)...,Consultant,Accident and Emergency Department (this Health...,False,,RH5,SOMERSET NHS FOUNDATION TRUST (RH5),40-44,,CT,Emergency/A&E,True,False,False
16,11X,NHS SOMERSET ICB - 11X,E01029135,Head structure (body structure),RH5A8,MUSGROVE PARK HOSPITAL (RH5A8),RH5,SOMERSET NHS FOUNDATION TRUST,202411,2024/25,202508,2024-11-28 00:00:00.0000000,44,Female,Computerized axial tomography (procedure),,Computerized axial tomography of brain with ra...,Consultant,Admitted Patient Care - Inpatient (this Health...,False,,RH5,SOMERSET NHS FOUNDATION TRUST (RH5),40-44,,CT,Inpatient,True,False,False
17,11X,NHS SOMERSET ICB - 11X,E01029237,Head structure (body structure),RH504,GREENFIELDS DAY CENTRE (RH504),RH5,SOMERSET NHS FOUNDATION TRUST,202409,2024/25,202506,2024-09-27 00:00:00.0000000,52,Male,Computerized axial tomography (procedure),,Computed tomography of entire head (procedure),Consultant,Accident and Emergency Department (this Health...,False,,RH5,SOMERSET NHS FOUNDATION TRUST (RH5),50-54,,CT,Emergency/A&E,True,False,False


MRI        rows: 324,893


Unnamed: 0,icb_code,icb_name,lsoa_code,nhs_region,site_code,site_name,provider_code,provider_name,activity_month,financial_year,financial_month,test_date,age,sex,modality,sub_modality,procedure_name,referral_type,patient_source,cancer_flag,subcancer_flag,referring_org_code,referring_org_name,age_band,age_mid,modality_clean,ps_bucket,is_ct,is_mri,is_endo
8,11X,NHS SOMERSET ICB - 11X,E01029036,,NTPH1,PRACTICE PLUS GROUP HOSPITAL - SHEPTON MALLET ...,NTP,PRACTICE PLUS GROUP HOLDINGS,202410,2024/25,202507,2024-10-09 00:00:00.0000000,58,Male,Magnetic resonance imaging (procedure),,Magnetic resonance imaging of lumbar spine (pr...,Not known,GP Direct Access,False,,X99999,REFERRING ORGANISATION CODE NOT KNOWN (X99999),55-59,,MRI,Elective/GP,False,True,False
29,11X,NHS SOMERSET ICB - 11X,E01014470,Head structure (body structure),RH504,GREENFIELDS DAY CENTRE (RH504),RH5,SOMERSET NHS FOUNDATION TRUST,202403,2023/24,202412,2024-03-26 00:00:00.0000000,29,Female,Magnetic resonance imaging (procedure),,Magnetic resonance imaging of head (procedure),Consultant,Outpatient (this Health Care Provider),True,MRI Head,RH5,SOMERSET NHS FOUNDATION TRUST (RH5),25-29,,MRI,Elective/GP,False,True,False
43,11X,NHS SOMERSET ICB - 11X,E01032629,Limb structure (body structure),RH5A8,MUSGROVE PARK HOSPITAL (RH5A8),RH5,SOMERSET NHS FOUNDATION TRUST,202404,2024/25,202501,2024-04-22 00:00:00.0000000,37,Female,Magnetic resonance imaging (procedure),,Magnetic resonance imaging of lumbar and sacra...,Consultant,Accident and Emergency Department (this Health...,False,,RH5,SOMERSET NHS FOUNDATION TRUST (RH5),35-39,,MRI,Emergency/A&E,False,True,False


Endoscopy  rows: 7,174


Unnamed: 0,icb_code,icb_name,lsoa_code,nhs_region,site_code,site_name,provider_code,provider_name,activity_month,financial_year,financial_month,test_date,age,sex,modality,sub_modality,procedure_name,referral_type,patient_source,cancer_flag,subcancer_flag,referring_org_code,referring_org_name,age_band,age_mid,modality_clean,ps_bucket,is_ct,is_mri,is_endo
1463,11X,NHS SOMERSET ICB - 11X,E01029316,Abdominal structure (body structure),RH5A8,MUSGROVE PARK HOSPITAL (RH5A8),RH5,SOMERSET NHS FOUNDATION TRUST,202401,2023/24,202410,2024-01-23 00:00:00.0000000,60,Male,Endoscopy (procedure),,Endoscopic retrograde cholangiopancreatography...,Consultant,Admitted Patient Care - Inpatient (this Health...,False,,RH5,SOMERSET NHS FOUNDATION TRUST (RH5),55-59,,Endoscopy,Inpatient,False,False,True
1808,11X,NHS SOMERSET ICB - 11X,E01029134,Abdominal structure (body structure),RA7C2,WESTON GENERAL HOSPITAL (RA7C2),RA7,UNIVERSITY HOSPITALS BRISTOL AND WESTON NHS FO...,202402,2023/24,202411,2024-02-01 00:00:00.0000000,72,Male,Endoscopy (procedure),,Endoscopic retrograde cholangiopancreatography...,Consultant,Admitted Patient Care - Inpatient (this Health...,False,,RA7,UNIVERSITY HOSPITALS BRISTOL AND WESTON NHS FO...,70-74,,Endoscopy,Inpatient,False,False,True
2855,11X,NHS SOMERSET ICB - 11X,E01029331,Abdominal structure (body structure),RH5A8,MUSGROVE PARK HOSPITAL (RH5A8),RH5,SOMERSET NHS FOUNDATION TRUST,202408,2024/25,202505,2024-08-23 00:00:00.0000000,51,Male,Endoscopy (procedure),,Endoscopic retrograde cholangiopancreatography...,Consultant,Admitted Patient Care - Inpatient (this Health...,False,,RH5,SOMERSET NHS FOUNDATION TRUST (RH5),50-54,,Endoscopy,Inpatient,False,False,True


In [38]:
# 7 – Persist to disk (CSV version, saving to processed/)
SAVE_FILES = True
if SAVE_FILES:
    out_dir = "/Users/rosstaylor/Downloads/Research Project/Code Folder/nhs-diagnostics-dids-eda/nhs-dids-explorer/data/processed"
    os.makedirs(out_dir, exist_ok=True)
    
    ct_df.to_csv(os.path.join(out_dir, "ct_master.csv"), index=False)
    mri_df.to_csv(os.path.join(out_dir, "mri_master.csv"), index=False)
    endo_df.to_csv(os.path.join(out_dir, "endo_master.csv"), index=False)
    
    print("CSV files saved in:", out_dir)


CSV files saved in: /Users/rosstaylor/Downloads/Research Project/Code Folder/nhs-diagnostics-dids-eda/nhs-dids-explorer/data/processed
