In [None]:
# ------------------------------------------------------------------
# Step 0 – Purpose
# ------------------------------------------------------------------
# Recursively inspect all files in a given folder (including subfolders) to:
# - Identify file types
# - Attempt to load them with pandas / geopandas
# - Report relative path, file type, size, column count, and column names

In [5]:
# ------------------------------------------------------------------
# Step 0 – Imports
# ------------------------------------------------------------------
import os
import pandas as pd
import geopandas as gpd
import fiona

In [15]:
# ------------------------------------------------------------------
# Step 1 – Define Root Data Directory
# ------------------------------------------------------------------
# Root folder that the scan walks. Set the DIAGNOSTIC_RAW_DATA_DIR environment
# variable to point the notebook at another location without editing this
# cell; when it is unset, the original machine-specific path is used.
data_dir = os.environ.get(
    "DIAGNOSTIC_RAW_DATA_DIR",
    "/Users/rosstaylor/Downloads/Research Project/Code Folder/diagnostic-modality-demand/diagnostic-modality-demand/data/raw",
)


In [16]:
# ------------------------------------------------------------------
# Step 2 – Inspection Function (Handles .csv, .xlsx, .xls, .parquet, .shp, .gpkg)
# ------------------------------------------------------------------
def inspect_file(filepath, nrows=5):
    """Load a small sample of a tabular or spatial file and report its columns.

    The reader is chosen from the file extension (compared case-insensitively).
    Any exception raised while reading is caught and returned as text so that
    one unreadable file does not abort a scan over many files.

    Parameters
    ----------
    filepath : str
        Path of the file to inspect.
    nrows : int, default 5
        Maximum number of rows to load for .csv, .xlsx/.xls, .shp and .gpkg
        files. Only the column names are needed, so a few rows are enough.
        Not applied to .parquet files.

    Returns
    -------
    tuple
        ``(columns, n_columns, error)``. On success ``columns`` is the list of
        column labels, ``n_columns`` their count and ``error`` is ``None``.
        On failure or for an unsupported extension the first two items are
        ``None`` and ``error`` is a non-empty message.
    """
    filetype = os.path.splitext(filepath)[-1].lower()
    try:
        if filetype == '.csv':
            df = pd.read_csv(filepath, nrows=nrows)
        elif filetype in ('.xlsx', '.xls'):
            df = pd.read_excel(filepath, nrows=nrows)
        elif filetype == '.parquet':
            # read_parquet takes no row-limit argument, so the file is read in full.
            df = pd.read_parquet(filepath)
        elif filetype == '.shp':
            # rows= limits how many features are read instead of loading every geometry.
            df = gpd.read_file(filepath, rows=nrows)
        elif filetype == '.gpkg':
            # Only the first layer listed in the GeoPackage is inspected.
            layers = fiona.listlayers(filepath)
            df = gpd.read_file(filepath, layer=layers[0], rows=nrows)
        else:
            return None, None, f"Unsupported type: {filetype}"
        return list(df.columns), df.shape[1], None
    except Exception as e:
        # Prefix the exception type: str(e) alone can be empty, which would
        # make a failed read look like a success in the summary table.
        return None, None, f"{type(e).__name__}: {e}"

In [17]:
# ------------------------------------------------------------------
# Step 3 – Recursively Scan and Summarise Files
# ------------------------------------------------------------------
# One record (dict) per supported file found anywhere under data_dir.
summary = []
supported_types = (".csv", ".xlsx", ".xls", ".parquet", ".shp", ".gpkg")

for folder, _subfolders, names in os.walk(data_dir):
    for name in names:
        ext = os.path.splitext(name)[-1].lower()
        if ext not in supported_types:
            # Files with other extensions are left out of the summary.
            continue

        path = os.path.join(folder, name)
        cols, n_cols, err = inspect_file(path)
        summary.append({
            "relative_path": os.path.relpath(path, data_dir),
            "filename": name,
            "filetype": ext,
            # Bytes converted to mebibytes, rounded to two decimals.
            "size_MB": round(os.path.getsize(path) / (1024 * 1024), 2),
            "n_columns": n_cols,
            "columns": cols,
            "error": err,
        })


In [18]:
# ------------------------------------------------------------------
# Step 4 – Convert to Summary DataFrame and Display (No ace_tools)
# ------------------------------------------------------------------
# Name the columns explicitly (same order as the keys of each record in
# `summary`) so that an empty scan still yields a frame containing
# "relative_path"; without this, sort_values raises KeyError when no
# supported files were found.
summary_columns = [
    "relative_path",
    "filename",
    "filetype",
    "size_MB",
    "n_columns",
    "columns",
    "error",
]
summary_df = pd.DataFrame(summary, columns=summary_columns)
summary_df = summary_df.sort_values(by="relative_path").reset_index(drop=True)

# Display basic preview (the per-file column lists are left out to keep rows short)
print("\nSummary of Files in /data/raw (including subfolders):")
print(summary_df[["relative_path", "filetype", "size_MB", "n_columns", "error"]].to_string(index=False))

# Optional: save to CSV for later inspection (written to the current working directory)
summary_df.to_csv("raw_file_summary.csv", index=False)



Summary of Files in /data/raw (including subfolders):
                                                     relative_path filetype  size_MB  n_columns error
                                   LSOA_5-year_segment_master.gpkg    .gpkg    68.66         32  None
                              LSOA_continuous_age_female_2024.gpkg    .gpkg    70.80        108  None
                                LSOA_continuous_age_male_2024.gpkg    .gpkg    70.77        108  None
                                   LSOA_continuous_age_master.gpkg    .gpkg    70.82        108  None
                                                  LSOA_to_LSOA.csv     .csv   647.93          5  None
                                                     ct_master.csv     .csv   300.83         30  None
                                                   endo_master.csv     .csv     3.29         30  None
    health_infra/NHS_SW_ Community_Diagnostic_Centres_enriched.csv     .csv     0.01         19  None
    health_infra/NHS_SW_ GP

In [19]:
# ------------------------------------------------------------------
# Step 5 – Display Column Names Per File
# ------------------------------------------------------------------
print("\nColumn Names by File:")

# Walk the summary one file at a time: show the read error if there was one,
# otherwise list every column name on its own line.
for entry in summary_df.itertuples(index=False):
    print(f"\n{entry.relative_path}")

    if entry.error:
        print(f"   Error reading file: {entry.error}")
        continue

    if entry.columns is None:
        print("   No column info available.")
        continue

    for col in entry.columns:
        print(f"   - {col}")



Column Names by File:

LSOA_5-year_segment_master.gpkg
   - ICB23NM
   - ladnm
   - ladcd
   - msoa21nm
   - msoa21cd
   - lsoa21nmw
   - lsoa21cd
   - total_population
   - age_0_4
   - age_5_9
   - age_10_14
   - age_15_19
   - age_20_24
   - age_25_29
   - age_30_34
   - age_35_39
   - age_40_44
   - age_45_49
   - age_50_54
   - age_55_59
   - age_60_64
   - age_65_69
   - age_70_74
   - age_75_79
   - age_80_84
   - age_85_plus
   - lat
   - long
   - bng_e
   - bng_n
   - lsoa21nm
   - geometry

LSOA_continuous_age_female_2024.gpkg
   - lsoa21cd
   - lsoa21nm_x
   - lsoa21nmw
   - bng_e
   - bng_n
   - lat
   - long
   - shape__are
   - shape__len
   - globalid
   - lsoa21nm_y
   - msoa21cd
   - msoa21nm
   - ladcd
   - ladnm
   - lsoa_code
   - lsoa_name
   - 0
   - 1
   - 2
   - 3
   - 4
   - 5
   - 6
   - 7
   - 8
   - 9
   - 10
   - 11
   - 12
   - 13
   - 14
   - 15
   - 16
   - 17
   - 18
   - 19
   - 20
   - 21
   - 22
   - 23
   - 24
   - 25
   - 26
   - 27
   - 28
   - 