In [None]:
# ------------------------------------------------------------------
# Step 0 – Purpose
# ------------------------------------------------------------------
# Inspect all files in a given folder to:
# - Identify file types
# - Attempt to load them with pandas (or geopandas for spatial files)
# - Print file name, shape, column names, and header preview

In [None]:
# ------------------------------------------------------------------
# Step 1 – Imports
# ------------------------------------------------------------------
import os
import pandas as pd
import geopandas as gpd
import fiona

In [None]:
# ------------------------------------------------------------------
# Step 1b – Define Root Data Directory
# ------------------------------------------------------------------
# Root folder that is scanned recursively for data files.
# Set the REACH_DATA_DIR environment variable to point the notebook at a
# different location (e.g. on another machine); when it is unset or empty,
# the original local path below is used.
data_dir = os.environ.get("REACH_DATA_DIR") or (
    "/Users/rosstaylor/Downloads/Code Repositories/REACH Map (NHS SW)/GitHub Repo/REACH-Map-NHS-SW/data/raw"
)


In [16]:
# ------------------------------------------------------------------
# Step 2 – Inspection Function (Handles .csv, .xlsx, .xls, .parquet, .shp, .gpkg)
# ------------------------------------------------------------------
def inspect_file(filepath):
    """Load a small sample of a data file and report its column layout.

    The loader is chosen from the file extension (case-insensitive):
    .csv, .xlsx, .xls, .parquet, .shp and .gpkg are supported. For a
    GeoPackage only the first layer returned by ``fiona.listlayers`` is
    inspected.

    Parameters
    ----------
    filepath : str
        Path of the file to inspect.

    Returns
    -------
    tuple
        ``(columns, n_columns, error)``. On success ``columns`` is the list
        of column names, ``n_columns`` is the number of columns and ``error``
        is ``None``. For an unsupported extension, or when loading fails,
        ``columns`` and ``n_columns`` are ``None`` and ``error`` is a message
        string. Loading errors are never raised to the caller.
    """
    filetype = os.path.splitext(filepath)[-1].lower()
    try:
        if filetype == '.csv':
            df = pd.read_csv(filepath, nrows=5)
        elif filetype in ('.xlsx', '.xls'):
            df = pd.read_excel(filepath, nrows=5)
        elif filetype == '.parquet':
            # read_parquet takes no nrows argument, so the file is read in full.
            df = pd.read_parquet(filepath)
        elif filetype == '.shp':
            # Only column names are needed: sample a few rows, as for CSV/Excel.
            df = gpd.read_file(filepath, rows=5)
        elif filetype == '.gpkg':
            layers = fiona.listlayers(filepath)
            if not layers:
                # Report this explicitly instead of an opaque IndexError message.
                return None, None, "No layers found in GeoPackage"
            df = gpd.read_file(filepath, layer=layers[0], rows=5)
        else:
            return None, None, f"Unsupported type: {filetype}"
        return list(df.columns), df.shape[1], None
    except Exception as e:
        # Deliberately broad: one unreadable file must not stop a folder scan,
        # so the failure is returned as text for the caller to record.
        return None, None, str(e)

In [17]:
# ------------------------------------------------------------------
# Step 3 – Recursively Scan and Summarise Files
# ------------------------------------------------------------------
# Extensions that inspect_file knows how to open.
SUPPORTED_EXTENSIONS = (".csv", ".xlsx", ".xls", ".parquet", ".shp", ".gpkg")

# One dict per supported file found under data_dir.
summary = []

for root, dirs, files in os.walk(data_dir):
    # os.walk yields entries in arbitrary order; sorting dirs in place (which
    # steers the top-down traversal) and the file names keeps the summary
    # order reproducible between runs.
    dirs.sort()
    for filename in sorted(files):
        filepath = os.path.join(root, filename)
        filetype = os.path.splitext(filename)[-1].lower()

        if filetype not in SUPPORTED_EXTENSIONS:
            continue

        columns, n_columns, error = inspect_file(filepath)

        # A broken symlink or a file removed mid-scan makes getsize raise;
        # record that on the row instead of aborting the whole scan.
        try:
            size_mb = round(os.path.getsize(filepath) / (1024 * 1024), 2)
        except OSError as e:
            size_mb = None
            error = error or str(e)

        file_info = {
            "relative_path": os.path.relpath(filepath, data_dir),
            "filename": filename,
            "filetype": filetype,
            "size_MB": size_mb,
            "n_columns": n_columns,
            "columns": columns,
            "error": error
        }
        summary.append(file_info)
