In [1]:
import pandas as pd
import re

# --- 1. Set file path ---
# Absolute path of the raw CSV extract to read.
file_path = "/Users/rosstaylor/Downloads/Research Project/Code Folder/nhs-diagnostics-dids-eda/nhs-dids-explorer/data/raw/TS007-2021-3-filtered-2025-07-03T07-56-29Z.csv"

# --- 2. Load file ---
# Read the whole CSV into a DataFrame, then print a heading followed by the
# first three rows as a quick sanity check.
df = pd.read_csv(filepath_or_buffer=file_path)
print("Loaded data:", df.head(3), sep="\n")

Loaded data:
  NHS England regions Code NHS England regions  Age (101 categories) Code  \
0                E40000003              London                          0   
1                E40000003              London                          1   
2                E40000003              London                          2   

  Age (101 categories)  Observation  
0    Aged under 1 year       104232  
1          Aged 1 year       107140  
2         Aged 2 years       105690  


In [5]:
# --- 3. Filter for South West region ---
# Keep only rows whose region-name column equals "South West"; .copy() gives
# an independent frame so later column assignments do not write back into df.
df_sw = df.loc[df["NHS England regions"].eq("South West")].copy()
print(f"\nFiltered to South West: {len(df_sw)} rows", df_sw.head(3), sep="\n")


Filtered to South West: 101 rows
    NHS England regions Code NHS England regions  Age (101 categories) Code  \
202                E40000006          South West                          0   
203                E40000006          South West                          1   
204                E40000006          South West                          2   

    Age (101 categories)  Observation  
202    Aged under 1 year        51086  
203          Aged 1 year        53388  
204         Aged 2 years        55491  


In [7]:
# --- 4. Convert age labels to integers ---
def extract_age(age_str):
    """Convert an age-band label into an integer age.

    Parameters
    ----------
    age_str : str
        Label such as "Aged under 1 year", "Aged 7 years" or
        "Aged 100 years and over".

    Returns
    -------
    int or None
        0 when the label contains "under 1", 90 when it contains
        "90 years and over", otherwise the first run of digits in the
        label. None when the label holds no digits or when the value is
        not a string at all (e.g. NaN/None standing in for a missing cell).
    """
    # Non-string values (float NaN, None) cannot be searched with `in` or a
    # regex and would raise TypeError; treat them as "no age available".
    if not isinstance(age_str, str):
        return None

    # The youngest band carries the digit 1 in its label but means age 0,
    # so it has to be recognised before the generic digit search.
    if "under 1" in age_str:
        return 0

    # Open-ended top band, collapsed to its lower bound.
    if "90 years and over" in age_str:
        return 90

    # Generic case: the first number appearing in the label is the age.
    match = re.search(r"(\d+)", age_str)
    return int(match.group(1)) if match else None

# Parse every age label element-wise into the new numeric "age" column.
df_sw["age"] = df_sw["Age (101 categories)"].map(extract_age)

In [9]:
# --- 5. Quick check on age conversion ---
# Print a heading, then the first ten distinct (label, parsed age) pairs so
# the mapping can be checked by eye.
print(
    "\nAge conversion check:",
    df_sw.loc[:, ["Age (101 categories)", "age"]].drop_duplicates().head(10),
    sep="\n",
)



Age conversion check:
    Age (101 categories)  age
202    Aged under 1 year    0
203          Aged 1 year    1
204         Aged 2 years    2
205         Aged 3 years    3
206         Aged 4 years    4
207         Aged 5 years    5
208         Aged 6 years    6
209         Aged 7 years    7
210         Aged 8 years    8
211         Aged 9 years    9


In [13]:
# --- 6. Group by age (long format) ---
# Sum the observations for each parsed age. Naming the resulting Series
# before reset_index() produces the columns "age" and "population" directly.
df_out = (
    df_sw.groupby("age")["Observation"]
    .sum()
    .rename("population")
    .reset_index()
)

# --- 7. Final shape check ---
# Report the (rows, columns) of the aggregated table and preview ten rows.
print(f"\nFinal long-format table shape: {df_out.shape}", df_out.head(10), sep="\n")



Final long-format table shape: (101, 2)
   age  population
0    0       51086
1    1       53388
2    2       55491
3    3       56712
4    4       58224
5    5       60677
6    6       60233
7    7       61218
8    8       64003
9    9       65397


In [17]:
import os

# --- 8. Apply 0.7% annual growth from 2021 to 2024 ---
annual_growth_rate = 0.007
years = 2024 - 2021
# Compound the annual rate over the whole interval.
growth_factor = pow(1 + annual_growth_rate, years)

# Scale each age's 2021 count by the compounded factor, round, and store the
# result as integers in a new column alongside the original.
df_out["population_2024"] = df_out["population"].mul(growth_factor).round().astype(int)

# Define export path
export_path = "/Users/rosstaylor/Downloads/Research Project/Code Folder/nhs-diagnostics-dids-eda/nhs-dids-explorer/data/processed"

# Ensure the directory exists (no error if it is already there)
os.makedirs(export_path, exist_ok=True)

# --- 9. Export original 2021 data ---
# Independent two-column copy of the unscaled counts.
df_2021 = df_out.loc[:, ["age", "population"]].copy()
df_2021.to_csv(
    os.path.join(export_path, "south_west_population_2021.csv"),
    index=False,
)
print("Exported: south_west_population_2021.csv")

# --- 10. Export projected 2024 data ---
# Same layout as the 2021 file: the projected column is renamed to
# "population" so both CSVs share one schema.
df_2024 = df_out.loc[:, ["age", "population_2024"]].rename(
    columns={"population_2024": "population"}
)
df_2024.to_csv(
    os.path.join(export_path, "south_west_population_2024.csv"),
    index=False,
)
print("Exported: south_west_population_2024.csv")


Exported: south_west_population_2021.csv
Exported: south_west_population_2024.csv
