In [1]:
# build_static_features.py

In [2]:
from pathlib import Path
import pandas as pd

# Input table with one LSOA code column and one local-authority code column.
IMD_CSV = "IMD2019.csv"

# Column headers exactly as they appear in the IMD file.
lsoa_col = "LSOA code (2011)"
lad_col = "Local Authority District code (2019)"

# Step 1: read everything as text so the codes are never coerced to numbers.
imd = pd.read_csv(IMD_CSV, dtype=str)

# Step 2: the London filter is the 33 district codes E09000001 … E09000033.
london_lad = [f"E090000{i:02d}" for i in range(1, 34)]
is_london = imd[lad_col].isin(london_lad)
imd_london = imd.loc[is_london].copy()
print(f"IMD rows for London: {len(imd_london):,}")

# Step 3: write the de-duplicated LSOA codes as a single-column lookup file.
# NOTE(review): these are 2011-vintage LSOA codes (see the column header);
# later cells use this lookup to filter Census 2021 tables, which presumably
# carry 2021-vintage codes — confirm whether a 2011→2021 mapping is needed.
lookup_path = "london_lsoa_lookup.csv"
lookup = imd_london[lsoa_col].drop_duplicates().rename("LSOA_Code").to_frame()
lookup.to_csv(lookup_path, index=False)
print(f"{imd_london[lsoa_col].nunique():,} unique LSOA codes saved ➜ {lookup_path}")

IMD rows for London: 4,835
4,835 unique LSOA codes saved ➜ london_lsoa_lookup.csv


In [3]:
import re
import pandas as pd
from typing import Optional

LOOKUP_CSV = "london_lsoa_lookup.csv"
AGE_CSV = "census2021-ts007a-lsoa-Age by five-year age bands.csv"

# London LSOA codes written by the lookup cell.
london = pd.read_csv(LOOKUP_CSV, dtype=str)["LSOA_Code"]

# Read the age table as text, align the key column name with the lookup,
# and keep only the rows whose code appears in the London lookup.
age = (
    pd.read_csv(AGE_CSV, dtype=str)
    .rename(columns={"geography code": "LSOA_Code"})
    .loc[lambda frame: frame["LSOA_Code"].isin(london)]
    .copy()
)

# 1 | every column whose header starts with "Age: Aged" is an age-band count
age_band_cols = list(filter(lambda name: name.startswith("Age: Aged"), age.columns))

def band_to_group(col: str) -> Optional[str]:
    """Classify an age-band column header as "child", "adult" or "elderly".

    The band's lower bound decides the group: up to 19 is "child",
    20 to 64 is "adult", anything higher is "elderly".

    Args:
        col: Column header containing an age-band label such as
            "Aged 4 years and under", "Aged 25 to 29 years" or
            "Aged 85 years and over".

    Returns:
        The group name, or None when no age band can be found in ``col``.
    """
    # The two open-ended bands carry no "X to Y" range, so their lower
    # bounds are fixed here; the lowest band is checked first.
    open_ended_bands = {
        "Aged 4 years and under": 0,
        "Aged 85 years and over": 85,
    }
    lower_bound = next(
        (start for label, start in open_ended_bands.items() if label in col),
        None,
    )

    if lower_bound is None:
        found = re.search(r"Aged (\d+) to (\d+) years", col)
        if found is None:
            return None
        lower_bound = int(found.group(1))

    if lower_bound > 64:
        return "elderly"
    if lower_bound > 19:
        return "adult"
    return "child"

# Label every age-band column with the group band_to_group assigns to it.
group_map = dict(zip(age_band_cols, map(band_to_group, age_band_cols)))

# Split the columns into three lists, one per group, keeping column order.
child_cols, adult_cols, elderly_cols = (
    [band for band, label in group_map.items() if label == wanted]
    for wanted in ("child", "adult", "elderly")
)

print("Found columns:",
      f"\n  child   {len(child_cols)} → {child_cols[:3]} …",
      f"\n  adult   {len(adult_cols)} → {adult_cols[:3]} …",
      f"\n  elderly {len(elderly_cols)} → {elderly_cols[:3]} …",
)

# 2 | the table was read as text, so cast the band counts and the total to float
numeric_cols = child_cols + adult_cols + elderly_cols
total_col = [name for name in age.columns if name.startswith("Age: Total")][0]
age = age.astype({name: float for name in numeric_cols + [total_col]})

# 3 | each share is the summed band counts of a group divided by the total
for share_name, band_cols in (
    ("pct_child", child_cols),
    ("pct_adult", adult_cols),
    ("pct_elderly", elderly_cols),
):
    age[share_name] = age[band_cols].sum(axis=1) / age[total_col]

# Persist only the key and the three shares.
age_out = "census_age_london.csv"
age_shares = age[["LSOA_Code", "pct_child", "pct_adult", "pct_elderly"]]
age_shares.to_csv(age_out, index=False)
print("✓ age shares saved →", age_out)

Found columns: 
  child   4 → ['Age: Aged 4 years and under', 'Age: Aged 5 to 9 years', 'Age: Aged 10 to 14 years'] … 
  adult   9 → ['Age: Aged 20 to 24 years', 'Age: Aged 25 to 29 years', 'Age: Aged 30 to 34 years'] … 
  elderly 5 → ['Age: Aged 65 to 69 years', 'Age: Aged 70 to 74 years', 'Age: Aged 75 to 79 years'] …
✓ age shares saved → census_age_london.csv


In [4]:
LOOKUP_CSV = "london_lsoa_lookup.csv"
POPD_CSV = "census2021-ts006-lsoa-Population density.csv"

# Short names for the two columns this cell uses.
popd_renames = {
    "geography code": "LSOA_Code",
    "Population Density: Persons per square kilometre; measures: Value": "pop_density",
}

# 1 | London LSOA codes written by the lookup cell
london_lsoa = pd.read_csv(LOOKUP_CSV, dtype=str)["LSOA_Code"]

# 2 | density table, read as text and renamed
popd = pd.read_csv(POPD_CSV, dtype=str).rename(columns=popd_renames)

# 3 | keep rows found in the lookup and cast the density from text to float
popd_london = (
    popd.loc[popd["LSOA_Code"].isin(london_lsoa)]
    .assign(pop_density=lambda frame: frame["pop_density"].astype(float))
)

# 4 | write key + density only
out_file = "census_popdensity_london.csv"
popd_london[["LSOA_Code", "pop_density"]].to_csv(out_file, index=False)

print(f"✓ population density saved → {out_file}  ({len(popd_london):,} rows)")

✓ population density saved → census_popdensity_london.csv  (4,659 rows)


In [5]:
import pandas as pd

LOOKUP_CSV = "london_lsoa_lookup.csv"
SEX_CSV = "census2021-ts008-lsoa-sex.csv"

# Short names for the key column and the three population counts.
sex_renames = {
    "geography code": "LSOA_Code",
    "Sex: All persons; measures: Value": "pop_total",
    "Sex: Male; measures: Value": "pop_male",
    "Sex: Female; measures: Value": "pop_female",
}
count_cols = ["pop_total", "pop_male", "pop_female"]

# 1 | London LSOA codes written by the lookup cell
london = pd.read_csv(LOOKUP_CSV, dtype=str)["LSOA_Code"]

# 2 | sex table, read as text and renamed
sex = pd.read_csv(SEX_CSV, dtype=str).rename(columns=sex_renames)

# 3 | restrict to the lookup's codes, then cast the counts from text to float
sex = sex.loc[sex["LSOA_Code"].isin(london)].copy()
sex[count_cols] = sex[count_cols].astype(float)

# Male share of all persons; only this share is written out.
sex["pct_male"] = sex["pop_male"].div(sex["pop_total"])

# 4 | write key + male share
out_file = "census_sex_london.csv"
sex[["LSOA_Code", "pct_male"]].to_csv(out_file, index=False)

print(f"✓ sex shares saved → {out_file}  ({len(sex):,} rows)")

✓ sex shares saved → census_sex_london.csv  (4,659 rows)


In [6]:
LOOKUP_CSV = "london_lsoa_lookup.csv"
ETHNIC_CSV = "census2021-ts021-lsoa-Ethnic group.csv"

# Short names for the key column, the total, and the five top-level groups.
ethnic_renames = {
    "geography code": "LSOA_Code",
    "Ethnic group: Total: All usual residents": "pop_total",
    "Ethnic group: White": "white",
    "Ethnic group: Black, Black British, Black Welsh, Caribbean or African": "black",
    "Ethnic group: Asian, Asian British or Asian Welsh": "asian",
    "Ethnic group: Mixed or Multiple ethnic groups": "mixed",
    "Ethnic group: Other ethnic group": "other",
}

# 1 | London LSOA codes written by the lookup cell
london = pd.read_csv(LOOKUP_CSV, dtype=str)["LSOA_Code"]

# 2 | ethnic-group table, read as text and renamed
eth = pd.read_csv(ETHNIC_CSV, dtype=str).rename(columns=ethnic_renames)

# 3 | restrict to the lookup's codes, then cast the counts from text to float
eth = eth.loc[eth["LSOA_Code"].isin(london)].copy()
cols = ["pop_total", "white", "black", "asian", "mixed", "other"]
eth[cols] = eth[cols].astype(float)

# 4 | one share column per group: group count divided by total residents
share_cols = []
for group in cols[1:]:
    share_name = f"pct_{group}"
    eth[share_name] = eth[group] / eth["pop_total"]
    share_cols.append(share_name)

# Write key + the five shares, in the same order as the groups above.
out_file = "census_ethnic_london.csv"
eth[["LSOA_Code", *share_cols]].to_csv(out_file, index=False)

print(f"✓ ethnic‑group shares saved → {out_file}  ({len(eth):,} rows)")

✓ ethnic‑group shares saved → census_ethnic_london.csv  (4,659 rows)


In [7]:
IMD_CSV = "IMD2019.csv"

lsoa_col = "LSOA code (2011)"
lad_col  = "Local Authority District code (2019)"

# Output column name → rank column header in the IMD file.
rank_map = {
    "income_rank":     "Income Rank (where 1 is most deprived)",
    "employment_rank": "Employment Rank (where 1 is most deprived)",
    "education_rank":  "Education, Skills and Training Rank (where 1 is most deprived)",
    "health_rank":     "Health Deprivation and Disability Rank (where 1 is most deprived)",
    "barriers_rank":   "Barriers to Housing and Services Rank (where 1 is most deprived)",
    "livingenv_rank":  "Living Environment Rank (where 1 is most deprived)",
}
# Domain prefixes ("income", "employment", …) derived from the keys above.
domains = [name.rsplit("_", 1)[0] for name in rank_map]

# ------------ 1 | load IMD ---------------------
imd_england = pd.read_csv(IMD_CSV, dtype=str)

# The percentile denominator must be the worst rank in the WHOLE file, so it
# is taken BEFORE the London filter and across every rank column. Taking it
# from the London-only income ranks (as this cell used to) understates it, and
# any rank above that value would then map to a negative percentile.
# Unparseable cells are coerced to NaN and ignored by max().
max_rank = int(
    imd_england[list(rank_map.values())]
    .apply(pd.to_numeric, errors="coerce")
    .max()
    .max()
)   # 32 844 when the file holds the full England table

# ------------ 2 | filter London ----------------
london_lad_codes = [f"E090000{str(i).zfill(2)}" for i in range(1, 34)]
imd = imd_england[imd_england[lad_col].isin(london_lad_codes)].copy()

# ------------ 3 | select rank columns ----------
imd_sub = imd[[lsoa_col, *rank_map.values()]].rename(columns={lsoa_col: "LSOA_Code", **{
    v: k for k, v in rank_map.items()
}})

# ranks were read as text; convert to int
imd_sub = imd_sub.astype({c: int for c in rank_map})

# ------------ 4 | rank → percentile ------------
# rank 1 (most deprived) → 1.0, rank max_rank (least deprived) → 0.0
for dom in domains:
    imd_sub[f"{dom}_pct"] = 1 - (imd_sub[f"{dom}_rank"] - 1) / (max_rank - 1)

# keep only percentile columns
keep_cols = ["LSOA_Code"] + [f"{d}_pct" for d in domains]
pct_cols = keep_cols[1:]

# Sanity check: with a file-wide denominator every percentile lies in [0, 1].
assert imd_sub[pct_cols].min().min() >= 0 and imd_sub[pct_cols].max().max() <= 1, \
    "IMD percentiles fall outside [0, 1]; check max_rank"

imd_out = "imd_london.csv"
imd_sub[keep_cols].to_csv(imd_out, index=False)

print(f"✓ IMD features saved → {imd_out}  ({imd_sub.shape[0]:,} rows)")

✓ IMD features saved → imd_london.csv  (4,835 rows)


In [8]:
import pandas as pd

# Per-source feature tables written by the earlier cells, all keyed on LSOA_Code.
feature_files = [
    "census_age_london.csv",
    "census_popdensity_london.csv",
    "census_sex_london.csv",
    "census_ethnic_london.csv",
    "imd_london.csv",
]
feature_frames = [pd.read_csv(path) for path in feature_files]

# Inner-join everything on the LSOA code. validate="one_to_one" makes pandas
# raise a MergeError if any table repeats a code, instead of silently
# multiplying rows in the merged matrix.
static = feature_frames[0]
for frame in feature_frames[1:]:
    static = static.merge(frame, on="LSOA_Code", how="inner", validate="one_to_one")

# An inner join keeps only codes present in EVERY table, so report how many
# codes were lost rather than letting the row count shrink unnoticed.
all_codes = set().union(*(frame["LSOA_Code"] for frame in feature_frames))
n_dropped = len(all_codes) - static["LSOA_Code"].nunique()
if n_dropped:
    print(f"⚠ {n_dropped:,} LSOA codes are missing from at least one input table "
          "and were dropped by the inner joins")

static.to_csv("feature_static_london.csv", index=False)
print("Static feature matrix saved → feature_static_london.csv  ",
      f"({static.shape[0]:,} rows × {static.shape[1]} cols)")

Static feature matrix saved → feature_static_london.csv   (4,659 rows × 17 cols)
