In [1]:
# check_missing_lsoa.py

In [2]:
import pandas as pd

lsoa_imd = set(pd.read_csv("imd_london.csv")["LSOA_Code"])
lsoa_static = set(pd.read_csv("feature_static_london.csv")["LSOA_Code"])
lsoa_crime = set(pd.read_csv("crime_rolling6m.csv")["LSOA_Code"])  # or monthly_wide

print("IMD LSOA:", len(lsoa_imd))
print("Static LSOA:", len(lsoa_static))
print("Crime LSOA:", len(lsoa_crime))

print("Missing in static (vs IMD):", len(lsoa_imd - lsoa_static))
print("Missing in crime  (vs IMD):", len(lsoa_imd - lsoa_crime))

# list a few codes to inspect
print("Examples missing in static:", list(lsoa_imd - lsoa_static)[:10])
print("Examples missing in crime :", list(lsoa_imd - lsoa_crime)[:10])

IMD LSOA: 4835
Static LSOA: 4659
Crime LSOA: 4653
Missing in static (vs IMD): 176
Missing in crime  (vs IMD): 182
Examples missing in static: ['E01004689', 'E01000940', 'E01003475', 'E01033085', 'E01032778', 'E01032766', 'E01001874', 'E01033725', 'E01002406', 'E01033585']
Examples missing in crime : ['E01004689', 'E01000940', 'E01003475', 'E01033085', 'E01032778', 'E01032766', 'E01001874', 'E01033725', 'E01002406', 'E01033585']


In [3]:
import pandas as pd

lsoa_imd = set(pd.read_csv("imd_london.csv", dtype=str)["LSOA_Code"])
lsoa_static = set(pd.read_csv("feature_static_london.csv", dtype=str)["LSOA_Code"])
lsoa_crime = set(pd.read_csv("crime_rolling6m.csv", dtype=str)["LSOA_Code"])  # or monthly

# Map LSOA -> Borough using IMD
imd_raw = pd.read_csv("IMD2019.csv", dtype=str)
imd_map = imd_raw.rename(columns={
    "LSOA code (2011)": "LSOA_Code",
    "Local Authority District name (2019)": "LAD_Name"
})[["LSOA_Code","LAD_Name"]]

missing_static = imd_map[imd_map["LSOA_Code"].isin(lsoa_imd - lsoa_static)]
missing_crime  = imd_map[imd_map["LSOA_Code"].isin(lsoa_imd - lsoa_crime)]

print("Top boroughs with static missing:")
print(missing_static["LAD_Name"].value_counts().head(10))

print("\nTop boroughs with crime missing:")
print(missing_crime["LAD_Name"].value_counts().head(10))


Top boroughs with static missing:
LAD_Name
Tower Hamlets    25
Westminster      13
Greenwich        12
Newham           12
Southwark        12
Camden            9
Hackney           8
Hillingdon        8
Hounslow          7
Brent             6
Name: count, dtype: int64

Top boroughs with crime missing:
LAD_Name
Tower Hamlets           25
Westminster             13
Greenwich               12
Southwark               12
Newham                  12
Camden                   9
Hackney                  8
Hillingdon               8
Hounslow                 7
Barking and Dagenham     6
Name: count, dtype: int64


In [4]:
import pandas as pd
import numpy as np

# read the wide original once more and DO NOT drop zeros when melting
wide = pd.read_csv("MPS LSOA Level Crime_2010.4.1_2023.3.31.csv", low_memory=False)
wide.columns = wide.columns.str.strip().str.replace(" ","_")

id_cols = ["LSOA_Code","LSOA_Name","Borough","Major_Category","Minor_Category"]
month_cols = [c for c in wide.columns if c.isdigit() and len(c)==6]

# melt without dropping zeros
long_all = wide.melt(id_vars=id_cols, value_vars=month_cols,
                     var_name="yyyymm", value_name="crime_cnt")
long_all["date"] = pd.to_datetime(long_all["yyyymm"]+"01", format="%Y%m%d")
long_all["crime_cnt"] = long_all["crime_cnt"].fillna(0).astype("int32")

# keep only London LSOA (lookup you already have)
london = set(pd.read_csv("london_lsoa_lookup.csv", dtype=str)["LSOA_Code"])
long_all = long_all[long_all["LSOA_Code"].isin(london)]

# if some LSOA now appear (but were missing before), they were "all-zero" areas
print("LSOA in IMD but absent previously, present now:",
      len((lsoa_imd - lsoa_crime) & set(long_all["LSOA_Code"])))

LSOA in IMD but absent previously, present now: 0
