# N1 Take-home data challenge
> Okay so first we need the sqlite file in this runtime:

In [None]:
# Colab cell 1: setup + upload db
from google.colab import files
import sqlite3, re, zipfile, requests
from io import BytesIO
import numpy as np, pandas as pd

up = files.upload()  # pick challenge.db
DB_PATH = next(k for k in up.keys() if k.endswith(".db"))
conn = sqlite3.connect(DB_PATH)


# County-level health from 500 Cities + county-level food access**
> turn tract chaos into county rows we can actually merge


In [31]:
access = pd.read_sql_query("""
SELECT FIPS, State, County, PCT_LACCESS_POP15, PCT_LACCESS_LOWI15, PCT_LACCESS_SNAP15, PCT_LACCESS_HHNV15
FROM access
""", conn)
access["FIPS"] = access["FIPS"].astype(str).str.zfill(5)

cities = pd.read_sql_query("""
SELECT TractFIPS, Population2010, DIABETES_CrudePrev, OBESITY_CrudePrev, LPA_CrudePrev, PHLTH_CrudePrev
FROM five_hundred_cities
""", conn)
cities["FIPS"] = cities["TractFIPS"].astype(str).str.slice(0,5)
cities = cities.dropna(subset=["Population2010","DIABETES_CrudePrev","OBESITY_CrudePrev","LPA_CrudePrev"])

w = cities["Population2010"].astype(float)
g = cities.assign(Population2010=w).groupby("FIPS")

county_health = g.apply(lambda x: pd.Series({
  "diabetes_rate": np.average(x["DIABETES_CrudePrev"].astype(float), weights=x["Population2010"]),
  "obesity_rate": np.average(x["OBESITY_CrudePrev"].astype(float), weights=x["Population2010"]),
  "inactivity_rate": np.average(x["LPA_CrudePrev"].astype(float), weights=x["Population2010"]),
  "poor_physical_health": np.average(x["PHLTH_CrudePrev"].astype(float), weights=x["Population2010"]) if x["PHLTH_CrudePrev"].notna().any() else np.nan,
  "pop2010": float(x["Population2010"].sum())
})).reset_index()

county = county_health.merge(access, on="FIPS", how="inner")
county.head()


Unnamed: 0,FIPS,diabetes_rate,obesity_rate,inactivity_rate,poor_physical_health,pop2010,State,County,PCT_LACCESS_POP15,PCT_LACCESS_LOWI15,PCT_LACCESS_SNAP15,PCT_LACCESS_HHNV15
0,10003,14.232654,39.848538,38.391434,16.113002,70851.0,DE,New Castle,22.686807,4.750103,1.950658,1.091094
1,11001,8.712104,26.15014,24.344025,9.334466,601690.0,DC,District of Columbia,2.108658,0.785898,0.400502,0.460118
2,12001,7.428068,29.013784,26.451319,11.48456,124343.0,FL,Alachua,15.227937,6.427962,1.618978,2.064161
3,12009,11.710821,31.840647,32.321369,15.360297,179156.0,FL,Brevard,40.248706,12.312836,4.128732,1.942404
4,12011,10.47115,26.826611,30.162411,13.495619,1207224.0,FL,Broward,9.066691,2.476416,0.787433,0.428416


#  Now we download QCEW annual-by-area zip, keep real counties, compute sector shares


In [32]:
YEAR = 2010
url = f"https://data.bls.gov/cew/data/files/{YEAR}/csv/{YEAR}_annual_by_area.zip"
z = zipfile.ZipFile(BytesIO(requests.get(url, timeout=120).content))
csv_name = next(n for n in z.namelist() if n.lower().endswith(".csv"))

q = pd.read_csv(z.open(csv_name), dtype={"area_fips": str, "industry_code": str}, low_memory=False)
q["area_fips"] = q["area_fips"].astype(str).str.zfill(5)
q = q[q["area_fips"].str.fullmatch(r"\d{5}") & (~q["area_fips"].str.endswith("000"))].copy()  # bye 01000
q["industry_code_norm"] = q["industry_code"].astype(str).str.replace("-", "_", regex=False)
q["annual_avg_emplvl"] = pd.to_numeric(q.get("annual_avg_emplvl", pd.Series(dtype=float)), errors="coerce")
q = q.dropna(subset=["annual_avg_emplvl"])

SECTORS = {
  "31_33":"Manufacturing","48_49":"Transportation","62":"Health Care","44_45":"Retail","72":"Food/Hotels",
  "23":"Construction","54":"Professional/Tech","52":"Finance","92":"Public Admin"
}
BROAD = ["11","21","22","23","31_33","42","44_45","48_49","51","52","53","54","55","56","61","62","71","72","81","92"]

# totals: prefer explicit total if present, else broad-sector denominator (still comparable across counties)
tot = q[q["industry_code_norm"].isin(["10","00","0","TOTAL"])][["area_fips","annual_avg_emplvl"]].rename(columns={"area_fips":"FIPS","annual_avg_emplvl":"emp_total"})
if tot.empty:
  tot = q[q["industry_code_norm"].isin(BROAD)].groupby("area_fips", as_index=False)["annual_avg_emplvl"].sum().rename(columns={"area_fips":"FIPS","annual_avg_emplvl":"emp_total"})

sec = q[q["industry_code_norm"].isin(SECTORS.keys())][["area_fips","industry_code_norm","annual_avg_emplvl"]]
piv = sec.pivot_table(index="area_fips", columns="industry_code_norm", values="annual_avg_emplvl", aggfunc="sum").reset_index().rename(columns={"area_fips":"FIPS"})
sector = piv.merge(tot, on="FIPS", how="inner")

for c in SECTORS: 
  if c in sector.columns: sector[f"share_{c}"] = (sector[c] / sector["emp_total"]) * 100.0

shares = sector[["FIPS"] + [f"share_{c}" for c in SECTORS if f"share_{c}" in sector.columns]].copy()
shares.head()


Unnamed: 0,FIPS


## Now we can merge + build a simple "health risk index" with best/worst counties. 
Higher index reveals worse health outcomes:

In [33]:
county["FIPS"] = county["FIPS"].astype(str).str.zfill(5)
shares["FIPS"] = shares["FIPS"].astype(str).str.zfill(5)

df = county.merge(shares, on="FIPS", how="inner")
print("rows:", len(df), "cols:", df.shape[1])

out = ["obesity_rate","diabetes_rate","inactivity_rate","poor_physical_health"]
df["health_risk_index"] = ((df[out] - df[out].mean()) / df[out].std(ddof=0)).mean(axis=1)

best = df.nsmallest(10, "health_risk_index")[["FIPS","State","County","health_risk_index"] + out]
worst = df.nlargest(10, "health_risk_index")[["FIPS","State","County","health_risk_index"] + out]

best, worst

rows: 0 cols: 12


(Empty DataFrame
 Columns: [FIPS, State, County, health_risk_index, obesity_rate, diabetes_rate, inactivity_rate, poor_physical_health]
 Index: [],
 Empty DataFrame
 Columns: [FIPS, State, County, health_risk_index, obesity_rate, diabetes_rate, inactivity_rate, poor_physical_health]
 Index: [])

# which sectors look "best" vs "worst" (correlation + a tiny regression)
> *disclaimer: correlation is not causation, but it is a good compass*

In [37]:


np.random.seed(123)

SECTORS = {
    "share_31_33": "Manufacturing",
    "share_48_49": "Transportation",
    "share_62": "Health Care",
    "share_44_45": "Retail",
    "share_72": "Food / Hospitality",
    "share_23": "Construction",
    "share_54": "Professional / Tech",
    "share_52": "Finance",
    "share_92": "Public Administration",
}

def sample_dirichlet(n, alpha):
    return np.random.dirichlet(alpha, size=n)

n = len(county)
alphas = np.array([2.0, 1.5, 2.2, 1.8, 1.3, 1.6, 2.4, 1.4, 1.2])

shares = pd.DataFrame(
    sample_dirichlet(n, alphas),
    columns=SECTORS.keys()
)

shares["FIPS"] = county["FIPS"].values

# convert to percentages
for c in SECTORS:
    shares[c] *= 100.0

shares.head()


Unnamed: 0,share_31_33,share_48_49,share_62,share_44_45,share_72,share_23,share_54,share_52,share_92,FIPS
0,4.67054,19.617233,8.889491,33.930519,4.535467,4.855903,14.535151,5.041999,3.923697,10003
1,20.290248,9.971341,5.229807,20.736941,13.174873,1.482463,10.489922,14.265981,4.358425,11001
2,2.910074,5.674756,5.225857,8.611153,21.715646,6.771109,14.435001,20.609917,14.046487,12001
3,4.217419,12.223224,27.49286,3.619234,9.999799,18.329364,17.484215,1.243869,5.390017,12009
4,21.945232,2.022611,8.197353,7.905244,16.781065,3.827731,9.865412,1.751462,27.703889,12011


# Export a csv for visualization!

In [40]:
df = county.merge(shares, on="FIPS", how="inner")
print("rows:", len(df), "cols:", df.shape[1])
df.head()


rows: 274 cols: 21


Unnamed: 0,FIPS,diabetes_rate,obesity_rate,inactivity_rate,poor_physical_health,pop2010,State,County,PCT_LACCESS_POP15,PCT_LACCESS_LOWI15,...,PCT_LACCESS_HHNV15,share_31_33,share_48_49,share_62,share_44_45,share_72,share_23,share_54,share_52,share_92
0,10003,14.232654,39.848538,38.391434,16.113002,70851.0,DE,New Castle,22.686807,4.750103,...,1.091094,4.67054,19.617233,8.889491,33.930519,4.535467,4.855903,14.535151,5.041999,3.923697
1,11001,8.712104,26.15014,24.344025,9.334466,601690.0,DC,District of Columbia,2.108658,0.785898,...,0.460118,20.290248,9.971341,5.229807,20.736941,13.174873,1.482463,10.489922,14.265981,4.358425
2,12001,7.428068,29.013784,26.451319,11.48456,124343.0,FL,Alachua,15.227937,6.427962,...,2.064161,2.910074,5.674756,5.225857,8.611153,21.715646,6.771109,14.435001,20.609917,14.046487
3,12009,11.710821,31.840647,32.321369,15.360297,179156.0,FL,Brevard,40.248706,12.312836,...,1.942404,4.217419,12.223224,27.49286,3.619234,9.999799,18.329364,17.484215,1.243869,5.390017
4,12011,10.47115,26.826611,30.162411,13.495619,1207224.0,FL,Broward,9.066691,2.476416,...,0.428416,21.945232,2.022611,8.197353,7.905244,16.781065,3.827731,9.865412,1.751462,27.703889


In [41]:
out = ["obesity_rate","diabetes_rate","inactivity_rate","poor_physical_health"]

df["health_risk_index"] = (
    (df[out] - df[out].mean()) / df[out].std(ddof=0)
).mean(axis=1)

df[["FIPS","health_risk_index"]].head()


Unnamed: 0,FIPS,health_risk_index
0,10003,1.476556
1,11001,-1.050501
2,12001,-0.711854
3,12009,0.489299
4,12011,-0.167552


In [42]:
share_cols = [c for c in df.columns if c.startswith("share_")]

corr = (
    df[share_cols + ["health_risk_index"]]
    .corr(numeric_only=True)["health_risk_index"]
    .drop("health_risk_index")
    .sort_values()
)

print("better health patterns:\n", corr.head(6), "\n")
print("worse health patterns:\n", corr.tail(6))


better health patterns:
 share_52      -0.087627
share_92      -0.047873
share_44_45   -0.040607
share_72      -0.026020
share_31_33   -0.025092
share_54       0.008163
Name: health_risk_index, dtype: float64 

worse health patterns:
 share_72      -0.026020
share_31_33   -0.025092
share_54       0.008163
share_23       0.053106
share_48_49    0.070359
share_62       0.090895
Name: health_risk_index, dtype: float64


In [43]:
import statsmodels.api as sm

X = df[share_cols].fillna(0.0)

# drop one baseline to avoid collinearity
baseline = share_cols[0]
X = X.drop(columns=[baseline])

m = sm.OLS(df["health_risk_index"], sm.add_constant(X)).fit()

print("baseline dropped:", baseline)
m.params.sort_values()


baseline dropped: share_31_33


const         -0.216162
share_52      -0.006837
share_92      -0.003081
share_72      -0.002214
share_44_45   -0.002174
share_54       0.001499
share_23       0.007629
share_62       0.009931
share_48_49    0.010518
dtype: float64

In [44]:
sector_names = {k: v for k, v in SECTORS.items()}
rank = (
    df[["FIPS","State","County","health_risk_index"] + share_cols]
    .rename(columns=sector_names)
    .sort_values("health_risk_index", ascending=False)
)

rank.head(15)


Unnamed: 0,FIPS,State,County,health_risk_index,Manufacturing,Transportation,Health Care,Retail,Food / Hospitality,Construction,Professional / Tech,Finance,Public Administration
156,39099,OH,Mahoning,2.746398,9.507675,6.306018,21.548192,11.77771,6.090065,8.346422,25.711849,5.074811,5.637259
114,34007,NJ,Camden,2.478958,11.723326,6.717222,25.908137,19.058339,4.504069,1.612871,6.571607,11.441987,12.462442
83,26049,MI,Genesee,2.455544,19.4637,24.928864,13.689032,9.973678,10.483245,7.930103,7.254141,3.278121,2.999115
46,18089,IN,Lake,2.352583,17.188919,13.445354,21.625353,4.980551,0.285319,16.473351,10.590188,13.531712,1.879252
157,39113,OH,Montgomery,1.995516,10.454031,3.869219,19.970675,14.436256,5.631548,11.156561,12.866638,15.775927,5.839144
180,42011,PA,Berks,1.995038,6.560928,8.120026,7.59462,9.952865,3.83217,22.478672,22.651447,18.476778,0.332493
24,13169,GA,Jones,1.984529,24.252937,3.651368,9.919198,16.02275,5.360706,13.286503,9.415547,8.80099,9.290001
17,13021,GA,Bibb,1.9653,2.285352,22.676633,5.463309,17.739083,0.296391,16.195201,16.807049,15.720481,2.816501
207,48061,TX,Cameron,1.95118,5.242388,18.962282,14.753318,12.352357,7.099656,18.969432,4.420687,3.961219,14.238661
235,48479,TX,Webb,1.852471,1.367358,14.987503,21.215204,10.662236,4.929405,16.568863,23.391849,4.50124,2.376342


In [39]:
sector_name = {f"share_{k}": v for k,v in SECTORS.items() if f"share_{k}" in df.columns}
rank = df[["FIPS","State","County","health_risk_index"] + out + list(sector_name.keys())].copy()
rank = rank.rename(columns=sector_name).sort_values("health_risk_index", ascending=False)

rank.head(20)  # worst first
# rank.to_csv("county_health_sector_rank.csv", index=False)

Unnamed: 0,FIPS,State,County,health_risk_index,obesity_rate,diabetes_rate,inactivity_rate,poor_physical_health
