# Question 2: How many qualify for new affordable senior housing opportunities?

This notebook analyzes eligibility for affordable senior housing based on multiple criteria:
- Age (62+)
- Income levels (census tract median income)
- Housing conditions (interior/exterior condition, property grade)
- Property violations (open violations)
- Residency stability
- Amenity accessibility (stores, parks)

We'll create a comprehensive eligibility scoring system to identify elderly residents who would benefit from affordable senior housing.

## Data Overview
- Total elderly: 7,396
- Mapped to buildings: 5,390
- Mapped with condition data: ~5,391
- Need to exclude: Residents already in rent-stabilized/income-restricted housing


In [98]:
import sys
import os

current_dir = os.getcwd()
if os.path.basename(current_dir) == 'notebooks':
    project_dir = os.path.dirname(current_dir)
else:
    parts = current_dir.split(os.sep)
    if 'fa25-team-a' in parts:
        idx = parts.index('fa25-team-a')
        project_dir = os.sep.join(parts[:idx+1])
    else:
        project_dir = os.path.dirname(os.path.dirname(current_dir))
web_app_path = os.path.join(project_dir, 'web_app')
sys.path.append(web_app_path)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from config.database import get_db_connection, execute_query

sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)


## 1. Census Tract Analysis with Elderly Distribution


In [99]:
query = """
SELECT 
    ct.tract_id,
    ct.tract_name,
    ct.median_income,
    COUNT(DISTINCT v.res_id) as elderly_count,
    AVG(v.age) as avg_age
FROM voters v
INNER JOIN voters_buildings_map vbm ON v.res_id = vbm.res_id
INNER JOIN census_tracts ct ON ST_Within(
    ST_SetSRID(ST_MakePoint(v.longitude, v.latitude), 4326),
    ct.geometry
)
WHERE v.is_elderly = true
    AND v.latitude IS NOT NULL 
    AND v.longitude IS NOT NULL
    AND ct.geometry IS NOT NULL
GROUP BY ct.tract_id, ct.tract_name, ct.median_income
ORDER BY elderly_count DESC
"""

df_tract = pd.DataFrame(execute_query(query, fetch_all=True))
print(f"Census Tracts with Mapped Elderly Residents:")
print(df_tract.to_string(index=False))

print(f"\nSummary Statistics:")
print(f"Total tracts with elderly: {len(df_tract)}")
print(f"Total elderly mapped: {df_tract['elderly_count'].sum():,}")
print(f"Average elderly per tract: {df_tract['elderly_count'].mean():.1f}")
print(f"Median income range: ${df_tract['median_income'].min():,.0f} - ${df_tract['median_income'].max():,.0f}")


Census Tracts with Mapped Elderly Residents:
   tract_id                                         tract_name median_income  elderly_count             avg_age
25025000505   Census Tract 5.05; Suffolk County; Massachusetts      80556.00            965 78.5673289183222958
25025000701   Census Tract 7.01; Suffolk County; Massachusetts      93326.00            350 77.8228882833787466
25025000402   Census Tract 4.02; Suffolk County; Massachusetts     111705.00            344 72.9455337690631808
25025000401   Census Tract 4.01; Suffolk County; Massachusetts      75366.00            335 71.9849246231155779
25025000502   Census Tract 5.02; Suffolk County; Massachusetts      82125.00            330 73.3465346534653465
25025000503   Census Tract 5.03; Suffolk County; Massachusetts      92560.00            273 74.7430730478589421
25025000301   Census Tract 3.01; Suffolk County; Massachusetts     131206.00            265 73.3767123287671233
25025000506   Census Tract 5.06; Suffolk County; Massachuse

## 2. Identify Income-Restricted Housing to Exclude


In [100]:
data_dir = os.path.join(project_dir, 'data', 'processed', 'elderly_analysis')
income_restricted = pd.read_csv(os.path.join(data_dir, 'income_restricted_projects_check.csv'))
addresses_df = pd.read_csv(os.path.join(data_dir, 'addresses_with_elderly_2plus_unique.csv'))

excluded_struct_ids = income_restricted[income_restricted['elderly_count'] > 0]['struct_id'].dropna().unique().tolist()
excluded_addresses = income_restricted[income_restricted['elderly_count'] > 0]['matched_address'].dropna().unique().tolist()

senior_housing_threshold = 50
senior_housing_addresses = addresses_df[addresses_df['elderly_count'] >= senior_housing_threshold].copy()
senior_housing_addresses = senior_housing_addresses.sort_values('elderly_count', ascending=False)

print(f"Income-restricted buildings to exclude: {len(excluded_struct_ids)}")
print(f"Excluded addresses from income-restricted projects: {len(excluded_addresses)}")
print(f"\nAddresses with {senior_housing_threshold}+ elderly (likely senior housing projects):")
print(senior_housing_addresses[['address', 'city', 'elderly_count', 'building_count']].to_string(index=False))

excluded_addresses.extend(senior_housing_addresses['address'].tolist())
excluded_addresses = list(set(excluded_addresses))

print(f"\nTotal excluded addresses (income-restricted + senior housing): {len(excluded_addresses)}")
print(f"\nTop senior housing addresses to exclude:")
for addr in senior_housing_addresses.head(10)['address'].tolist():
    count = senior_housing_addresses[senior_housing_addresses['address'] == addr]['elderly_count'].iloc[0]
    print(f"  {addr}: {count} elderly")


Income-restricted buildings to exclude: 5
Excluded addresses from income-restricted projects: 5

Addresses with 50+ elderly (likely senior housing projects):
                address          city  elderly_count  building_count
     30.0 Washington ST      BRIGHTON            219               1
    40.0 Wallingford RD      BRIGHTON            193               1
    28.0 Wallingford RD      BRIGHTON            179               2
     20.0 Washington ST        BOSTON            173               1
    30.0 Wallingford RD      BRIGHTON            155               1
130.0 CHESTNUT HILL AVE        BOSTON             79               3
       2400.0 BEACON ST      BRIGHTON             71               1
       2400.0 BEACON ST CHESTNUT HILL             71              81
         180.0 Corey RD      BRIGHTON             70               1
132.0 CHESTNUT HILL AVE        BOSTON             61               1
     91.0 Washington ST      BRIGHTON             59               1

Total exclude

## 3. Eligibility Feature Tables

### 3.1 Age Eligibility (62+)


In [101]:
if excluded_struct_ids:
    struct_ids_str = ','.join([f"'{sid}'" for sid in excluded_struct_ids])
    exclusion_condition = f"AND v.res_id NOT IN (SELECT DISTINCT vbm.res_id FROM voters_buildings_map vbm WHERE vbm.struct_id IN ({struct_ids_str}))"
else:
    exclusion_condition = ""

if excluded_addresses:
    addresses_str = ','.join([f"'{addr}'" for addr in excluded_addresses])
    address_exclusion = f"AND v.res_id NOT IN (SELECT DISTINCT v2.res_id FROM voters v2 WHERE v2.street_number || ' ' || v2.street_name IN ({addresses_str}))"
else:
    address_exclusion = ""

query = f"""
SELECT 
    v.res_id,
    v.age,
    CASE 
        WHEN v.age BETWEEN 62 AND 69 THEN '62-69'
        WHEN v.age BETWEEN 70 AND 79 THEN '70-79'
        WHEN v.age BETWEEN 80 AND 89 THEN '80-89'
        ELSE '90+'
    END as age_group,
    CASE 
        WHEN v.age >= 62 THEN true
        ELSE false
    END as age_eligible
FROM voters v
WHERE v.is_elderly = true
    {exclusion_condition}
    {address_exclusion}
ORDER BY v.age DESC
"""

df_age = pd.DataFrame(execute_query(query, fetch_all=True))

print("Age Eligibility Table:")
print(f"Total elderly (excluding income-restricted): {len(df_age):,}")
print(f"\nAge Distribution:")
print(df_age['age_group'].value_counts().sort_index())
print(f"\nAverage age: {df_age['age'].mean():.1f}")
print(f"Median age: {df_age['age'].median():.1f}")


Age Eligibility Table:
Total elderly (excluding income-restricted): 6,938

Age Distribution:
age_group
62-69    2430
70-79    2546
80-89    1425
90+       537
Name: count, dtype: int64

Average age: 74.7
Median age: 73.0


### 3.2 Income Levels (Census Tract Median Income)


In [102]:
if excluded_struct_ids:
    struct_ids_str = ','.join([f"'{sid}'" for sid in excluded_struct_ids])
    exclusion_condition = f"AND v.res_id NOT IN (SELECT DISTINCT vbm2.res_id FROM voters_buildings_map vbm2 WHERE vbm2.struct_id IN ({struct_ids_str}))"
else:
    exclusion_condition = ""

if excluded_addresses:
    addresses_str = ','.join([f"'{addr}'" for addr in excluded_addresses])
    address_exclusion = f"AND v.res_id NOT IN (SELECT DISTINCT v.res_id FROM voters v2 WHERE v2.street_number || ' ' || v2.street_name IN ({addresses_str}))"
else:
    address_exclusion = ""

query = f"""
SELECT DISTINCT ON (v.res_id)
    v.res_id,
    v.age,
    ct.tract_id,
    ct.tract_name,
    ct.median_income,
    CASE 
        WHEN ct.median_income < 50000 THEN 'Low Income'
        WHEN ct.median_income < 75000 THEN 'Moderate Income'
        ELSE 'Higher Income'
    END as income_category,
    CASE 
        WHEN ct.median_income < 50000 THEN 'Eligible'
        WHEN ct.median_income < 75000 THEN 'Potentially Eligible'
        ELSE 'Not Eligible'
    END as income_eligibility_status
FROM voters v
INNER JOIN voters_buildings_map vbm ON v.res_id = vbm.res_id
INNER JOIN census_tracts ct ON ST_Within(
    ST_SetSRID(ST_MakePoint(v.longitude, v.latitude), 4326),
    ct.geometry
)
WHERE v.is_elderly = true
    AND v.latitude IS NOT NULL 
    AND v.longitude IS NOT NULL
    AND ct.geometry IS NOT NULL
    {exclusion_condition}
    {address_exclusion}
ORDER BY v.res_id, ct.median_income
"""

df_income = pd.DataFrame(execute_query(query, fetch_all=True))

print("Income Eligibility Table:")
print(f"Total elderly with income data: {len(df_income):,}")
print(f"\nIncome Category Distribution:")
print(df_income['income_category'].value_counts())
print(f"\nIncome Eligibility Status:")
print(df_income['income_eligibility_status'].value_counts())
print(f"\nMedian Income Statistics:")
print(df_income['median_income'].describe())


Income Eligibility Table:
Total elderly with income data: 4,757

Income Category Distribution:
income_category
Higher Income      3720
Low Income          562
Moderate Income     475
Name: count, dtype: int64

Income Eligibility Status:
income_eligibility_status
Not Eligible            3720
Eligible                 562
Potentially Eligible     475
Name: count, dtype: int64

Median Income Statistics:
count         4757
unique          21
top       80556.00
freq           731
Name: median_income, dtype: object


### 3.3 Housing Conditions (Interior/Exterior Condition, Property Grade)


In [103]:
if excluded_struct_ids:
    struct_ids_str = ','.join([f"'{sid}'" for sid in excluded_struct_ids])
    exclusion_condition = f"AND v.res_id NOT IN (SELECT DISTINCT vbm.res_id FROM voters_buildings_map vbm WHERE vbm.struct_id IN ({struct_ids_str}))"
else:
    exclusion_condition = ""

if excluded_addresses:
    addresses_str = ','.join([f"'{addr}'" for addr in excluded_addresses])
    address_exclusion = f"AND v.res_id NOT IN (SELECT DISTINCT v2.res_id FROM voters v2 WHERE v2.street_number || ' ' || v2.street_name IN ({addresses_str}))"
else:
    address_exclusion = ""

query = f"""
SELECT 
    v.res_id,
    v.age,
    ehc.interior_condition,
    ehc.exterior_condition,
    ehc.grade,
    ehc.property_type,
    ehc.property_age,
    CASE 
        WHEN ehc.interior_condition IN ('Poor', 'Fair') OR 
             ehc.exterior_condition IN ('Poor', 'Fair') OR
             ehc.grade IN ('Poor', 'Fair') THEN true
        ELSE false
    END as has_poor_conditions,
    CASE 
        WHEN ehc.interior_condition = 'Poor' OR ehc.exterior_condition = 'Poor' OR ehc.grade = 'Poor' THEN 'Poor'
        WHEN ehc.interior_condition = 'Fair' OR ehc.exterior_condition = 'Fair' OR ehc.grade = 'Fair' THEN 'Fair'
        ELSE 'Good'
    END as overall_condition
FROM voters v
INNER JOIN elderly_housing_conditions ehc ON v.res_id = ehc.res_id
WHERE v.is_elderly = true
    {exclusion_condition}
    {address_exclusion}
GROUP BY v.res_id, v.age, ehc.interior_condition, ehc.exterior_condition, 
         ehc.grade, ehc.property_type, ehc.property_age
"""

df_conditions = pd.DataFrame(execute_query(query, fetch_all=True))

print("Housing Conditions Table:")
print(f"Total elderly with condition data: {len(df_conditions):,}")
print(f"\nInterior Condition Distribution:")
print(df_conditions['interior_condition'].value_counts(dropna=False))
print(f"\nExterior Condition Distribution:")
print(df_conditions['exterior_condition'].value_counts(dropna=False))
print(f"\nProperty Grade Distribution:")
print(df_conditions['grade'].value_counts(dropna=False))
print(f"\nOverall Condition:")
print(df_conditions['overall_condition'].value_counts())
print(f"\nElderly with Poor/Fair Conditions: {df_conditions['has_poor_conditions'].sum():,} ({df_conditions['has_poor_conditions'].sum()/len(df_conditions)*100:.1f}%)")


Housing Conditions Table:
Total elderly with condition data: 4,953

Interior Condition Distribution:
interior_condition
Average      1890
Unknown      1843
None          732
Good          404
Fair           74
Excellent       6
Poor            4
Name: count, dtype: int64

Exterior Condition Distribution:
exterior_condition
Average      2106
Unknown      1843
None          732
Good          198
Fair           71
Excellent       3
Name: count, dtype: int64

Property Grade Distribution:
grade
Average    2076
Unknown    1843
None        732
Good        299
Fair          3
Name: count, dtype: int64

Overall Condition:
overall_condition
Good    4820
Fair     129
Poor       4
Name: count, dtype: int64

Elderly with Poor/Fair Conditions: 133 (2.7%)


### 3.4 Property Violations (Open Violations)


In [104]:
if excluded_struct_ids:
    struct_ids_str = ','.join([f"'{sid}'" for sid in excluded_struct_ids])
    exclusion_condition = f"AND v.res_id NOT IN (SELECT DISTINCT vbm.res_id FROM voters_buildings_map vbm WHERE vbm.struct_id IN ({struct_ids_str}))"
else:
    exclusion_condition = ""

if excluded_addresses:
    addresses_str = ','.join([f"'{addr}'" for addr in excluded_addresses])
    address_exclusion = f"AND v.res_id NOT IN (SELECT DISTINCT v2.res_id FROM voters v2 WHERE v2.street_number || ' ' || v2.street_name IN ({addresses_str}))"
else:
    address_exclusion = ""

query = f"""
SELECT 
    v.res_id,
    v.age,
    COALESCE(evs.open_violations, 0) as open_violations,
    COALESCE(evs.total_violations, 0) as total_violations,
    COALESCE(evs.closed_violations, 0) as closed_violations,
    CASE 
        WHEN COALESCE(evs.open_violations, 0) > 0 THEN true
        ELSE false
    END as has_violations,
    CASE 
        WHEN COALESCE(evs.open_violations, 0) = 0 THEN 'No Violations'
        WHEN COALESCE(evs.open_violations, 0) <= 2 THEN '1-2 Violations'
        WHEN COALESCE(evs.open_violations, 0) <= 5 THEN '3-5 Violations'
        ELSE '6+ Violations'
    END as violation_category
FROM voters v
LEFT JOIN elderly_violations_one_to_one_summary evs ON v.res_id = evs.res_id
WHERE v.is_elderly = true
    {exclusion_condition}
    {address_exclusion}
"""

df_violations = pd.DataFrame(execute_query(query, fetch_all=True))

print("Property Violations Table:")
print(f"Total elderly analyzed: {len(df_violations):,}")
print(f"\nElderly with Open Violations: {df_violations['has_violations'].sum():,} ({df_violations['has_violations'].sum()/len(df_violations)*100:.1f}%)")
print(f"\nViolation Category Distribution:")
print(df_violations['violation_category'].value_counts())
print(f"\nOpen Violations Statistics:")
print(df_violations[df_violations['open_violations'] > 0]['open_violations'].describe())


Property Violations Table:
Total elderly analyzed: 6,938

Elderly with Open Violations: 35 (0.5%)

Violation Category Distribution:
violation_category
No Violations     6903
1-2 Violations      35
Name: count, dtype: int64

Open Violations Statistics:
count    35.000000
mean      1.028571
std       0.169031
min       1.000000
25%       1.000000
50%       1.000000
75%       1.000000
max       2.000000
Name: open_violations, dtype: float64


### 3.5 Residency Stability


In [105]:
if excluded_struct_ids:
    struct_ids_str = ','.join([f"'{sid}'" for sid in excluded_struct_ids])
    exclusion_condition = f"AND v.res_id NOT IN (SELECT DISTINCT vbm2.res_id FROM voters_buildings_map vbm2 WHERE vbm2.struct_id IN ({struct_ids_str}))"
else:
    exclusion_condition = ""

if excluded_addresses:
    addresses_str = ','.join([f"'{addr}'" for addr in excluded_addresses])
    address_exclusion = f"AND v.res_id NOT IN (SELECT DISTINCT v2.res_id FROM voters v2 WHERE v2.street_number || ' ' || v2.street_name IN ({addresses_str}))"
else:
    address_exclusion = ""

query = f"""
SELECT DISTINCT ON (v.res_id)
    v.res_id,
    v.age,
    v.street_number || ' ' || v.street_name as address,
    CASE 
        WHEN EXISTS (SELECT 1 FROM voters_buildings_map vbm WHERE vbm.res_id = v.res_id) THEN true
        ELSE false
    END as has_building_match,
    CASE 
        WHEN EXISTS (SELECT 1 FROM voters_buildings_map vbm WHERE vbm.res_id = v.res_id) THEN '5+ Years (Matched)'
        ELSE 'Unknown'
    END as residency_stability
FROM voters v
WHERE v.is_elderly = true
    {exclusion_condition}
    {address_exclusion}
ORDER BY v.res_id
"""

df_stability = pd.DataFrame(execute_query(query, fetch_all=True))

print("Residency Stability Table:")
print(f"Total elderly analyzed: {len(df_stability):,}")
print(f"\nElderly with Building Match (5+ years residency): {df_stability['has_building_match'].sum():,} ({df_stability['has_building_match'].sum()/len(df_stability)*100:.1f}%)")
print(f"\nResidency Stability Distribution:")
print(df_stability['residency_stability'].value_counts())


Residency Stability Table:
Total elderly analyzed: 6,938

Elderly with Building Match (5+ years residency): 4,933 (71.1%)

Residency Stability Distribution:
residency_stability
5+ Years (Matched)    4933
Unknown               2005
Name: count, dtype: int64


### 3.6 Amenity Accessibility - Stores


In [106]:
if excluded_struct_ids:
    struct_ids_str = ','.join([f"'{sid}'" for sid in excluded_struct_ids])
    exclusion_condition = f"AND v.res_id NOT IN (SELECT DISTINCT vbm.res_id FROM voters_buildings_map vbm WHERE vbm.struct_id IN ({struct_ids_str}))"
else:
    exclusion_condition = ""

if excluded_addresses:
    addresses_str = ','.join([f"'{addr}'" for addr in excluded_addresses])
    address_exclusion = f"AND v.res_id NOT IN (SELECT DISTINCT v2.res_id FROM voters v2 WHERE v2.street_number || ' ' || v2.street_name IN ({addresses_str}))"
else:
    address_exclusion = ""

query = f"""
SELECT 
    v.res_id,
    v.age,
    MIN(vsn.distance_meters) as nearest_store_distance,
    COUNT(vsn.store_id) as nearby_stores_count,
    CASE 
        WHEN MIN(vsn.distance_meters) <= 500 THEN 'Excellent (≤500m)'
        WHEN MIN(vsn.distance_meters) <= 1000 THEN 'Good (500-1000m)'
        WHEN MIN(vsn.distance_meters) IS NOT NULL THEN 'Limited (>1000m)'
        ELSE 'No Store Data'
    END as store_accessibility
FROM voters v
LEFT JOIN voter_store_nearby vsn ON v.res_id = vsn.res_id
WHERE v.is_elderly = true
    {exclusion_condition}
    {address_exclusion}
GROUP BY v.res_id, v.age
"""

df_stores = pd.DataFrame(execute_query(query, fetch_all=True))

print("Store Accessibility Table:")
print(f"Total elderly analyzed: {len(df_stores):,}")
print(f"\nStore Accessibility Distribution:")
print(df_stores['store_accessibility'].value_counts())
print(f"\nElderly with Store Data: {(df_stores['nearest_store_distance'].notna()).sum():,}")
if df_stores['nearest_store_distance'].notna().any():
    print(f"\nAverage Distance to Nearest Store: {df_stores['nearest_store_distance'].mean():.0f}m")
    print(f"Median Distance to Nearest Store: {df_stores['nearest_store_distance'].median():.0f}m")


Store Accessibility Table:
Total elderly analyzed: 6,938

Store Accessibility Distribution:
store_accessibility
Excellent (≤500m)    3391
No Store Data        3340
Good (500-1000m)      203
Limited (>1000m)        4
Name: count, dtype: int64

Elderly with Store Data: 3,598

Average Distance to Nearest Store: 259m
Median Distance to Nearest Store: 227m


### 3.7 Amenity Accessibility - Parks


In [107]:
if excluded_struct_ids:
    struct_ids_str = ','.join([f"'{sid}'" for sid in excluded_struct_ids])
    exclusion_condition = f"AND v.res_id NOT IN (SELECT DISTINCT vbm.res_id FROM voters_buildings_map vbm WHERE vbm.struct_id IN ({struct_ids_str}))"
else:
    exclusion_condition = ""

if excluded_addresses:
    addresses_str = ','.join([f"'{addr}'" for addr in excluded_addresses])
    address_exclusion = f"AND v.res_id NOT IN (SELECT DISTINCT v2.res_id FROM voters v2 WHERE v2.street_number || ' ' || v2.street_name IN ({addresses_str}))"
else:
    address_exclusion = ""

query = f"""
SELECT 
    v.res_id,
    v.age,
    MIN(
        ST_Distance(
            ST_SetSRID(ST_MakePoint(v.longitude, v.latitude), 4326)::geography,
            ST_Transform(ST_SetSRID(p.geom, 26986), 4326)::geography
        )
    ) as nearest_park_distance_meters
FROM voters v
LEFT JOIN geo_parks p ON ST_DWithin(
    ST_SetSRID(ST_MakePoint(v.longitude, v.latitude), 4326)::geography,
    ST_Transform(ST_SetSRID(p.geom, 26986), 4326)::geography,
    5000
)
WHERE v.is_elderly = true
    AND v.latitude IS NOT NULL 
    AND v.longitude IS NOT NULL
    AND p.geom IS NOT NULL
    {exclusion_condition}
    {address_exclusion}
GROUP BY v.res_id, v.age, v.latitude, v.longitude
"""

try:
    df_parks = pd.DataFrame(execute_query(query, fetch_all=True))
    if len(df_parks) == 0:
        raise ValueError("Query returned empty result")
except Exception as e:
    print(f"Note: Parks query failed ({e}). Creating empty parks data...")
    df_parks = df_age[['res_id', 'age']].copy()
    df_parks['nearest_park_distance_meters'] = None

if 'nearest_park_distance_meters' not in df_parks.columns:
    df_parks['nearest_park_distance_meters'] = None

df_parks['park_accessibility'] = df_parks['nearest_park_distance_meters'].apply(
    lambda x: 'Excellent (≤300m)' if pd.notna(x) and x <= 300
    else 'Good (300-600m)' if pd.notna(x) and x <= 600
    else 'Limited (>600m)' if pd.notna(x)
    else 'No Park Data'
)

print("Park Accessibility Table:")
print(f"Total elderly analyzed: {len(df_parks):,}")
print(f"\nPark Accessibility Distribution:")
print(df_parks['park_accessibility'].value_counts())
print(f"\nElderly with Park Data: {(df_parks['nearest_park_distance_meters'].notna()).sum():,}")
if df_parks['nearest_park_distance_meters'].notna().any():
    print(f"\nAverage Distance to Nearest Park: {df_parks['nearest_park_distance_meters'].mean():.0f}m")
    print(f"Median Distance to Nearest Park: {df_parks['nearest_park_distance_meters'].median():.0f}m")


Park Accessibility Table:
Total elderly analyzed: 6,913

Park Accessibility Distribution:
park_accessibility
Excellent (≤300m)    6326
Good (300-600m)       583
Limited (>600m)         4
Name: count, dtype: int64

Elderly with Park Data: 6,913

Average Distance to Nearest Park: 145m
Median Distance to Nearest Park: 127m


In [108]:
if 'park_accessibility' not in df_parks.columns:
    if 'nearest_park_distance_meters' in df_parks.columns:
        df_parks['park_accessibility'] = df_parks['nearest_park_distance_meters'].apply(
            lambda x: 'Excellent (≤300m)' if pd.notna(x) and x <= 300
            else 'Good (300-600m)' if pd.notna(x) and x <= 600
            else 'Limited (>600m)' if pd.notna(x)
            else 'No Park Data'
        )
    else:
        df_parks['nearest_park_distance_meters'] = None
        df_parks['park_accessibility'] = 'No Park Data'


In [109]:
df_eligibility = df_age[['res_id', 'age', 'age_group']].copy()

df_eligibility = df_eligibility.merge(
    df_income[['res_id', 'tract_id', 'tract_name', 'median_income', 'income_category', 'income_eligibility_status']],
    on='res_id',
    how='left'
)

df_eligibility = df_eligibility.merge(
    df_conditions[['res_id', 'interior_condition', 'exterior_condition', 'grade', 'has_poor_conditions', 'overall_condition']],
    on='res_id',
    how='left'
)

df_eligibility = df_eligibility.merge(
    df_violations[['res_id', 'open_violations', 'total_violations', 'has_violations', 'violation_category']],
    on='res_id',
    how='left'
)

df_eligibility = df_eligibility.merge(
    df_stability[['res_id', 'has_building_match', 'residency_stability']],
    on='res_id',
    how='left'
)

df_eligibility = df_eligibility.merge(
    df_stores[['res_id', 'nearest_store_distance', 'store_accessibility']],
    on='res_id',
    how='left'
)

df_eligibility = df_eligibility.merge(
    df_parks[['res_id', 'nearest_park_distance_meters', 'park_accessibility']],
    on='res_id',
    how='left'
)

print(f"Comprehensive Eligibility Table:")
print(f"Total elderly in analysis: {len(df_eligibility):,}")
print(f"\nColumns: {list(df_eligibility.columns)}")


Comprehensive Eligibility Table:
Total elderly in analysis: 6,958

Columns: ['res_id', 'age', 'age_group', 'tract_id', 'tract_name', 'median_income', 'income_category', 'income_eligibility_status', 'interior_condition', 'exterior_condition', 'grade', 'has_poor_conditions', 'overall_condition', 'open_violations', 'total_violations', 'has_violations', 'violation_category', 'has_building_match', 'residency_stability', 'nearest_store_distance', 'store_accessibility', 'nearest_park_distance_meters', 'park_accessibility']


## 5. Eligibility Scoring System

Calculate priority scores based on need factors


In [110]:
df_eligibility['eligibility_score'] = 0

df_eligibility.loc[df_eligibility['income_category'] == 'Low Income', 'eligibility_score'] += 25
df_eligibility.loc[df_eligibility['income_category'] == 'Moderate Income', 'eligibility_score'] += 15

df_eligibility.loc[df_eligibility['overall_condition'] == 'Poor', 'eligibility_score'] += 20
df_eligibility.loc[df_eligibility['overall_condition'] == 'Fair', 'eligibility_score'] += 10

df_eligibility.loc[df_eligibility['has_violations'] == True, 'eligibility_score'] += 15

df_eligibility.loc[df_eligibility['store_accessibility'] == 'Excellent (≤500m)', 'eligibility_score'] += 0
df_eligibility.loc[df_eligibility['store_accessibility'] == 'Good (500-1000m)', 'eligibility_score'] += 3
df_eligibility.loc[df_eligibility['store_accessibility'] == 'Limited (>1000m)', 'eligibility_score'] += 5
df_eligibility.loc[df_eligibility['store_accessibility'] == 'No Store Data', 'eligibility_score'] += 2

df_eligibility.loc[df_eligibility['park_accessibility'] == 'Excellent (≤300m)', 'eligibility_score'] += 0
df_eligibility.loc[df_eligibility['park_accessibility'] == 'Good (300-600m)', 'eligibility_score'] += 3
df_eligibility.loc[df_eligibility['park_accessibility'] == 'Limited (>600m)', 'eligibility_score'] += 5
df_eligibility.loc[df_eligibility['park_accessibility'] == 'No Park Data', 'eligibility_score'] += 2

df_eligibility['priority_level'] = pd.cut(
    df_eligibility['eligibility_score'],
    bins=[-1, 10, 25, 35, 100],
    labels=['Low', 'Medium', 'High', 'Very High']
)

print("Eligibility Score Distribution:")
print(df_eligibility['eligibility_score'].describe())
print(f"\nPriority Level Distribution:")
print(df_eligibility['priority_level'].value_counts().sort_index())
print(f"\nTop 10 Highest Priority Elderly:")
print(df_eligibility.nlargest(10, 'eligibility_score')[['res_id', 'age', 'income_category', 'overall_condition', 'has_violations', 'eligibility_score', 'priority_level']].to_string(index=False))


Eligibility Score Distribution:
count    6958.000000
mean        4.631791
std         7.703547
min         0.000000
25%         0.000000
50%         2.000000
75%         3.000000
max        40.000000
Name: eligibility_score, dtype: float64

Priority Level Distribution:
priority_level
Low          5868
Medium        921
High          164
Very High       5
Name: count, dtype: int64

Top 10 Highest Priority Elderly:
      res_id  age income_category overall_condition  has_violations  eligibility_score priority_level
10LTN1853000   72      Low Income              Good            True                 40      Very High
01KDD1257000   68      Low Income              Good            True                 40      Very High
01FJS2642000   83      Low Income              Fair           False                 38      Very High
06FMY2744001   81      Low Income              Fair           False                 38      Very High
05SCY2055000   70 Moderate Income              Poor           False      

## 6. Summary Statistics by Feature


In [111]:
print("=" * 80)
print("ELIGIBILITY ANALYSIS SUMMARY")
print("=" * 80)

print(f"\n1. AGE ELIGIBILITY")
print(f"   Total elderly (62+): {len(df_age):,}")
print(f"   Age range: {df_age['age'].min()} - {df_age['age'].max()}")
print(f"   Average age: {df_age['age'].mean():.1f}")

print(f"\n2. INCOME ELIGIBILITY")
print(f"   Total with income data: {len(df_income):,}")
print(f"   Low Income (<$50k): {(df_income['income_category'] == 'Low Income').sum():,}")
print(f"   Moderate Income ($50k-$75k): {(df_income['income_category'] == 'Moderate Income').sum():,}")
print(f"   Higher Income (>$75k): {(df_income['income_category'] == 'Higher Income').sum():,}")

print(f"\n3. HOUSING CONDITIONS")
print(f"   Total with condition data: {len(df_conditions):,}")
print(f"   Poor/Fair conditions: {df_conditions['has_poor_conditions'].sum():,} ({df_conditions['has_poor_conditions'].sum()/len(df_conditions)*100:.1f}%)")

print(f"\n4. PROPERTY VIOLATIONS")
print(f"   Total analyzed: {len(df_violations):,}")
print(f"   With open violations: {df_violations['has_violations'].sum():,} ({df_violations['has_violations'].sum()/len(df_violations)*100:.1f}%)")

print(f"\n5. RESIDENCY STABILITY")
print(f"   Total analyzed: {len(df_stability):,}")
print(f"   With building match (5+ years): {df_stability['has_building_match'].sum():,} ({df_stability['has_building_match'].sum()/len(df_stability)*100:.1f}%)")

print(f"\n6. STORE ACCESSIBILITY")
print(f"   Total analyzed: {len(df_stores):,}")
print(f"   Excellent (≤500m): {(df_stores['store_accessibility'] == 'Excellent (≤500m)').sum():,}")
print(f"   Good (500-1000m): {(df_stores['store_accessibility'] == 'Good (500-1000m)').sum():,}")
print(f"   Limited (>1000m): {(df_stores['store_accessibility'] == 'Limited (>1000m)').sum():,}")

print(f"\n7. PARK ACCESSIBILITY")
print(f"   Total analyzed: {len(df_parks):,}")
print(f"   Excellent (≤300m): {(df_parks['park_accessibility'] == 'Excellent (≤300m)').sum():,}")
print(f"   Good (300-600m): {(df_parks['park_accessibility'] == 'Good (300-600m)').sum():,}")
print(f"   Limited (>600m): {(df_parks['park_accessibility'] == 'Limited (>600m)').sum():,}")

print(f"\n8. OVERALL ELIGIBILITY SCORES")
print(f"   Total in comprehensive analysis: {len(df_eligibility):,}")
print(f"   Very High Priority: {(df_eligibility['priority_level'] == 'Very High').sum():,}")
print(f"   High Priority: {(df_eligibility['priority_level'] == 'High').sum():,}")
print(f"   Medium Priority: {(df_eligibility['priority_level'] == 'Medium').sum():,}")
print(f"   Low Priority: {(df_eligibility['priority_level'] == 'Low').sum():,}")

print("\n" + "=" * 80)


ELIGIBILITY ANALYSIS SUMMARY

1. AGE ELIGIBILITY
   Total elderly (62+): 6,938
   Age range: 62 - 106
   Average age: 74.7

2. INCOME ELIGIBILITY
   Total with income data: 4,757
   Low Income (<$50k): 562
   Moderate Income ($50k-$75k): 475
   Higher Income (>$75k): 3,720

3. HOUSING CONDITIONS
   Total with condition data: 4,953
   Poor/Fair conditions: 133 (2.7%)

4. PROPERTY VIOLATIONS
   Total analyzed: 6,938
   With open violations: 35 (0.5%)

5. RESIDENCY STABILITY
   Total analyzed: 6,938
   With building match (5+ years): 4,933 (71.1%)

6. STORE ACCESSIBILITY
   Total analyzed: 6,938
   Excellent (≤500m): 3,391
   Good (500-1000m): 203
   Limited (>1000m): 4

7. PARK ACCESSIBILITY
   Total analyzed: 6,913
   Excellent (≤300m): 6,326
   Good (300-600m): 583
   Limited (>600m): 4

8. OVERALL ELIGIBILITY SCORES
   Total in comprehensive analysis: 6,958
   Very High Priority: 5
   High Priority: 164
   Medium Priority: 921
   Low Priority: 5,868



## 7. Export Results


In [112]:
output_dir = os.path.join(project_dir, 'data', 'processed', 'elderly_analysis')
os.makedirs(output_dir, exist_ok=True)

df_tract.to_csv(f'{output_dir}/census_tract_elderly_distribution.csv', index=False)
df_age.to_csv(f'{output_dir}/age_eligibility.csv', index=False)
df_income.to_csv(f'{output_dir}/income_eligibility.csv', index=False)
df_conditions.to_csv(f'{output_dir}/housing_conditions_eligibility.csv', index=False)
df_violations.to_csv(f'{output_dir}/violations_eligibility.csv', index=False)
df_stability.to_csv(f'{output_dir}/residency_stability.csv', index=False)
df_stores.to_csv(f'{output_dir}/store_accessibility.csv', index=False)
df_parks.to_csv(f'{output_dir}/park_accessibility.csv', index=False)
df_eligibility.to_csv(f'{output_dir}/comprehensive_eligibility_analysis.csv', index=False)

print("All tables exported to:")
print(f"  {output_dir}/")
print("\nFiles created:")
print("  - census_tract_elderly_distribution.csv")
print("  - age_eligibility.csv")
print("  - income_eligibility.csv")
print("  - housing_conditions_eligibility.csv")
print("  - violations_eligibility.csv")
print("  - residency_stability.csv")
print("  - store_accessibility.csv")
print("  - park_accessibility.csv")
print("  - comprehensive_eligibility_analysis.csv")


All tables exported to:
  /Users/Studies/Projects/ds-abcdc-allston/fa25-team-a/data/processed/elderly_analysis/

Files created:
  - census_tract_elderly_distribution.csv
  - age_eligibility.csv
  - income_eligibility.csv
  - housing_conditions_eligibility.csv
  - violations_eligibility.csv
  - residency_stability.csv
  - store_accessibility.csv
  - park_accessibility.csv
  - comprehensive_eligibility_analysis.csv
