In [4]:
# Step 1: Clean output.txt by removing leading serial numbers like "1. 1 Adamson St" -> "1 Adamson St"
# Step 2: Create output4.txt with unique values from the cleaned file (case-insensitive unique)
from pathlib import Path
import re

# Fixed locations of the raw list and of the two files derived from it.
base = Path('/Users/Studies/Projects/ds-abcdc-allston/fa25-team-a/notebooks')
input_path = base.joinpath('output.txt')
cleaned_path = base.joinpath('output1.txt')
unique_path = base.joinpath('output4.txt')

# Leading serial marker: optional whitespace, digits, a dot, optional whitespace.
pattern = re.compile(r'^\s*\d+\.\s*')

# Load every line of the raw file; undecodable bytes are dropped (errors='ignore').
raw_lines = input_path.read_text(encoding='utf-8', errors='ignore').splitlines()

# Step 1: strip the serial marker and surrounding whitespace, discarding
# any line that ends up empty.
cleaned_lines = [
    text
    for text in (pattern.sub('', ln).strip() for ln in raw_lines)
    if text
]

# Persist the cleaned list (output1.txt), one entry per line.
cleaned_path.write_text('\n'.join(cleaned_lines) + '\n', encoding='utf-8')
print(f"Cleaned lines written to: {cleaned_path} ({len(cleaned_lines)} lines)")

# Step 2: case-insensitive de-duplication; the first spelling encountered wins.
seen = set()
unique_lines = []
for entry in cleaned_lines:
    text = entry.strip()
    key = text.lower()
    if not key or key in seen:
        continue
    seen.add(key)
    unique_lines.append(text)

# Persist the de-duplicated list (output4.txt).
unique_path.write_text('\n'.join(unique_lines) + '\n', encoding='utf-8')
print(f"Unique lines written to: {unique_path} ({len(unique_lines)} unique)")


Cleaned lines written to: /Users/Studies/Projects/ds-abcdc-allston/fa25-team-a/notebooks/output1.txt (8758 lines)
Unique lines written to: /Users/Studies/Projects/ds-abcdc-allston/fa25-team-a/notebooks/output4.txt (8758 unique)


In [5]:
# Transform output4.txt and output3.txt -> drop last token; title-case for output3 path
from pathlib import Path

base = Path('/Users/Studies/Projects/ds-abcdc-allston/fa25-team-a/notebooks')

# Inputs.
src4 = base.joinpath('output4.txt')
src3 = base.joinpath('output3.txt')

# Outputs.
# output5.txt: lines of output4.txt with the last token dropped.
dst5 = base.joinpath('output5.txt')
# output6.txt: lines of output3.txt with the last token dropped AND each
# word title-cased (only the first letter of every word is capitalised).
dst6 = base.joinpath('output6.txt')

# Helpers

def drop_last_token(s: str) -> str:
    """Return ``s`` without its final space-delimited token.

    The input is stripped of surrounding whitespace first. When the stripped
    text contains no space character (this includes the empty string) it is
    returned as is; otherwise everything before the last space is returned,
    stripped again.
    """
    text = s.strip()
    head, sep, _tail = text.rpartition(' ')
    # An empty separator means no space was found: there is nothing to drop.
    if not sep:
        return text
    return head.strip()

# Process output4 -> output5: drop the last token of every non-blank line.
# A missing source file is treated as empty input.
lines4 = src4.read_text(encoding='utf-8', errors='ignore').splitlines() if src4.exists() else []
proc5 = [drop_last_token(ln) for ln in lines4 if ln.strip()]
dst5.write_text('\n'.join(proc5) + '\n', encoding='utf-8')
print(f"output5.txt written: {len(proc5)} lines (from output4.txt)")

# Process output3 -> output6 (drop last token then title-case)
lines3 = src3.read_text(encoding='utf-8', errors='ignore').splitlines() if src3.exists() else []
proc6 = []
kept3 = []  # non-blank source lines, kept index-aligned with proc6 for the preview
for ln in lines3:
    if not ln.strip():
        continue
    base_part = drop_last_token(ln)
    # Title-case every word, but leave a purely numeric leading token intact.
    # Note: str.title() capitalises the letter following any non-letter, so
    # "o'neil" becomes "O'Neil" and "1st" becomes "1St".
    first, _, rest = base_part.partition(' ')
    if first.isdigit():
        rest = rest.title()
        out = f"{first} {rest}".strip() if rest else first
    else:
        out = base_part.title()
    kept3.append(ln)
    proc6.append(out)

dst6.write_text('\n'.join(proc6) + '\n', encoding='utf-8')
print(f"output6.txt written: {len(proc6)} lines (from output3.txt)")

# Example preview: pair each shown source line with the output produced from
# that same line. Indexing lines3 and proc6 with a shared counter would
# misalign as soon as output3.txt contains a blank line, because blank lines
# are skipped when building proc6.
for src_line, out_line in zip(kept3[:3], proc6[:3]):
    print(f"ex: '{src_line}' -> '{out_line}'")


output5.txt written: 8758 lines (from output4.txt)
output6.txt written: 8144 lines (from output3.txt)
ex: '1 ADAMSON ST' -> '1 Adamson'
ex: '1 ALDIE ST' -> '1 Aldie'
ex: '1 ALLEN RD' -> '1 Allen'


In [6]:
# Compare output5.txt vs output6.txt (case-insensitive)
from pathlib import Path

base = Path('/Users/Studies/Projects/ds-abcdc-allston/fa25-team-a/notebooks')
p5 = base.joinpath('output5.txt')
p6 = base.joinpath('output6.txt')

# Read both files; a file that does not exist counts as having no lines.
lines5 = []
if p5.exists():
    lines5 = p5.read_text(encoding='utf-8', errors='ignore').splitlines()
lines6 = []
if p6.exists():
    lines6 = p6.read_text(encoding='utf-8', errors='ignore').splitlines()

# Normalise for comparison: strip, drop blanks, lower-case.
norm5 = {text.lower() for text in map(str.strip, lines5) if text}
norm6 = {text.lower() for text in map(str.strip, lines6) if text}

# Set algebra over the normalised addresses.
match = norm5.intersection(norm6)
only5 = norm5.difference(norm6)
only6 = norm6.difference(norm5)

print(f'Total lines in output5.txt: {len(lines5)}')
print(f'Total lines in output6.txt: {len(lines6)}')
print(f'Unique (normalized) output5: {len(norm5)}')
print(f'Unique (normalized) output6: {len(norm6)}')
print(f'Matches (case-insensitive): {len(match)}')
print(f"Only in output5: {len(only5)} | Only in output6: {len(only6)}")

# Show the first ten matches in sorted order.
for i, addr in enumerate(sorted(match)[:10], start=1):
    print(f'{i}. {addr}')


Total lines in output5.txt: 8758
Total lines in output6.txt: 8144
Unique (normalized) output5: 8735
Unique (normalized) output6: 8112
Matches (case-insensitive): 6732
Only in output5: 2003 | Only in output6: 1380
1. 1 adamson
2. 1 aldie
3. 1 allen
4. 1 amboy
5. 1 appian
6. 1 arden
7. 1 ashford
8. 1 boulevard
9. 1 buswell
10. 1 chiswick


In [7]:
# Map voter addresses (output 2.txt) to unique addresses (output6.txt) using the same normalization
from pathlib import Path
import csv

base = Path('/Users/Studies/Projects/ds-abcdc-allston/fa25-team-a/notebooks')
# Input: every voter address, duplicates included (the file name contains a space).
p2 = base.joinpath('output 2.txt')
# Input: unique, title-cased addresses with the last token already dropped.
p6 = base.joinpath('output6.txt')
# Output: one CSV row per voter address with its match result.
out_csv = base.joinpath('voter_address_mapping.csv')

def drop_last_token(s: str) -> str:
    """Strip ``s`` and remove its last space-separated token.

    Text without any space (including an empty string) comes back stripped
    but otherwise unchanged; otherwise the part before the final space is
    returned with surrounding whitespace removed.
    """
    trimmed = s.strip()
    pieces = trimmed.rsplit(' ', 1)
    # A single piece means there was no space to split on.
    if len(pieces) == 1:
        return trimmed
    return pieces[0].strip()

# Load the canonical addresses from output6.txt and index them by lower-cased
# text. The entries are used as they are: applying drop_last_token to them
# again would remove one more token, so no further token is dropped here.
uniq6 = [ln.strip() for ln in p6.read_text(encoding='utf-8', errors='ignore').splitlines() if ln.strip()]
lookup = {}
for u in uniq6:
    # setdefault keeps the first (title-cased) spelling seen for each key.
    lookup.setdefault(u.lower(), u)

# Process voter addresses: drop the last token, lower-case, then look up.
# NOTE(review): the match rate is only meaningful if output6.txt was not itself
# derived from this same voter list - confirm the lineage of output3.txt.
rows = []
lines2 = [ln.strip() for ln in p2.read_text(encoding='utf-8', errors='ignore').splitlines() if ln.strip()]
for addr in lines2:
    base_norm = drop_last_token(addr).strip().lower()
    matched = lookup.get(base_norm)
    rows.append({
        'original_address': addr,
        'normalized_base': base_norm,
        'matched': 'yes' if matched else 'no',
        'matched_unique': matched or ''
    })

# Write mapping to CSV
with out_csv.open('w', newline='', encoding='utf-8') as f:
    w = csv.DictWriter(f, fieldnames=['original_address','normalized_base','matched','matched_unique'])
    w.writeheader()
    w.writerows(rows)

total = len(rows)
matches = sum(1 for r in rows if r['matched'] == 'yes')
# Guard the ratio: an empty voter file would otherwise raise ZeroDivisionError.
match_rate = matches / total if total else 0.0
print(f"Wrote {out_csv}  | total={total}  matches={matches}  match_rate={match_rate:.2%}")

Wrote /Users/Studies/Projects/ds-abcdc-allston/fa25-team-a/notebooks/voter_address_mapping.csv  | total=43759  matches=43759  match_rate=100.00%


In [8]:
# Print sample addresses from buildings and voters tables
import sys
sys.path.append('/Users/Studies/Projects/ds-abcdc-allston/fa25-team-a/web_app')
from config.database import execute_query

# Buildings: assemble "st_num st_num2 st_name #unit_num" in SQL, skipping the
# parts that are NULL or blank, and fetch ten rows (no ORDER BY, so which ten
# rows come back is up to the database).
print("Buildings (sample):")
rows = execute_query("""
    SELECT
      TRIM(
        CONCAT(
          COALESCE(NULLIF(TRIM(st_num), ''), ''),
          CASE WHEN TRIM(COALESCE(st_num2, '')) <> '' THEN ' ' || TRIM(st_num2) ELSE '' END,
          CASE WHEN TRIM(COALESCE(st_name, '')) <> '' THEN ' ' || TRIM(st_name) ELSE '' END,
          CASE WHEN TRIM(COALESCE(unit_num, '')) <> '' THEN ' #' || TRIM(unit_num) ELSE '' END
        )
      ) AS address,
      city,
      zip_code
    FROM buildings
    LIMIT 10
""", fetch_all=True)

for i, r in enumerate(rows or [], 1):
    # Join only the non-empty fields so a missing address or zip code does not
    # leave doubled or trailing spaces in the printed line.
    fields = [str(r[col] or '').strip() for col in ('address', 'city', 'zip_code')]
    line = " ".join(f for f in fields if f)
    print(f"{i}. {line}".rstrip())

# Voters: assemble "street_number street_suffix street_name Apt apartment" in
# SQL the same way and fetch ten rows.
print("\nVoters (sample):")
rows = execute_query("""
    SELECT
      TRIM(
        CONCAT(
          COALESCE(CAST(street_number AS TEXT), ''),
          CASE WHEN TRIM(COALESCE(street_suffix, '')) <> '' THEN ' ' || TRIM(street_suffix) ELSE '' END,
          CASE WHEN TRIM(COALESCE(street_name, '')) <> '' THEN ' ' || TRIM(street_name) ELSE '' END,
          CASE WHEN TRIM(COALESCE(apartment, '')) <> '' THEN ' Apt ' || TRIM(apartment) ELSE '' END
        )
      ) AS address,
      zip_code
    FROM voters
    LIMIT 10
""", fetch_all=True)

for i, r in enumerate(rows or [], 1):
    fields = [str(r[col] or '').strip() for col in ('address', 'zip_code')]
    line = " ".join(f for f in fields if f)
    print(f"{i}. {line}".rstrip())

Buildings (sample):
1.  BOSTON
2.  BOSTON
3.  BOSTON
4.  BOSTON
5.  BOSTON
6.  BOSTON
7.  BOSTON
8.  BOSTON
9.  BOSTON
10.  BOSTON

Voters (sample):
1. 142 KENRICK ST Apt 11 2135
2. 142 KENRICK ST Apt 11 2135
3. 144 KENRICK ST Apt 11 2135
4. 144 KENRICK ST Apt 11 2135
5. 185 CHESTNUT HILL AVE Apt 11 2135
6. 1999 COMMONWEALTH AVE Apt 11 2135
7. 2001 COMMONWEALTH AVE Apt 11 2135
8. 19 SOUTH ST Apt 11 2135
9. 19 SOUTH ST Apt 11 2135
10. 43 GLENVILLE AVE Apt 11 2134


In [9]:
# Better formatted addresses from buildings + voters (first 20)
import sys
sys.path.append('/Users/Studies/Projects/ds-abcdc-allston/fa25-team-a/web_app')
from config.database import execute_query

# Buildings query: builds one address string per row.
#  - a trailing ".0" is removed from st_num and st_num2
#  - st_num2 is appended only when it is non-blank and differs from st_num
#  - runs of whitespace in the result are collapsed to a single space
#  - rows where both st_name and st_num are NULL/empty are filtered out
bldg_sql = """
    SELECT
      TRIM(REGEXP_REPLACE(
        CONCAT(
          COALESCE(NULLIF(REGEXP_REPLACE(TRIM(COALESCE(st_num,'')),'\\.0$',''),''), ''),
          CASE
            WHEN TRIM(COALESCE(st_num2,'')) <> '' AND TRIM(st_num2) <> TRIM(st_num)
              THEN ' ' || REGEXP_REPLACE(TRIM(st_num2),'\\.0$','')
            ELSE ''
          END,
          CASE
            WHEN TRIM(COALESCE(st_name,'')) <> '' THEN ' ' || TRIM(st_name)
            ELSE ''
          END,
          CASE
            WHEN TRIM(COALESCE(unit_num,'')) <> '' THEN ' #' || TRIM(unit_num)
            ELSE ''
          END
        ), '\\s+', ' ', 'g'
      )) AS address,
      city,
      zip_code
    FROM buildings
    WHERE COALESCE(st_name,'') <> '' OR COALESCE(st_num,'') <> ''
    LIMIT 20
"""

# Voters query: street_suffix is left out when it is blank or equal to '0'.
voter_sql = """
    SELECT
      TRIM(
        CONCAT(
          COALESCE(CAST(street_number AS TEXT), ''),
          CASE WHEN TRIM(COALESCE(street_suffix,'')) <> '' AND LOWER(TRIM(street_suffix)) <> '0'
               THEN ' ' || TRIM(street_suffix) ELSE '' END,
          CASE WHEN TRIM(COALESCE(street_name,'')) <> '' THEN ' ' || TRIM(street_name) ELSE '' END,
          CASE WHEN TRIM(COALESCE(apartment,'')) <> '' THEN ' Apt ' || TRIM(apartment) ELSE '' END
        )
      ) AS address,
      zip_code
    FROM voters
    LIMIT 20
"""

print("Buildings (sample):")
bldg_rows = execute_query(bldg_sql, fetch_all=True)

for i, r in enumerate(bldg_rows or [], start=1):
    # .strip() is called on every field, so each value must be a str or None.
    fields = [(r[col] or '').strip() for col in ('address', 'city', 'zip_code')]
    line = " ".join(filter(None, fields))
    print(f"{i}. {line}")

print("\nVoters (sample):")
voter_rows = execute_query(voter_sql, fetch_all=True)

for i, r in enumerate(voter_rows or [], start=1):
    fields = [(r[col] or '').strip() for col in ('address', 'zip_code')]
    line = " ".join(filter(None, fields))
    print(f"{i}. {line}")

Buildings (sample):
1. 66 Kirkwood RD BRIGHTON 2135.0
2. 240 242 Foster ST BRIGHTON 2135.0
3. 75 Wallingford RD BRIGHTON 2135.0
4. 37 39 WILTSHIRE RD BRIGHTON 2135.0
5. 172 Chiswick RD BRIGHTON 2135.0
6. 5 7 MORROW RD BRIGHTON 2135.0
7. 9 11 MORROW RD BRIGHTON 2135.0
8. 266 264 Market ST BRIGHTON 2135.0
9. 47 49 Elmira ST BRIGHTON 2135.0
10. 93 95 Etna ST BRIGHTON 2135.0
11. 52 Murdock ST BRIGHTON 2135.0
12. 26 24 Mapleton ST BRIGHTON 2135.0
13. 1747 COMMONWEALTH AV BRIGHTON 2135.0
14. 44 LEAMINGTON RD BRIGHTON 2135.0
15. 24 LITCHFIELD ST BRIGHTON 2135.0
16. 47 49 S WAVERLY ST BRIGHTON 2135.0
17. 46 LEAMINGTON RD BRIGHTON 2135.0
18. 20 LEAMINGTON RD BRIGHTON 2135.0
19. 70 NOTTINGHILL RD BRIGHTON 2135.0
20. 36 Colborne RD BRIGHTON 2135.0

Voters (sample):
1. 142 KENRICK ST Apt 11 2135
2. 142 KENRICK ST Apt 11 2135
3. 144 KENRICK ST Apt 11 2135
4. 144 KENRICK ST Apt 11 2135
5. 185 CHESTNUT HILL AVE Apt 11 2135
6. 1999 COMMONWEALTH AVE Apt 11 2135
7. 2001 COMMONWEALTH AVE Apt 11 2135
8. 1

In [13]:
# Create + populate voters_buildings_map in one committed transaction
import sys
sys.path.append('/Users/Studies/Projects/ds-abcdc-allston/fa25-team-a/web_app')
from config.database import get_db_connection

# DDL for the voter -> building link table. Both statements use IF NOT EXISTS.
# Columns: res_id, struct_id and base_key (all NOT NULL text), matched_on
# (defaults to 'num+street') and created_at (defaults to now()).
# No primary key or unique constraint is declared, so the table itself does not
# prevent repeated (res_id, struct_id) pairs. Indexes cover res_id and struct_id.
sql_create = """
CREATE TABLE IF NOT EXISTS voters_buildings_map (
  res_id        text NOT NULL,
  struct_id     text NOT NULL,
  base_key      text NOT NULL,
  matched_on    text NOT NULL DEFAULT 'num+street',
  created_at    timestamp DEFAULT now()
);
CREATE INDEX IF NOT EXISTS idx_vbm_res_id    ON voters_buildings_map (res_id);
CREATE INDEX IF NOT EXISTS idx_vbm_struct_id ON voters_buildings_map (struct_id);
"""

# Removes every existing row from the link table.
sql_truncate = "TRUNCATE voters_buildings_map;"

# Populates the link table by joining voters to buildings on a shared key.
#   b_raw      - per building: st_num / st_num2 cast float -> int -> text
#                (num1 / num2) and INITCAP(TRIM(st_name)) as street; rows with
#                a NULL st_name or st_num are excluded.
#   b_expanded - one row for num1, plus a second row for num2 when it is not
#                NULL and differs from num1.
#   b_keys     - base_key = LOWER(num || ' ' || street).
#   v_keys     - per voter: base_key = LOWER(street_number || ' ' ||
#                INITCAP(TRIM(street_name))); rows with a NULL street_number or
#                street_name are excluded.
# LOWER is applied after INITCAP, so the final keys are entirely lower-case.
# The INSERT lists only res_id, struct_id and base_key, so matched_on and
# created_at take their column defaults. It is a plain inner join, so a voter
# whose key matches several building rows yields several rows.
# NOTE(review): the float -> int casts presumably raise on non-numeric house
# numbers (e.g. '12A') - confirm against the actual column contents.
sql_insert = """
WITH b_raw AS (
  SELECT
    b.struct_id,
    NULLIF(TRIM(CAST(CAST(b.st_num  AS float) AS int)::text),'') AS num1,
    NULLIF(TRIM(CAST(CAST(b.st_num2 AS float) AS int)::text),'') AS num2,
    INITCAP(TRIM(b.st_name)) AS street
  FROM buildings b
  WHERE b.st_name IS NOT NULL AND b.st_num IS NOT NULL
),
b_expanded AS (
  SELECT struct_id, num1::int AS num, street FROM b_raw WHERE num1 IS NOT NULL
  UNION ALL
  SELECT struct_id, num2::int AS num, street FROM b_raw WHERE num2 IS NOT NULL AND num2 <> num1
),
b_keys AS (
  SELECT struct_id, LOWER(CONCAT(num, ' ', street)) AS base_key
  FROM b_expanded
),
v_keys AS (
  SELECT
    v.res_id,
    LOWER(CONCAT(TRIM(CAST(v.street_number AS text)), ' ', INITCAP(TRIM(v.street_name)))) AS base_key
  FROM voters v
  WHERE v.street_number IS NOT NULL AND v.street_name IS NOT NULL
)
INSERT INTO voters_buildings_map (res_id, struct_id, base_key)
SELECT v.res_id, b.struct_id, v.base_key
FROM v_keys v
JOIN b_keys b USING (base_key);
"""

# Row count of the link table, used to report how many matches were loaded.
sql_count = "SELECT COUNT(*) FROM voters_buildings_map;"

# Run create -> truncate -> insert on one connection, commit once, then report
# the resulting row count. Any failure rolls back and is printed rather than
# raised; the connection is closed in every case.
conn = get_db_connection()
if conn:
    try:
        cur = conn.cursor()
        for statement in (sql_create, sql_truncate, sql_insert):
            cur.execute(statement)
        conn.commit()
        # The count is read after the commit.
        cur.execute(sql_count)
        row = cur.fetchone()
        print("Matches loaded:", row[0])
    except Exception as e:
        conn.rollback()
        print("Error:", e)
    finally:
        conn.close()
else:
    print("DB connection failed")

Matches loaded: 27769


In [14]:
import sys
sys.path.append('/Users/Studies/Projects/ds-abcdc-allston/fa25-team-a/web_app')
from config.database import execute_query

# Show up to 20 rows of the link table; a falsy query result is treated as
# an empty list.
preview_sql = "SELECT res_id, struct_id, base_key, matched_on, created_at FROM voters_buildings_map LIMIT 20;"
rows = execute_query(preview_sql, fetch_all=True) or []
print("voters_buildings_map (first 20):")
for i, r in enumerate(rows, start=1):
    print(f"{i}. res_id={r['res_id']}  struct_id={r['struct_id']}  base_key={r['base_key']}  matched_on={r['matched_on']}  created_at={r['created_at']}")

# Total size of the table; prints 0 when the count query returns nothing.
count_sql = "SELECT COUNT(*) AS n FROM voters_buildings_map;"
cnt = execute_query(count_sql, fetch_one=True)
print(f"\nTotal rows: {cnt['n'] if cnt else 0}")

voters_buildings_map (first 20):
1. res_id=04WDD1697001  struct_id=229910_900950  base_key=1 adamson st  matched_on=num+street  created_at=2025-10-30 00:56:55.812178
2. res_id=06BAM1991000  struct_id=229918_901024  base_key=1 aldie st  matched_on=num+street  created_at=2025-10-30 00:56:55.812178
3. res_id=10HTE0692000  struct_id=229918_901024  base_key=1 aldie st  matched_on=num+street  created_at=2025-10-30 00:56:55.812178
4. res_id=01GML2984002  struct_id=229918_901024  base_key=1 aldie st  matched_on=num+street  created_at=2025-10-30 00:56:55.812178
5. res_id=10BDL1588000  struct_id=229918_901024  base_key=1 aldie st  matched_on=num+street  created_at=2025-10-30 00:56:55.812178
6. res_id=07DJN1394002  struct_id=229918_901024  base_key=1 aldie st  matched_on=num+street  created_at=2025-10-30 00:56:55.812178
7. res_id=04HLO1293000  struct_id=229918_901024  base_key=1 aldie st  matched_on=num+street  created_at=2025-10-30 00:56:55.812178
8. res_id=02CLA2673004  struct_id=228069_899929 