# ATTOM Geodata Exploration

Working with sample data provided directly from ATTOM - not available on the open web. 

Schema:
https://docs.google.com/spreadsheets/d/1TgeTa85L_G7zMHxJfGDcqoE71IogJ1mmqXW90qelTpE/edit#gid=1353969199
    
**NOTE**: You may use [nbviewer](https://nbviewer.org/) to view this notebook outside your local Jupyter environment.  

In [1]:
# pip install geopandas

In [2]:
# pip install folium

In [3]:
# IMPORTS
import geopandas as gpd
import pandas as pd

import os
import urllib.request
import requests
import shutil
from pathlib import Path
from zipfile import ZipFile

import matplotlib.pyplot as plt
from matplotlib import pyplot

import folium

from shapely.geometry import Point, Polygon

ATTOM appears to break up all the school attendance zones into per-grade Shapefiles.

In [4]:
# list the Shapefiles (.shp) found in the ATTOM download directory
import os

filepath = '/Users/preston.mattox@finalsite.com/Downloads/attom/school-attendance-areas-9c2f0046-shp/'

# keep only the entries ending in .shp, in the order os.listdir returns them
shapefile_names = [name for name in os.listdir(filepath) if name.endswith(".shp")]

for shapefile_name in shapefile_names:
    print(shapefile_name)

school-attendance-areas-EE.shp
school-attendance-areas-08.shp
school-attendance-areas-09.shp
school-attendance-areas-PK.shp
school-attendance-areas-07.shp
school-attendance-areas-12.shp
school-attendance-areas-06.shp
school-attendance-areas-10.shp
school-attendance-areas-04.shp
school-attendance-areas-05.shp
school-attendance-areas-11.shp
school-attendance-areas-01.shp
school-attendance-areas-KG.shp
school-attendance-areas-02.shp
school-attendance-areas-03.shp


## School Boundaries (Shapefile)

In [5]:
# consider making this pick a RANDOM file from the above list in order to demonstrate consistency
file = 'school-attendance-areas-05.shp'

In [6]:
# read the selected Shapefile into a GeoDataFrame
shapefile_path = os.path.join(filepath, file)
saz = gpd.read_file(shapefile_path)

# normalize every column name to lowercase
saz.columns = [column.lower() for column in saz.columns]

# preview the first three rows
saz.head(3)

Unnamed: 0,id,schoolid,schoolnm,ncesschid,districtid,districtnm,ncesdistid,distlevel,schooltype,instrlevel,...,extradist,unassigned,prmostgrad,prhighgrad,prarea,prareakg12,areasqmi,longitude,latitude,geometry
0,ce2e23c1fc681ad9e5ca90c91402653e,0dc7376e55931c92a2d7f62d70994a81,Sunset Elementary School,064098008669,68635c93a355d9732ad2c9a75356120b,Ventura Unified School District,640980,Unified,Regular,Middle,...,N,N,N,N,N,N,38.653768,-119.326453,34.405422,"POLYGON ((-119.35745 34.46381, -119.34509 34.4..."
1,8e98c38e2ea994fc55ec03d204330433,445e174292f61523ca8534285551ec97,Unassigned,0640980G0001,68635c93a355d9732ad2c9a75356120b,Ventura Unified School District,640980,Unified,Regular,High,...,N,Y,N,N,N,N,127.477703,-119.387768,34.015617,"MULTIPOLYGON (((-119.33093 34.06506, -119.3211..."
2,5ae1641740f6072acbb027674050bd66,32bdbefff043a49402060b079a98012c,Juanamaria Elementary School,064098006754,68635c93a355d9732ad2c9a75356120b,Ventura Unified School District,640980,Unified,Regular,Primary,...,N,N,Y,Y,Y,Y,6.919899,-119.196911,34.30793,"MULTIPOLYGON (((-119.15528 34.27559, -119.1551..."


In [7]:
# which geometry types appear in this layer?
print(saz.geom_type.unique())

# raise pandas' display row cap to the number of columns before printing
# the per-column dtype listing
column_count = len(saz.columns)
pd.set_option('display.max_rows', column_count)

# dtype of every column in the dataset
print(saz.dtypes)


['Polygon' 'MultiPolygon']
id              object
schoolid        object
schoolnm        object
ncesschid       object
districtid      object
districtnm      object
ncesdistid      object
distlevel       object
schooltype      object
instrlevel      object
gradelow        object
gradehigh       object
elemschind      object
middschind      object
highschind      object
grade           object
gradesort        int64
eeind           object
pkind           object
tkgind          object
kgind           object
tg01ind         object
g01ind          object
g02ind          object
g03ind          object
g04ind          object
g05ind          object
g06ind          object
g07ind          object
g08ind          object
g09ind          object
g10ind          object
g11ind          object
g12ind          object
ugind           object
saaeeind        object
saapkind        object
saatkgind       object
saakgind        object
saatg01ind      object
saag01ind       object
saag02ind       object
saag03i

In [8]:
# draw every attendance zone in this layer on an interactive folium map

# start from folium's default base map (no explicit center is given here;
# the view is fitted to the data bounds further down)
map_saz = folium.Map()

# hover annotation showing the school name of each zone
tooltip = folium.GeoJsonTooltip(
    fields=["schoolnm"],
    aliases=["School:"],
    labels=True,
    localize=True,
    sticky=False,
    max_width=800,
    style="""
        background-color: #F0EFEF;
        border: 2px solid black;
        border-radius: 3px;
        box-shadow: 3px;
    """,
)


def zone_style(feature):
    # same blue fill and outline for every zone
    return {'fillColor': '#2196F3', 'color': '#2196F3'}


# add the school zones (geometry + name only) to the map
zones_layer = folium.GeoJson(
    saz[['geometry', 'schoolnm']],
    style_function=zone_style,
    tooltip=tooltip,
)
zones_layer.add_to(map_saz)

# total_bounds comes back as [minx, miny, maxx, maxy]; each corner is passed
# to fit_bounds with its two coordinates swapped (y first, then x)
bounds = saz.total_bounds.tolist()
minx, miny, maxx, maxy = bounds
map_saz.fit_bounds([[miny, minx], [maxy, maxx]])

# display the map
map_saz

In [14]:
# pick a single zone at random and map it on its own for easier examination
random_saz = saz.sample(1)

# start from folium's default base map (no explicit center is given here;
# the view is fitted to the sampled zone further down)
map_saz = folium.Map()

# hover annotation: school name plus the grade fields for the sampled zone
tooltip = folium.GeoJsonTooltip(
    fields=["schoolnm", "grade", "gradelow", "gradehigh"],
    aliases=["School:", "Grade (this layer):", "Low Grade:", "High Grade"],
    labels=True,
    localize=True,
    sticky=False,
    max_width=800,
    style="""
        background-color: #F0EFEF;
        border: 2px solid black;
        border-radius: 3px;
        box-shadow: 3px;
    """,
)


def zone_style(feature):
    # same blue fill and outline for every zone
    return {'fillColor': '#2196F3', 'color': '#2196F3'}


# add the sampled zone to the map
zone_layer = folium.GeoJson(
    random_saz,
    style_function=zone_style,
    tooltip=tooltip,
)
zone_layer.add_to(map_saz)

# total_bounds comes back as [minx, miny, maxx, maxy]; each corner is passed
# to fit_bounds with its two coordinates swapped (y first, then x)
bounds = random_saz.total_bounds.tolist()
minx, miny, maxx, maxy = bounds
map_saz.fit_bounds([[miny, minx], [maxy, maxx]])

# display the map
map_saz

Though broken out into separate layers/shapes, each polygon in the ATTOM data includes a binary mapping that shows whether or not the coverage area is shared across other grades. Let's take a look at that for our above zone.

In [20]:
# show which school the sampled zone belongs to
print(random_saz["schoolnm"])

# the per-grade 'saa...ind' indicator columns, followed by the remaining
# zone-level flag columns
indicator_columns = [
    'saaeeind', 'saapkind', 'saatkgind', 'saakgind', 'saatg01ind',
    'saag01ind', 'saag02ind', 'saag03ind', 'saag04ind', 'saag05ind',
    'saag06ind', 'saag07ind', 'saag08ind', 'saag09ind', 'saag10ind',
    'saag11ind', 'saag12ind', 'saaugind',
    'defactoind', 'openenroll', 'paroverlap', 'fuloverlap', 'multibndry',
]

# flip to one row per indicator so the single zone reads top-to-bottom
random_saz[indicator_columns].T

178    Gold Academy Elementary School
Name: schoolnm, dtype: object


Unnamed: 0,178
saaeeind,N
saapkind,N
saatkgind,N
saakgind,N
saatg01ind,N
saag01ind,Y
saag02ind,Y
saag03ind,Y
saag04ind,Y
saag05ind,Y


----

#TODO: look at school values - some UNASSIGNED?

Let's dig a little deeper into this data to see if the quality is good.

In [24]:
# Count missing values per column.
# NOTE: in pandas, isnull() is an alias of isna(), so the two counts below are
# always identical; both are kept so the result table retains its columns.
null_counts = saz.isnull().sum()
nan_counts = saz.isna().sum()

# Each missing cell must be counted once. Adding the isnull() and isna()
# totals together (as was done previously) reports twice the real number.
missing_counts = saz.isna().sum()

# Combine the counts into a DataFrame for better presentation
result_df = pd.DataFrame({
    'Null_Count': null_counts,
    'NaN_Count': nan_counts,
    'Missing_Count': missing_counts,
})

result_df

Unnamed: 0,Null_Count,NaN_Count,Missing_Count
id,0,0,0
schoolid,0,0,0
schoolnm,0,0,0
ncesschid,0,0,0
districtid,0,0,0
districtnm,0,0,0
ncesdistid,0,0,0
distlevel,0,0,0
schooltype,0,0,0
instrlevel,0,0,0


In [36]:
# Placeholder school names to search for
search_values = ['unknown', 'unassigned', 'missing']

# Build a regex alternation of the search values. The pattern itself is not
# case-insensitive; that comes from case=False in the call below.
pattern = '|'.join(search_values)

# Select rows whose school name contains any of the values.
# na=False makes a missing school name count as "no match"; without it the
# mask would contain NaN and the boolean indexing would raise.
unk_rows = saz[saz['schoolnm'].str.contains(pattern, case=False, na=False)]

# Display the identifying columns of the matching rows
unk_rows[['schoolid','schoolnm','ncesschid','districtnm','gradelow','gradehigh','geometry']]

Unnamed: 0,schoolid,schoolnm,ncesschid,districtnm,gradelow,gradehigh,geometry
1,445e174292f61523ca8534285551ec97,Unassigned,0640980G0001,Ventura Unified School District,KG,12,"MULTIPOLYGON (((-119.33093 34.06506, -119.3211..."
183,a12dda1105cb3c2ef513a0b0fbc381a7,Unassigned,0628170G0001,Ocean View Elementary School District,KG,8,"POLYGON ((-119.11410 34.09592, -119.11006 34.0..."
209,05c25ac9bd6d935c04cf5ac61117ee4b,Unassigned,0617850G0001,Hueneme Elementary School District,KG,8,"POLYGON ((-119.19494 34.17350, -119.19495 34.1..."
210,cccdfe4774aeb595ed414a776b8593c1,Unassigned,0617850G0002,Hueneme Elementary School District,KG,8,"POLYGON ((-119.22840 34.15712, -119.22831 34.1..."
215,cd7a9850210fbb3540fe639acfa33399,Unassigned,0617850G0003,Hueneme Elementary School District,KG,5,"POLYGON ((-119.17846 34.14742, -119.17808 34.1..."
290,166a891c56cd677ccbc36a3211961b02,Unassigned,0636840G0001,Simi Valley Unified School District,KG,12,"MULTIPOLYGON (((-118.68551 34.25099, -118.6854..."
291,c81c90dfba518faa7dcd763226d55760,Unassigned,0636840G0002,Simi Valley Unified School District,KG,12,"POLYGON ((-118.82012 34.24191, -118.81532 34.2..."
303,0a7880257a0ff974c7aed8c0db4be12d,Unassigned,0629220G0001,Oxnard Elementary School District,PK,8,"POLYGON ((-119.24613 34.19319, -119.24608 34.1..."
357,574e1682afccbba29c6e7a54209b930d,Unassigned,2722950G0001,Mounds View Public School District,EE,12,"POLYGON ((-93.14747 45.07923, -93.15100 45.079..."
373,e72bbf1cb97c2a49f19ffd32dad0cd33,Unassigned,0627850G0001,Oak Park Unified School District,KG,12,"POLYGON ((-118.67297 34.23871, -118.66916 34.2..."


In [69]:
# Share of rows whose school name matched the "unknown" search.
# Guard against an empty dataset so the division cannot fail.
unk_share = len(unk_rows) / len(saz) if len(saz) else 0.0

# The '%' format type multiplies by 100 and rounds to one decimal when
# formatting, which avoids float artifacts such as 2.9000000000000004 that
# round(x, 3) * 100 can produce.
print(f"Unknown schools exist in {unk_share:.1%} of the entries in this dataset")

Unknown schools exist in 2.5% of the entries in this dataset


In [50]:
# draw only the zones with an unknown/unassigned school on a folium map

# start from folium's default base map; the view is fitted to the data below
map_unk = folium.Map()

# hover annotation showing the school name and district of each zone
tooltip = folium.GeoJsonTooltip(
    fields=["schoolnm","districtnm"],
    aliases=["School:","District"],
    labels=True,
    localize=True,
    sticky=False,
    max_width=800,
    style="""
        background-color: #F0EFEF;
        border: 2px solid black;
        border-radius: 3px;
        box-shadow: 3px;
    """,
)


def zone_style(feature):
    # same blue fill and outline for every zone
    return {'fillColor': '#2196F3', 'color': '#2196F3'}


# add the unknown-school zones to the map
unknown_layer = folium.GeoJson(
    unk_rows,
    style_function=zone_style,
    tooltip=tooltip,
)
unknown_layer.add_to(map_unk)

# total_bounds comes back as [minx, miny, maxx, maxy]; each corner is passed
# to fit_bounds with its two coordinates swapped (y first, then x)
bounds = unk_rows.total_bounds.tolist()
minx, miny, maxx, maxy = bounds
map_unk.fit_bounds([[miny, minx], [maxy, maxx]])

# display the map
map_unk

We will probably need to dig into each of these to figure out what's going on with the unknowns

In [12]:
# TODO: external validation

### External validation of this data

We can use the [NCES school search](https://nces.ed.gov/ccd/schoolsearch/) to spot check these. 

`490014200768` should resolve to [Quail Hollow Elementary](https://nces.ed.gov/ccd/schoolsearch/school_list.asp?Search=1&InstName=&SchoolID=490014200768&Address=&City=&State=&Zip=&Miles=&County=&PhoneAreaCode=&Phone=&DistrictName=&DistrictID=&SchoolType=1&SchoolType=2&SchoolType=3&SchoolType=4&SpecificSchlTypes=all&IncGrade=-1&LoGrade=-1&HiGrade=-1), serving grades K-5.


In [13]:
# pull the row(s) matching Quail Hollow Elementary's NCES ID from our df
# NOTE(review): `saz_all_att` is not defined anywhere in the visible notebook
# (this cell's recorded output is a NameError), and these column names differ
# from the lowercase columns of `saz` -- presumably carried over from another
# dataset; confirm before relying on this cell.
display_columns = [
    'obj_id', 'obj_name', 'nces_schid', 'low_grade', 'high_grade', 'geometry',
]
quail_hollow_mask = saz_all_att['nces_schid'].isin(['490014200768'])
test1 = saz_all_att.loc[quail_hollow_mask, display_columns]
test1

NameError: name 'saz_all_att' is not defined

In [None]:
# map the Quail Hollow Elementary lookup result (test1)

# base map centered on Salt Lake City, UT
slc_center = [40.68, -111.89]
map_test1 = folium.Map(location=slc_center, zoom_start=10)

# hover annotation showing the school name and its grade range
tooltip = folium.GeoJsonTooltip(
    fields=["obj_name", "low_grade", "high_grade"],
    aliases=["School:", "Low Grade:", "High Grade:"],
    labels=True,
    localize=True,
    sticky=False,
    max_width=800,
    style="""
        background-color: #F0EFEF;
        border: 2px solid black;
        border-radius: 3px;
        box-shadow: 3px;
    """,
)


def zone_style(feature):
    # same blue fill and outline for every zone
    return {'fillColor': '#2196F3', 'color': '#2196F3'}


# add the school zone to the map
test_layer = folium.GeoJson(
    test1,
    style_function=zone_style,
    tooltip=tooltip,
)
test_layer.add_to(map_test1)

# total_bounds comes back as [minx, miny, maxx, maxy]; each corner is passed
# to fit_bounds with its two coordinates swapped (y first, then x), so the
# view zooms to this one polygon
bounds = test1.total_bounds.tolist()
minx, miny, maxx, maxy = bounds
map_test1.fit_bounds([[miny, minx], [maxy, maxx]])

# display the map
map_test1

Most of these look good, but there are a few trouble spots. Not sure this matters to us, but it's worth calling out.

For example, [Olene Walker Elementary](https://nces.ed.gov/ccd/schoolsearch/school_detail.asp?Search=1&InstName=OLENE+WALKER&SchoolType=1&SchoolType=2&SchoolType=3&SchoolType=4&SpecificSchlTypes=all&IncGrade=-1&LoGrade=-1&HiGrade=-1&ID=490036000259), NCES School ID: `490036000259`, doesn't resolve correctly from the Precisely data.

In [None]:
# look up Olene Walker Elementary by its NCES school ID
# NOTE(review): `saz_all_att` is not defined in the visible notebook -- confirm
# which dataframe this lookup is meant to run against.
olene_id_mask = saz_all_att['nces_schid'].isin(['490036000259'])
saz_all_att.loc[olene_id_mask]

In [None]:
# ...but Olene Walker Elementary definitely is in our dataset when searched by name
# NOTE(review): `saz_all_att` is not defined in the visible notebook -- confirm
# which dataframe this lookup is meant to run against.
olene_columns = [
    'obj_id', 'obj_name', 'nces_schid', 'low_grade', 'high_grade', 'geometry',
]
olene_name_mask = saz_all_att['obj_name'].str.contains('OLENE', regex=True)
saz_all_att[olene_name_mask][olene_columns]