### Data sources:

**1) NYS regents results (2022-2023):**

Report Card Database (332.61 megabytes)
This Access database contains accountability statuses and indicator data (performance, chronic absenteeism, graduation rate, participation rate), assessment results (elementary- and intermediate-level ELA, Math, and Science; Annual Regents; Total Cohort Regents; NYSESLAT; NYSAA), and postsecondary enrollment rate for the state, districts, public and charter schools, by county, and Need to Resource Capacity group. The database also includes Staff Qualifications and Expenditures per Pupil for the state, districts, and public and charter schools.<br>
https://data.nysed.gov/downloads.php<br>
https://data.nysed.gov/files/essa/22-23/SRC2023.zip<br>

**2) NYS high schools enrollment:**

Enrollment Database (6.27 megabytes)
This database contains school, district, county, and statewide enrollment by grade, race/ethnicity, gender, and other groups.<br>
https://data.nysed.gov/downloads.php<br>
https://data.nysed.gov/files/enrollment/22-23/enrollment_2023.zip<br>

**3) Schools locations**

NYS GIS Clearinghouse: NYS Schools<br>
https://data.gis.ny.gov/maps/b6c624c740e4476689aa60fdc4aacb8f/about

### Imports

In [1]:
import os
import sys
import pandas as pd
import geopandas as gpd
# import matplotlib.pyplot as plt
import folium
# from shapely.geometry import Point
# from fuzzywuzzy import process
# import fuzzywuzzy
# import base64
# from io import BytesIO
import math
from tqdm import tqdm
import importlib
# import utils


pd.set_option('display.float_format', '{:.3f}'.format)

In [2]:
# Put the project's notebooks folder on the import path
# (presumably where the local `utils` module imported below lives - confirm)
parent_dir = r'C:\GITHUB\NY_schools_maps\notebooks'
sys.path.append(parent_dir)

In [3]:
# importlib.reload(utils)
from utils import match_name



In [4]:
# Root folder of the project data, and the names of the sub-folders
# the inputs are read from and the outputs are written to
basePath = r"G:\My Drive\Kids\NYC_schools_mapped"
dataFolder, outputFolder = "raw_data", "processed_data"

In [5]:
# Load the school locations (GeoJSON) into a GeoDataFrame
NYSSchoolsPath = os.path.join(basePath, dataFolder, 'NYS_Schools.geojson')
print(NYSSchoolsPath)
NYSSchoolsGeom = gpd.read_file(NYSSchoolsPath)

# The path is only needed for loading; keep the notebook namespace tidy
del NYSSchoolsPath

G:\My Drive\Kids\NYC_schools_mapped\raw_data\NYS_Schools.geojson


In [6]:
# Load the workbook with the numbers of diplomas
diplomasPath = os.path.join(basePath, dataFolder, "Regents results by school and cohorts.xlsx")
print(diplomasPath)
diplomasDF = pd.read_excel(diplomasPath)

# The path is only needed for loading; keep the notebook namespace tidy
del diplomasPath

G:\My Drive\Kids\NYC_schools_mapped\raw_data\Regents results by school and cohorts.xlsx


In [7]:
# Load the workbook with the regents scores
scoresPath = os.path.join(basePath, dataFolder, "Regents results by school_2.xlsx")
print(scoresPath)
scoresDF = pd.read_excel(scoresPath)

# The path is only needed for loading; keep the notebook namespace tidy
del scoresPath

G:\My Drive\Kids\NYC_schools_mapped\raw_data\Regents results by school_2.xlsx


In [8]:
# Load the workbook with the school enrollment numbers
enrollmentPath = os.path.join(basePath, dataFolder, "Number of high schoolers by school.xlsx")
print(enrollmentPath)
enrollmentDF = pd.read_excel(enrollmentPath)

# The path is only needed for loading; keep the notebook namespace tidy
del enrollmentPath

G:\My Drive\Kids\NYC_schools_mapped\raw_data\Number of high schoolers by school.xlsx


_______________________________________________________________________________

School names are stored:
- in the *diplomasDF* in the column *'aggregation_name'*
- in the *enrollmentDF* in the column *'ENTITY_NAME'*
- in the *scoresDF* in the column *'SCHOOL_NAME'*
- in the *NYSSchoolsGeom* in the column *'LEGAL_NAME'*
________________________________________________________________________________

### Preparing data

#### By dataframes

##### Processing dataframe with numbers of graduates and types of diplomas

In [9]:
diplomasDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7746 entries, 0 to 7745
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   report_school_year          7746 non-null   object
 1   aggregation_type            7746 non-null   object
 2   aggregation_name            7746 non-null   object
 3   subgroup_name               7746 non-null   object
 4   grad_cnt                    7746 non-null   object
 5   local_cnt                   7746 non-null   object
 6   reg_cnt                     7746 non-null   object
 7   reg_adv_cnt                 7746 non-null   object
 8   non_diploma_credential_cnt  7746 non-null   object
 9   still_enr_cnt               7746 non-null   object
 10  ged_cnt                     7746 non-null   object
 11  dropout_cnt                 7746 non-null   object
 12  membership_code             7746 non-null   int64 
 13  membership_desc             7746 non-null   obje

In [10]:
# Checking the school year covered
diplomasDF['report_school_year'].unique()

array(['2022-23'], dtype=object)

In [11]:
# The count columns of diplomasDF were read as text (dtype `object`); turn them
# into numbers. errors='coerce' replaces any value that cannot be parsed with NaN.
for count_column in (
    'grad_cnt',
    'local_cnt',
    'reg_cnt',
    'reg_adv_cnt',
    'non_diploma_credential_cnt',
    'still_enr_cnt',
    'ged_cnt',
    'dropout_cnt',
):
    diplomasDF[count_column] = pd.to_numeric(diplomasDF[count_column], errors='coerce')

del count_column

In [12]:
# Keep only the rows with membership_code 11 (the 2019 cohort)
# so that schools are compared on the same cohort
diplomasDF = diplomasDF.loc[diplomasDF['membership_code'] == 11]

In [13]:
diplomasDF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1295 entries, 4 to 7744
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   report_school_year          1295 non-null   object 
 1   aggregation_type            1295 non-null   object 
 2   aggregation_name            1295 non-null   object 
 3   subgroup_name               1295 non-null   object 
 4   grad_cnt                    1273 non-null   float64
 5   local_cnt                   1273 non-null   float64
 6   reg_cnt                     1273 non-null   float64
 7   reg_adv_cnt                 1273 non-null   float64
 8   non_diploma_credential_cnt  1273 non-null   float64
 9   still_enr_cnt               1273 non-null   float64
 10  ged_cnt                     1273 non-null   float64
 11  dropout_cnt                 1273 non-null   float64
 12  membership_code             1295 non-null   int64  
 13  membership_desc             1295 

##### Processing dataframe with number of students in high school grades by school

In [14]:
enrollmentDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1347 entries, 0 to 1346
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ENTITY_NAME  1347 non-null   object
 1   Year         1347 non-null   int64 
 2   12           1347 non-null   int64 
 3   11           1347 non-null   int64 
 4   10           1347 non-null   int64 
 5   9            1347 non-null   int64 
dtypes: int64(5), object(1)
memory usage: 63.3+ KB


In [15]:
enrollmentDF.head()

Unnamed: 0,ENTITY_NAME,Year,12,11,10,9
0,ALBANY HIGH SCHOOL,2023,611,581,654,754
1,GREEN TECH HIGH CHARTER SCHOOL,2023,59,55,74,118
2,ALBANY LEADERSHIP CS-GIRLS,2023,44,61,71,86
3,BERNE-KNOX-WESTERLO JUNIOR-SENIOR HS,2023,49,58,61,52
4,BETHLEHEM CENTRAL SENIOR HIGH SCHOOL,2023,338,345,339,330


In [16]:
# Total high-school enrollment per school for 2022-2023: sum of grades 9 through 12
grade_columns = ['12', '11', '10', '9']
enrollmentDF['HStotal'] = enrollmentDF[grade_columns].sum(axis='columns')
del grade_columns

In [17]:
enrollmentDF.head()

Unnamed: 0,ENTITY_NAME,Year,12,11,10,9,HStotal
0,ALBANY HIGH SCHOOL,2023,611,581,654,754,2600
1,GREEN TECH HIGH CHARTER SCHOOL,2023,59,55,74,118,306
2,ALBANY LEADERSHIP CS-GIRLS,2023,44,61,71,86,262
3,BERNE-KNOX-WESTERLO JUNIOR-SENIOR HS,2023,49,58,61,52,220
4,BETHLEHEM CENTRAL SENIOR HIGH SCHOOL,2023,338,345,339,330,1352


##### Processing the dataframe with regents scores

In [18]:
scoresDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82829 entries, 0 to 82828
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   SCHOOL_NAME    82829 non-null  object
 1   YEAR           82829 non-null  int64 
 2   SUBGROUP_NAME  82829 non-null  object
 3   SUBJECT        82829 non-null  object
 4   NUM_LEVEL1     82829 non-null  object
 5   NUM_LEVEL2     82829 non-null  object
 6   NUM_LEVEL3     82829 non-null  object
 7   NUM_LEVEL4     82829 non-null  object
 8   NUM_LEVEL5     82829 non-null  object
 9   TOTAL_EXEMPT   82829 non-null  int64 
dtypes: int64(2), object(8)
memory usage: 6.3+ MB


In [19]:
# The level-count columns of scoresDF were read as text (dtype `object`); turn them
# into numbers. errors='coerce' replaces any value that cannot be parsed with NaN.
for level_column in (
    'NUM_LEVEL1',
    'NUM_LEVEL2',
    'NUM_LEVEL3',
    'NUM_LEVEL4',
    'NUM_LEVEL5',
):
    scoresDF[level_column] = pd.to_numeric(scoresDF[level_column], errors='coerce')

del level_column

In [20]:
# Checking school years covered
scoresDF['YEAR'].unique()

array([2022, 2023], dtype=int64)

In [21]:
# Keep only the rows for the 2022-23 school year (YEAR == 2023)
scoresDF_2023 = scoresDF.loc[scoresDF['YEAR'] == 2023]

In [22]:
scoresDF_2023.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40827 entries, 3 to 82828
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   SCHOOL_NAME    40827 non-null  object 
 1   YEAR           40827 non-null  int64  
 2   SUBGROUP_NAME  40827 non-null  object 
 3   SUBJECT        40827 non-null  object 
 4   NUM_LEVEL1     39366 non-null  float64
 5   NUM_LEVEL2     39366 non-null  float64
 6   NUM_LEVEL3     39366 non-null  float64
 7   NUM_LEVEL4     39366 non-null  float64
 8   NUM_LEVEL5     39366 non-null  float64
 9   TOTAL_EXEMPT   40827 non-null  int64  
dtypes: float64(5), int64(2), object(3)
memory usage: 3.4+ MB


In [23]:
scoresDF_2023['YEAR'].unique()

array([2023], dtype=int64)

____________________________________________________________________________________________
The schools will be compared by pseudo regents scores. For each level N (1-5), the score $S_N$ is the share of level-N results among all instances of a student taking a regents exam at the school this school year, regardless of subject:<br>
$S_N = \dfrac{\sum \text{instances where a student obtained level } N \text{ on a regents exam}}{\sum \text{all instances where a student took a regents exam}}$
____________________________________________________________________________________________

In [24]:
# Pseudo regents scores: per school, total the counts at each level over every
# 2023 row and express each level as a share of the school's overall total.
# NOTE(review): rows for every SUBJECT and every SUBGROUP_NAME are summed here;
# if the subgroups overlap, the same exams are counted more than once - confirm
# that this is intended.
level_columns = ['NUM_LEVEL1', 'NUM_LEVEL2', 'NUM_LEVEL3', 'NUM_LEVEL4', 'NUM_LEVEL5']
scoresDF_2023_bySchools = scoresDF_2023.groupby(['SCHOOL_NAME'])[level_columns].sum()
level_totals = scoresDF_2023_bySchools.sum(axis=1)
# Schools whose total is 0 come out as NaN (0 / 0)
scoresDF_2023_bySchNorm = scoresDF_2023_bySchools.div(level_totals, axis=0).reset_index()
del level_columns, level_totals

In [25]:
scoresDF_2023_bySchNorm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2586 entries, 0 to 2585
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   SCHOOL_NAME  2586 non-null   object 
 1   NUM_LEVEL1   2544 non-null   float64
 2   NUM_LEVEL2   2544 non-null   float64
 3   NUM_LEVEL3   2544 non-null   float64
 4   NUM_LEVEL4   2544 non-null   float64
 5   NUM_LEVEL5   2544 non-null   float64
dtypes: float64(5), object(1)
memory usage: 121.3+ KB


In [26]:
scoresDF_2023_bySchNorm.head()

Unnamed: 0,SCHOOL_NAME,NUM_LEVEL1,NUM_LEVEL2,NUM_LEVEL3,NUM_LEVEL4,NUM_LEVEL5
0,30TH AVENUE SCHOOL,0.012,0.012,0.131,0.536,0.31
1,30TH AVENUE SCHOOL (THE),0.012,0.012,0.131,0.536,0.31
2,47 AMER SIGN LANG & ENG LOWER,,,,,
3,A A KINGSTON MIDDLE SCHOOL,0.0,0.0,0.034,0.552,0.414
4,A D OLIVER MIDDLE SCHOOL,0.0,0.0,0.188,0.594,0.217


#### Joining the data into single geodataframe for mapping

In [42]:
# Drop any joinedDF left over from a previous run so the join starts clean.
# A bare `del joinedDF` raises NameError on a fresh kernel, so the name is
# removed only if it exists.
globals().pop('joinedDF', None)

In [43]:
# Attach the per-school regents level shares to the enrollment table.
# how='left' keeps every enrollmentDF row, including schools without a score match.
joinedDF = pd.merge(
    enrollmentDF,
    scoresDF_2023_bySchNorm,
    how='left',
    left_on='ENTITY_NAME',
    right_on='SCHOOL_NAME',
)

In [44]:
joinedDF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1347 entries, 0 to 1346
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ENTITY_NAME  1347 non-null   object 
 1   Year         1347 non-null   int64  
 2   12           1347 non-null   int64  
 3   11           1347 non-null   int64  
 4   10           1347 non-null   int64  
 5   9            1347 non-null   int64  
 6   HStotal      1347 non-null   int64  
 7   SCHOOL_NAME  1256 non-null   object 
 8   NUM_LEVEL1   1252 non-null   float64
 9   NUM_LEVEL2   1252 non-null   float64
 10  NUM_LEVEL3   1252 non-null   float64
 11  NUM_LEVEL4   1252 non-null   float64
 12  NUM_LEVEL5   1252 non-null   float64
dtypes: float64(5), int64(6), object(2)
memory usage: 147.3+ KB


In [45]:
# Remove the rows that describe school districts rather than schools:
# any ENTITY_NAME containing one of these markers is dropped
for district_marker in ('NYC GEOG DIST', 'HS DISTRICT', 'CSD'):
    joinedDF = joinedDF[~joinedDF['ENTITY_NAME'].str.contains(district_marker)]
del district_marker


In [46]:
joinedDF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1297 entries, 0 to 1346
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ENTITY_NAME  1297 non-null   object 
 1   Year         1297 non-null   int64  
 2   12           1297 non-null   int64  
 3   11           1297 non-null   int64  
 4   10           1297 non-null   int64  
 5   9            1297 non-null   int64  
 6   HStotal      1297 non-null   int64  
 7   SCHOOL_NAME  1255 non-null   object 
 8   NUM_LEVEL1   1251 non-null   float64
 9   NUM_LEVEL2   1251 non-null   float64
 10  NUM_LEVEL3   1251 non-null   float64
 11  NUM_LEVEL4   1251 non-null   float64
 12  NUM_LEVEL5   1251 non-null   float64
dtypes: float64(5), int64(6), object(2)
memory usage: 141.9+ KB


In [47]:
# Attach the diploma counts; how='left' keeps schools without a diploma record.
# NOTE(review): the recorded .info() outputs show the row count growing from
# 1297 to 1299 across this merge, so some aggregation_name values occur more
# than once in diplomasDF - confirm whether the duplicates are wanted.
joinedDF = pd.merge(
    joinedDF,
    diplomasDF,
    how='left',
    left_on='ENTITY_NAME',
    right_on='aggregation_name',
)

In [48]:
joinedDF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1299 entries, 0 to 1298
Data columns (total 27 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ENTITY_NAME                 1299 non-null   object 
 1   Year                        1299 non-null   int64  
 2   12                          1299 non-null   int64  
 3   11                          1299 non-null   int64  
 4   10                          1299 non-null   int64  
 5   9                           1299 non-null   int64  
 6   HStotal                     1299 non-null   int64  
 7   SCHOOL_NAME                 1257 non-null   object 
 8   NUM_LEVEL1                  1253 non-null   float64
 9   NUM_LEVEL2                  1253 non-null   float64
 10  NUM_LEVEL3                  1253 non-null   float64
 11  NUM_LEVEL4                  1253 non-null   float64
 12  NUM_LEVEL5                  1253 non-null   float64
 13  report_school_year          818 n

In [32]:
# Remove the rows that describe school districts rather than schools:
# any ENTITY_NAME containing one of these markers is dropped
for district_marker in ('NYC GEOG DIST', 'HS DISTRICT', 'CSD'):
    joinedDF = joinedDF[~joinedDF['ENTITY_NAME'].str.contains(district_marker)]
del district_marker

In [33]:
joinedDF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1299 entries, 0 to 1298
Data columns (total 27 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ENTITY_NAME                 1299 non-null   object 
 1   Year                        1299 non-null   int64  
 2   12                          1299 non-null   int64  
 3   11                          1299 non-null   int64  
 4   10                          1299 non-null   int64  
 5   9                           1299 non-null   int64  
 6   HStotal                     1299 non-null   int64  
 7   SCHOOL_NAME                 1257 non-null   object 
 8   NUM_LEVEL1                  1253 non-null   float64
 9   NUM_LEVEL2                  1253 non-null   float64
 10  NUM_LEVEL3                  1253 non-null   float64
 11  NUM_LEVEL4                  1253 non-null   float64
 12  NUM_LEVEL5                  1253 non-null   float64
 13  report_school_year          818 n

In [35]:
joinedDF.head()

Unnamed: 0,ENTITY_NAME,Year,12,11,10,9,HStotal,SCHOOL_NAME,NUM_LEVEL1,NUM_LEVEL2,...,grad_cnt,local_cnt,reg_cnt,reg_adv_cnt,non_diploma_credential_cnt,still_enr_cnt,ged_cnt,dropout_cnt,membership_code,membership_desc
0,ALBANY HIGH SCHOOL,2023,611,581,654,754,2600,ALBANY HIGH SCHOOL,0.301,0.214,...,548.0,0.0,417.0,131.0,6.0,76.0,0.0,74.0,11.0,2019 Total Cohort - 4 Year Outcome - August 2023
1,GREEN TECH HIGH CHARTER SCHOOL,2023,59,55,74,118,306,GREEN TECH HIGH CHARTER SCHOOL,0.539,0.24,...,49.0,0.0,33.0,16.0,0.0,5.0,0.0,0.0,11.0,2019 Total Cohort - 4 Year Outcome - August 2023
2,ALBANY LEADERSHIP CS-GIRLS,2023,44,61,71,86,262,ALBANY LEADERSHIP CS-GIRLS,0.356,0.214,...,50.0,0.0,41.0,9.0,0.0,1.0,0.0,4.0,11.0,2019 Total Cohort - 4 Year Outcome - August 2023
3,BERNE-KNOX-WESTERLO JUNIOR-SENIOR HS,2023,49,58,61,52,220,BERNE-KNOX-WESTERLO JUNIOR-SENIOR HS,0.1,0.11,...,,,,,,,,,,
4,BETHLEHEM CENTRAL SENIOR HIGH SCHOOL,2023,338,345,339,330,1352,BETHLEHEM CENTRAL SENIOR HIGH SCHOOL,0.01,0.026,...,333.0,0.0,110.0,223.0,6.0,2.0,0.0,8.0,11.0,2019 Total Cohort - 4 Year Outcome - August 2023


In [36]:
list(NYSSchoolsGeom.columns)

['OBJECTID',
 'LEGAL_NAME',
 'PHYSADDRLINE1',
 'PHYSADDRLINE2',
 'PHYSCITY',
 'PHYSICALSTATE',
 'PHYSZIPCD5',
 'COUNTY_DESC',
 'Contact_Name',
 'CEO_TITLE',
 'CEO_PHONENUM',
 'CEO_EMAIL',
 'INST_TYPE_DESC',
 'INSTSUBTYPDESC',
 'RECORD_TYPE_DESC',
 'COMMUNITY_TYPE_DESC',
 'DIST_TYPE_DESC',
 'SDL_DESC',
 'INSTIT_ID',
 'SED_CODE',
 'geometry']

In [37]:
NYSSchoolsGeom.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 8125 entries, 0 to 8124
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   OBJECTID             8125 non-null   int64   
 1   LEGAL_NAME           8125 non-null   object  
 2   PHYSADDRLINE1        8125 non-null   object  
 3   PHYSADDRLINE2        8125 non-null   object  
 4   PHYSCITY             8125 non-null   object  
 5   PHYSICALSTATE        8125 non-null   object  
 6   PHYSZIPCD5           8125 non-null   object  
 7   COUNTY_DESC          8125 non-null   object  
 8   Contact_Name         8125 non-null   object  
 9   CEO_TITLE            8125 non-null   object  
 10  CEO_PHONENUM         7918 non-null   object  
 11  CEO_EMAIL            8125 non-null   object  
 12  INST_TYPE_DESC       8125 non-null   object  
 13  INSTSUBTYPDESC       8125 non-null   object  
 14  RECORD_TYPE_DESC     8125 non-null   object  
 15  COMMUNITY_TYP

In [38]:
# Address, contact and administrative columns that are not needed for the map
columns_to_drop = [
    'PHYSADDRLINE1', 'PHYSADDRLINE2', 'PHYSCITY', 'PHYSICALSTATE', 'PHYSZIPCD5',
    'COUNTY_DESC',
    'Contact_Name', 'CEO_TITLE', 'CEO_PHONENUM', 'CEO_EMAIL',
    'INST_TYPE_DESC', 'RECORD_TYPE_DESC', 'COMMUNITY_TYPE_DESC', 'DIST_TYPE_DESC',
    'INSTIT_ID', 'SED_CODE',
]
# Slimmed-down copy of the locations; the original GeoDataFrame is left untouched
NYSSchoolsGeom_short = NYSSchoolsGeom.drop(columns=columns_to_drop)

In [49]:
# Match every ENTITY_NAME of the school data against the LEGAL_NAME values of the
# school locations (GeoJSON). The recorded run below took about 21 minutes.
tqdm.pandas(desc="Matching Names")

legal_names = NYSSchoolsGeom_short['LEGAL_NAME']


def _best_match(entity_name):
    # match_name comes from the local utils module; its results are indexed
    # further down as [0] -> matched name and [1] -> score
    return match_name(entity_name, legal_names, min_score=73)


matched_tuples = joinedDF['ENTITY_NAME'].progress_apply(_best_match)

print('Done.')

Matching Names: 100%|██████████████████████████████████████████████████████████████| 1299/1299 [21:04<00:00,  1.03it/s]

Done.





In [59]:
NYSSchoolsGeom_short.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 8125 entries, 0 to 8124
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   OBJECTID        8125 non-null   int64   
 1   LEGAL_NAME      8125 non-null   object  
 2   INSTSUBTYPDESC  8125 non-null   object  
 3   SDL_DESC        8125 non-null   object  
 4   geometry        8125 non-null   geometry
dtypes: geometry(1), int64(1), object(3)
memory usage: 317.5+ KB


In [50]:
print('Appending mathes to the dataframe.')
# Unzip the match results once: position 0 holds the names, position 1 the scores
unzipped_matches = list(zip(*matched_tuples))
joinedDF['matched_name'] = unzipped_matches[0]
joinedDF['matched_score'] = unzipped_matches[1]
del unzipped_matches
print('Done.')

Appending mathes to the dataframe.
Done.


In [61]:
joinedDF.head()

Unnamed: 0,ENTITY_NAME,Year,12,11,10,9,HStotal,SCHOOL_NAME,NUM_LEVEL1,NUM_LEVEL2,...,reg_cnt,reg_adv_cnt,non_diploma_credential_cnt,still_enr_cnt,ged_cnt,dropout_cnt,membership_code,membership_desc,matched_name,matched_score
0,ALBANY HIGH SCHOOL,2023,611,581,654,754,2600,ALBANY HIGH SCHOOL,0.301,0.214,...,417.0,131.0,6.0,76.0,0.0,74.0,11.0,2019 Total Cohort - 4 Year Outcome - August 2023,ALBANY HIGH SCHOOL,100
1,GREEN TECH HIGH CHARTER SCHOOL,2023,59,55,74,118,306,GREEN TECH HIGH CHARTER SCHOOL,0.539,0.24,...,33.0,16.0,0.0,5.0,0.0,0.0,11.0,2019 Total Cohort - 4 Year Outcome - August 2023,GREEN TECH HIGH CHARTER SCHOOL,100
2,ALBANY LEADERSHIP CS-GIRLS,2023,44,61,71,86,262,ALBANY LEADERSHIP CS-GIRLS,0.356,0.214,...,41.0,9.0,0.0,1.0,0.0,4.0,11.0,2019 Total Cohort - 4 Year Outcome - August 2023,ALBANY LEADERSHIP CMS FOR GIRLS,91
3,BERNE-KNOX-WESTERLO JUNIOR-SENIOR HS,2023,49,58,61,52,220,BERNE-KNOX-WESTERLO JUNIOR-SENIOR HS,0.1,0.11,...,,,,,,,,,BERNE-KNOX-WESTERLO JUNIOR-SENIOR HIGH SCHOOL,86
4,BETHLEHEM CENTRAL SENIOR HIGH SCHOOL,2023,338,345,339,330,1352,BETHLEHEM CENTRAL SENIOR HIGH SCHOOL,0.01,0.026,...,110.0,223.0,6.0,2.0,0.0,8.0,11.0,2019 Total Cohort - 4 Year Outcome - August 2023,BETHLEHEM CENTRAL SENIOR HIGH SCHOOL,100


In [51]:
# Checkpoint the matched table as CSV, since the matching step above is slow
csv_path = os.path.join(basePath, outputFolder, 'joinedDF_tempMatched7.csv')
print(f'Saving to {csv_path} ...')
joinedDF.to_csv(csv_path)
print('Saved.')
del csv_path

Saving to G:\My Drive\Kids\NYC_schools_mapped\processed_data\joinedDF_tempMatched7.csv ...
Saved.


In [53]:
# Manual corrections for the name matching, keyed by ENTITY_NAME.
# The value replaces the automatically matched name; an empty string is used
# where no replacement name is given (presumably schools with no counterpart
# in the locations file - confirm).
_no_replacement = [
    'NY MILLS SCHOOL',
    'PS 469-BRONX SCHOOL-CON LRN',
    'SULLIVAN WEST HIGH SCHOOL',
    'HIGH SCHOOL-ENTERPRISE, BUS & TECH',
    'BRONX ENG & TECH ACADEMY',
    'GEORGE WASHINGTON CARVER HS',
    'INTER PREP SCHOOL (THE)',
    'ROBERT H GODDARD HS-COMM/TECH',
    'MATH, ENG, SCI ACADEMY CHARTER HS',
    'MT VERNON SCHOOL DISTRICT',
]
unmatched = dict.fromkeys(_no_replacement, '')
unmatched.update({
    'FIORELLO H LAGUARDIA HIGH SCHOOL':
        'FIORELLO H LAGUARDIA HIGH SCHOOL OF MUSIC ART AND PERFORMING ARTS',
    'CITY POLYTECHNIC HIGH SCHOOL':
        'CITY POLYTECHNIC HIGH SCHOOL OF ENGINEERING ARCHITECTURE AND TECHNOLOGY',
    'BROOKLYN COLLEGIATE':
        'BROOKLYN COLLEGIATE: A COLLEGE BOARD SCHOOL',
})
del _no_replacement

In [54]:
# Replacing the erroneous matches in the joinedDF data frame

def replace_values(row):
    """Overwrite a row's matched name with its manual correction, if one exists.

    Looks up the row's ``ENTITY_NAME`` in the module-level ``unmatched``
    mapping. When present, ``row['matched_name']`` is set to the mapped value
    (which may be an empty string). The row is modified in place and returned.
    """
    entity_name = row['ENTITY_NAME']
    if entity_name not in unmatched:
        # No manual correction for this school: leave the row as it is
        return row
    row['matched_name'] = unmatched[entity_name]
    return row

# Run replace_values on every row (axis=1) and keep the resulting frame
joinedDF = joinedDF.apply(replace_values, axis = 1)

In [55]:
# Inner-join the school locations with the school data on the matched name:
# only rows whose matched_name equals a LEGAL_NAME end up in the result.
finalGeoDF = pd.merge(
    NYSSchoolsGeom_short,
    joinedDF,
    how='inner',
    left_on='LEGAL_NAME',
    right_on='matched_name',
)

# Write the combined data out as GeoJSON
allData_Path = os.path.join(basePath, outputFolder, 'NYSHS_regents.geojson')
print(f'Saving to {allData_Path} ...')
finalGeoDF.to_file(allData_Path, driver="GeoJSON")
print('Saved.')

del allData_Path

Saving to G:\My Drive\Kids\NYC_schools_mapped\processed_data\NYSHS_regents.geojson ...
Saved.


In [56]:
finalGeoDF.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 1237 entries, 0 to 1236
Data columns (total 34 columns):
 #   Column                      Non-Null Count  Dtype   
---  ------                      --------------  -----   
 0   OBJECTID                    1237 non-null   int64   
 1   LEGAL_NAME                  1237 non-null   object  
 2   INSTSUBTYPDESC              1237 non-null   object  
 3   SDL_DESC                    1237 non-null   object  
 4   geometry                    1237 non-null   geometry
 5   ENTITY_NAME                 1237 non-null   object  
 6   Year                        1237 non-null   int64   
 7   12                          1237 non-null   int64   
 8   11                          1237 non-null   int64   
 9   10                          1237 non-null   int64   
 10  9                           1237 non-null   int64   
 11  HStotal                     1237 non-null   int64   
 12  SCHOOL_NAME                 1197 non-null   object  
 13  NUM_LEVEL1

# Ratio of Advanced Regents diplomas to the grade-12 enrollment of the school
finalGeoDF['reg_adv_pct'] = finalGeoDF['reg_adv_cnt'].div(finalGeoDF['12'])

### Generating the map

In [57]:
# `display` and `HTML` belong to the public IPython.display API; importing
# `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Inject CSS so that `.output_scroll` areas size automatically, up to 1500px high
display(HTML("<style>.output_scroll { height: auto !important; max-height: 1500px; }</style>"))

# Base map: "cartodb positron" tiles, centred on New York State
mapNYS = folium.Map(
    tiles="cartodb positron",
    location=[42.7000, -74.2179],
    zoom_start=8,
)
   
# Add dataframes with coordinates and test results to the map

def my_style(x):
    """Return the style dict for one GeoJSON feature of the schools layer.

    Parameters
    ----------
    x : dict
        Feature whose ``properties`` hold ``NUM_LEVEL5`` (share of level-5
        regents results) and ``INSTSUBTYPDESC`` (institution sub-type).

    Returns
    -------
    dict
        ``radius``: the level-5 share times 2000, or 0 when the share is missing.
        ``color``: ``'#f0a607'`` for charter schools and their satellite
        sites, ``'#06a6cf'`` for every other school.
    """
    properties = x['properties']

    level5_share = properties['NUM_LEVEL5']
    # A missing share may arrive as None (JSON null) or as a float NaN; either
    # would give an unusable radius, so such schools are drawn with radius 0.
    if level5_share is None or (
        isinstance(level5_share, float) and math.isnan(level5_share)
    ):
        level5_share = 0

    charter_subtypes = ('CHARTER SCHOOL', 'SATELLITE SITE FOR CHARTER SCHOOLS')
    color = '#f0a607' if properties['INSTSUBTYPDESC'] in charter_subtypes else '#06a6cf'

    return {
        "radius": level5_share * 2000,
        "color": color,
    }


# Function to create iframe for a given row
def create_iframe(row):
    """Build the popup content for one school as a 500x150 folium IFrame.

    The HTML lists the school's legal name, its Advanced Regents diploma
    count, its grade 9-12 enrollment and its level-5 and level-4 regents
    shares (the three numeric results rounded to 2 decimals).
    """
    labelled_values = [
        ('School Name', row['LEGAL_NAME']),
        ('Advance Regents Diplomas count 2022-23', round(row['reg_adv_cnt'], 2)),
        ('Number of students in grades 9-12', row['HStotal']),
        ('Share of regents level 5 score', round(row['NUM_LEVEL5'], 2)),
        ('Share of regents level 4 score', round(row['NUM_LEVEL4'], 2)),
    ]
    entries = [
        '<strong>{0}:</strong> {1}'.format(label, value)
        for label, value in labelled_values
    ]
    # The first three entries and the last two are separated by four spaces
    # followed by a line break; all other entries by a plain line break.
    html = '<br>'.join(entries[:3]) + '    <br>' + '<br>'.join(entries[3:])
    return folium.IFrame(html, width=500, height=150)

def create_popup(x):
    """Return a folium Popup wrapping the IFrame that create_iframe builds for row ``x``."""
    return folium.Popup(create_iframe(x))

# Add one GeoJson layer per school so that each feature carries its own popup
for _, row in tqdm(finalGeoDF.iterrows(), total=len(finalGeoDF)):
    # Turn the single row back into a one-row GeoDataFrame with the source CRS
    data = gpd.GeoDataFrame(row.to_frame().T, crs=finalGeoDF.crs)

    school_layer = folium.GeoJson(
        data,
        marker=folium.Circle(radius=10, fill_color='white', fill_opacity=0, color="green", weight=2),
        popup=create_popup(row),
        # my_style supplies the per-feature radius and color
        style_function=my_style,
        control=False,
    )
    school_layer.add_to(mapNYS)
        
folium.LayerControl().add_to(mapNYS)

# # Display the map
# mapNYS

# Write the finished map to a standalone HTML file in the output folder
mpath = os.path.join(basePath, outputFolder, 'NYSHS_quality.html')
print(f'Saving to {mpath} ...')
mapNYS.save(mpath)
print('Saved.')

100%|██████████████████████████████████████████████████████████████████████████████| 1237/1237 [00:49<00:00, 25.13it/s]


Saving to G:\My Drive\Kids\NYC_schools_mapped\processed_data\NYSHS_quality.html ...
Saved.


In [None]:
# (scratch cell: the stray, undefined name that was here raised NameError on "Run All")