In [1]:
import sqlite3

import pandas as pd
from pathlib import Path

## Constants

In [2]:
__file__ = %pwd
DATA = Path(__file__).parent / "data"
PRIORITIES = DATA / "Priorities"
DATABASE = DATA / "databases" / "exploration.db"
DB_INSERT = True

## Load the Data

In [3]:
pub_schools_columns = [
    "CDSCode", "NCESDist", "NCESSchool", "StatusType",
    "County", "District", "School", "Street", "StreetAbr",
    "City", "Zip", "State", "MailStreet", "MailStreetAbr",
    "MailCity", "MailZip", "MailState", "Phone", "PhoneExt",
    "FaxNumber", "Website", "OpenDate", "ClosedDate", "Charter",
    "CharterNum", "FundingType", "DOC", "DOCType", "SOC",
    "SOCType", "EdOpsCode", "EdOpsName", "EILCode", "EILName",
    "GSoffered", "GSserved", "Virtual", "Magnet", "YearRound",
    "FederalDFCDistrictID", "Latitude", "Longitude", "AdmFName",
    "AdmLName", "LastUpDate", "Multilingual"
]

In [4]:
p1 = pd.read_excel(PRIORITIES / "Pr12024.xlsx")
p2 = pd.read_excel(PRIORITIES / "Pr22024.xlsx")
p3 = pd.read_excel(PRIORITIES / "Pr32024.xlsx")
p6 = pd.read_excel(PRIORITIES / "Pr62024.xlsx")
p7 = pd.read_excel(PRIORITIES / "Pr72024.xlsx")
p9 = pd.read_excel(PRIORITIES / "Pr92024.xlsx")
p10 = pd.read_excel(PRIORITIES / "Pr102024.xlsx")
pub_schools = pd.read_excel(DATA / "pubschls.xlsx", names=pub_schools_columns, skiprows=5)[:-1] # So many junk rows in this dataset
cde = pd.read_csv(DATA / "cdenroll2324-v2.txt", sep="\t")

## Data Processing

All datasets besides the `cdenroll2324-v2.txt` dataset use an amalgamated `CDSCode` rather than independent `CountyID`, etc. columns. As such, we should split this column into `county_id`, `district_id`, and `school_id` in order for ease of access to their respective names in `Counties`, `Districts`, and `Schools` tables. 

In [5]:
def get_countyCode(cds_code: int) -> int:
    county_digits = 1e12
    return int(cds_code / county_digits)

def get_districtCode(cds_code: int) -> int:
    district_digits = 1e7
    county_digits = 1e12
    return int((cds_code % county_digits) / district_digits)

def get_schoolCode(cds_code: int) -> int:
    district_digits = 1e7
    return int(cds_code % district_digits)

def cds_code_split(cds_code: int) -> pd.Series:
    return pd.Series([get_countyCode(cds_code), get_districtCode(cds_code), get_schoolCode(cds_code)])

In [6]:
p1[["CountyCode", "DistrictCode", "SchoolCode"]] = p1.apply(lambda row: cds_code_split(int(row["cdsCode"])), axis=1)
p2[["CountyCode", "DistrictCode", "SchoolCode"]] = p2.apply(lambda row: cds_code_split(int(row["cdsCode"])), axis=1)
p3[["CountyCode", "DistrictCode", "SchoolCode"]] = p3.apply(lambda row: cds_code_split(int(row["cdsCode"])), axis=1)
p6[["CountyCode", "DistrictCode", "SchoolCode"]] = p6.apply(lambda row: cds_code_split(int(row["cdsCode"])), axis=1)
p7[["CountyCode", "DistrictCode", "SchoolCode"]] = p7.apply(lambda row: cds_code_split(int(row["cdsCode"])), axis=1)
p9[["CountyCode", "DistrictCode", "SchoolCode"]] = p9.apply(lambda row: cds_code_split(int(row["cdsCode"])), axis=1)
p10[["CountyCode", "DistrictCode", "SchoolCode"]] = p10.apply(lambda row: cds_code_split(int(row["cdsCode"])), axis=1)

pub_schools[["CountyCode", "DistrictCode", "SchoolCode"]] = pub_schools.apply(
    lambda row: cds_code_split(int(row["CDSCode"])), axis=1
)

#### Coalesce every school into a single dataframe

In [7]:
cde_school_codes = cde[["SchoolCode", "SchoolName"]].drop_duplicates()
cde_school_codes_unique = cde_school_codes[cde_school_codes["SchoolCode"].notnull()]

In [8]:
pub_schools.columns

Index(['CDSCode', 'NCESDist', 'NCESSchool', 'StatusType', 'County', 'District',
       'School', 'Street', 'StreetAbr', 'City', 'Zip', 'State', 'MailStreet',
       'MailStreetAbr', 'MailCity', 'MailZip', 'MailState', 'Phone',
       'PhoneExt', 'FaxNumber', 'Website', 'OpenDate', 'ClosedDate', 'Charter',
       'CharterNum', 'FundingType', 'DOC', 'DOCType', 'SOC', 'SOCType',
       'EdOpsCode', 'EdOpsName', 'EILCode', 'EILName', 'GSoffered', 'GSserved',
       'Virtual', 'Magnet', 'YearRound', 'FederalDFCDistrictID', 'Latitude',
       'Longitude', 'AdmFName', 'AdmLName', 'LastUpDate', 'Multilingual',
       'CountyCode', 'DistrictCode', 'SchoolCode'],
      dtype='object')

In [9]:
# Check to see what is missing
pub_school_codes = pub_schools[
    (~pub_schools["SchoolCode"].isin(cde_school_codes_unique["SchoolCode"])) & (pub_schools["StatusType"] == "Active")
][["SchoolCode", "School"]]
pub_school_codes_unique = pub_school_codes[pub_school_codes["SchoolCode"].notnull()].drop_duplicates()
pub_school_codes_unique.columns = ["SchoolCode", "SchoolName"]
pub_school_codes_unique

Unnamed: 0,SchoolCode,SchoolName
7,129403,Epic Charter
41,6106751,Alameda County Special Education
56,130120,Alameda Adult
64,138214,Alameda Unified Special Education
109,122804,Berkeley Special Education Preschool
...,...,...
18221,5730080,West Sacramento School for Independent Study
18258,5738703,Woodland Adult Education
18279,5790019,Yolo County ROP
18285,141440,Yuba County Adult Education


In [10]:
school_codes = pd.concat([cde_school_codes_unique, pub_school_codes_unique]).drop_duplicates()
school_codes

Unnamed: 0,SchoolCode,SchoolName
272,112607.0,Envision Academy for Arts & Technology
295,123968.0,Community School for Creative Education
316,124172.0,Yu Ming Charter
336,125567.0,Urban Montessori Charter
356,130401.0,Alameda County Juvenile Hall/Court
...,...,...
18221,5730080.0,West Sacramento School for Independent Study
18258,5738703.0,Woodland Adult Education
18279,5790019.0,Yolo County ROP
18285,141440.0,Yuba County Adult Education


#### Coalesce every district into a single dataframe

In [11]:
cde_district_codes = cde[["DistrictCode", "DistrictName"]].drop_duplicates()
cde_district_codes_unique = cde_district_codes[cde_district_codes["DistrictCode"].notnull()]

In [12]:
# Check to see what is missing
pub_district_codes = pub_schools[
    (~pub_schools["DistrictCode"].isin(cde_district_codes_unique["DistrictCode"])) & (pub_schools["StatusType"] == "Active")
][["DistrictCode", "District"]]
pub_district_codes_unique = pub_district_codes[pub_district_codes["DistrictCode"].notnull()].drop_duplicates()
pub_district_codes_unique.columns = ["DistrictCode", "DistrictName"]
pub_district_codes_unique

Unnamed: 0,DistrictCode,DistrictName
722,74005,Tri-Valley ROP
724,74013,Eden Area ROP
726,74021,Mission Valley ROC/P
845,74856,Amador County ROP
1018,74682,Butte County ROP
...,...,...
16814,74591,Sonoma County ROP
17097,74609,Yosemite ROP
17244,74633,Tri-County ROP
18149,74617,Ventura County ROP


In [13]:
district_codes = pd.concat([cde_district_codes_unique, pub_district_codes_unique]).drop_duplicates()
district_codes

Unnamed: 0,DistrictCode,DistrictName
189,10017.0,Alameda County Office of Education
556,31609.0,California School for the Blind (State Special...
619,31617.0,California School for the Deaf-Fremont (State ...
694,61119.0,Alameda Unified
1176,61127.0,Albany City Unified
...,...,...
16814,74591.0,Sonoma County ROP
17097,74609.0,Yosemite ROP
17244,74633.0,Tri-County ROP
18149,74617.0,Ventura County ROP


#### Coalesce every county into a single dataframe

In [14]:
cde_county_codes = cde[["CountyCode", "CountyName"]].drop_duplicates()
cde_county_codes_unique = cde_county_codes[cde_county_codes["CountyCode"].notnull()]

In [15]:
# Check to see what is missing
pub_county_codes = pub_schools[~pub_schools["CountyCode"].isin(cde_county_codes_unique["CountyCode"])][["CountyCode", "County"]]
pub_county_codes_unique = pub_county_codes[pub_county_codes["CountyCode"].notnull()].drop_duplicates()
pub_county_codes_unique.columns = ["CountyCode", "CountyName"]
pub_county_codes_unique

Unnamed: 0,CountyCode,CountyName


#### Coalescing is apparently not necessary here

In [16]:
county_codes = cde_county_codes_unique

#### Some of the `lea` values in the priorities dataset do not translate directly to `school_codes` or `district_codes`, as such, we need to keep them.

In [17]:
p1[(~p1["lea"].isin(school_codes["SchoolName"])) & (p1["SchoolCode"] != 0)]

Unnamed: 0,cdsCode,lea,priorityNumber,numMisassignments,numMaterials,numFacilities,countyPerformance,additionalInfo,meetingDate,year,CountyCode,DistrictCode,SchoolCode
476,19101990135582,Westbrook Academy,1,,0,0.0,Met,By the end of the 2023-24 school year the scho...,2024-06-13,2024,19,10199,135582
621,19647330117846,Para Los Niños Middle,1,,,,Not Met,,,2024,19,64733,117846
651,19647330122630,Para Los Niños - Evelyn Thurman Gratts Primary,1,,,,Not Met,,,2024,19,64733,122630
824,19647336120489,Para Los Niños Charter,1,,0,0.0,Not Met,,,2024,19,64733,6120489
917,20652430107938,Liberty Charter,1,,0/0%,0.0,Met,"At Liberty Charter, we firmly believe that our...",2024-06-13,2024,20,65243,107938
965,23656150140814,Shanél Valley Academy,1,,0/0%,0.0,Met,,2024-06-27,2024,23,65615,140814
1079,30103060134239,Epic California Academy,1,,0,0.0,Met,,2024-06-06,2024,30,10306,134239
1099,30664640106765,California Online Public Schools Southern Cali...,1,,0,0.0,Met,,2024-06-04,2024,30,66464,106765
1432,37681060137034,Altus Schools North County,1,,0,0.0,Met,,2024-06-26,2024,37,68106,137034
1494,37683383730959,Altus Schools Charter School of San Diego,1,,0,0.0,Met,,2024-06-26,2024,37,68338,3730959


#### Verify if all `meetingDate`s are equivalent within schools

In [18]:
priorities = [
    p2[["cdsCode", "meetingDate"]],
    p3[["cdsCode", "meetingDate"]], 
    p6[["cdsCode", "meetingDate"]],
    p7[["cdsCode", "meetingDate"]],
    p9[["cdsCode", "meetingDate"]],
    p10[["cdsCode", "meetingDate"]]
]

priority_codes = p1[["cdsCode", "meetingDate"]]
i = 0
for p in priorities:
    priority_codes = pd.merge(priority_codes, p, how="outer", on="cdsCode", suffixes=(f"_x{i}", f"_y{i}"))
    i += 1

In [19]:
relevant_columns = priority_codes.columns.to_list()
relevant_columns.remove("cdsCode")
def is_equal(row) -> bool:
    for i in relevant_columns:
        for ii in relevant_columns:
            if pd.isna(row[i]) or pd.isna(row[ii]):
                continue
            if row[i] != row[ii]:
                print(f"{row[i]} != {row[ii]}")
                return False
    return True

In [20]:
priority_codes["equal"] = priority_codes.apply(lambda x: is_equal(x), axis=1)

In [21]:
priority_codes[priority_codes["equal"] != True]

Unnamed: 0,cdsCode,meetingDate_x0,meetingDate_y0,meetingDate_x2,meetingDate_y2,meetingDate_x4,meetingDate_y4,meetingDate,equal


They are! We can use a foreign key lookup utilizing `cdsCode` in a `MeetingDates` table.

In [22]:
priorities = [
    p1[["cdsCode", "meetingDate"]],
    p2[["cdsCode", "meetingDate"]],
    p3[["cdsCode", "meetingDate"]], 
    p6[["cdsCode", "meetingDate"]],
    p7[["cdsCode", "meetingDate"]],
    p9[["cdsCode", "meetingDate"]],
    p10[["cdsCode", "meetingDate"]]
]
meeting_dates = pd.concat([i for i in priorities]).drop_duplicates()
meeting_dates = meeting_dates[meeting_dates["cdsCode"].notnull()]

#### Something to note here is that every priority table has these columns which are actually just useless.

In [23]:
print(p1[["priorityNumber", "year"]])

      priorityNumber  year
0                  1  2024
1                  1  2024
2                  1  2024
3                  1  2024
4                  1  2024
...              ...   ...
2257               1  2024
2258               1  2024
2259               1  2024
2260               1  2024
2261               1  2024

[2262 rows x 2 columns]


#### I can't see a reason to keep both `StreetAbr` and `Street` given this; drop the abbreviated version.

In [24]:
print(pub_schools[["StreetAbr", "Street"]].drop_duplicates())
print(pub_schools[["MailStreetAbr", "MailStreet"]].drop_duplicates())

                           StreetAbr                            Street
0               313 West Winton Ave.            313 West Winton Avenue
1      39899 Balentine Dr., Ste. 335  39899 Balentine Drive, Suite 335
2                   1515 Webster St.               1515 Webster Street
3                2125 Jefferson Ave.             2125 Jefferson Avenue
4           2111 International Blvd.      2111 International Boulevard
...                              ...                               ...
18351                 456 Beale Hwy.                 456 Beale Highway
18352                 123 Beale Hwy.                 123 Beale Highway
18353                 111 Hooper St.                 111 Hooper Street
18355             711 West Olive St.             711 West Olive Street
18357             1010 Wheatland Rd.               1010 Wheatland Road

[13994 rows x 2 columns]
                       MailStreetAbr                        MailStreet
0               313 West Winton Ave.            313

#### There are no non-null `numMisassignments`, as such, we can drop them!

In [25]:
p1[p1["numMisassignments"].notnull()]

Unnamed: 0,cdsCode,lea,priorityNumber,numMisassignments,numMaterials,numFacilities,countyPerformance,additionalInfo,meetingDate,year,CountyCode,DistrictCode,SchoolCode


## Upload to database

In [26]:
con = sqlite3.connect(DATABASE)
cur = con.cursor()

#### Drop all columns that are no longer necessary.

In [28]:
p_1 = p1[p1.columns.difference(["priorityNumber", "numMisassignments", "meetingDate", "year"])]
p_2 = p2[p2.columns.difference(["priorityNumber", "meetingDate", "year"])]
p_3 = p3[p3.columns.difference(["priorityNumber", "meetingDate", "year"])]
p_6 = p6[p6.columns.difference(["priorityNumber", "summary", "meetingDate", "year"])]
p_7 = p7[p7.columns.difference(["priorityNumber", "meetingDate", "year"])]
p_9 = p9[p9.columns.difference(["priorityNumber", "meetingDate", "year"])]
p_10 = p10[p10.columns.difference(["priorityNumber", "meetingDate", "year"])]

cde_dropped = cde[cde.columns.difference(["CountyName", "DistrictName", "SchoolName"])]
pub_schools_dropped = pub_schools[pub_schools.columns.difference([
    "County", "District", "School", "StreetAbr", "State", "MailStreetAbr", "MailState"
])]

In [29]:
if DB_INSERT: # Don't need to run this again!
    district_codes.to_sql("Districts", con, index=False, if_exists="append")
    school_codes.to_sql("Schools", con, index=False, if_exists="append")
    county_codes.to_sql("Counties", con, index=False, if_exists="append")
    meeting_dates.to_sql("MeetingDates", con, index=False, if_exists="append")

In [30]:
if DB_INSERT:
    p_1.to_sql("PriorityOne", con, index=False, if_exists="append")
    p_2.to_sql("PriorityTwo", con, index=False, if_exists="append")
    p_3.to_sql("PriorityThree", con, index=False, if_exists="append")
    p_6.to_sql("PrioritySix", con, index=False, if_exists="append")
    p_7.to_sql("PrioritySeven", con, index=False, if_exists="append")
    p_9.to_sql("PriorityNine", con, index=False, if_exists="append")
    p_10.to_sql("PriorityTen", con, index=False, if_exists="append")

In [31]:
if DB_INSERT:
    cde_dropped.to_sql("CensusDay", con, index=False, if_exists="append")
    pub_schools_dropped.to_sql("PublicSchools", con, index=False, if_exists="append")

In [32]:
con.close()