In [1]:
import sqlite3

import pandas as pd
from pathlib import Path

## Constants

In [2]:
# `Path.cwd()` replaces the IPython-only `%pwd` magic: it yields the same working
# directory but is plain Python, so this cell also runs when the notebook is
# exported to a script. `__file__` is still assigned (notebook kernels do not
# define it) so anything that reads it keeps working.
__file__ = str(Path.cwd())

# All data lives in a `data/` directory that is a sibling of the working directory.
DATA = Path(__file__).parent / "data"
PRIORITIES = DATA / "Priorities"
DATABASE = DATA / "databases" / "exploration.db"

# One connection/cursor pair for the session; released by `con.close()`.
con = sqlite3.connect(DATABASE)
cur = con.cursor()

## Upload to database

In [3]:
# Column names handed to `pd.read_excel(..., names=...)` for pubschls.xlsx.
# They are applied by position, so the order must not change; the line breaks
# below only group similarly named columns for readability.
pub_schools_columns = [
    "CDSCode", "NCESDist", "NCESSchool",
    "StatusType", "County", "District", "School",
    "Street", "StreetAbr", "City", "Zip", "State",
    "MailStreet", "MailStreetAbr", "MailCity", "MailZip", "MailState",
    "Phone", "PhoneExt", "FaxNumber", "Website",
    "OpenDate", "ClosedDate",
    "Charter", "CharterNum", "FundingType",
    "DOC", "DOCType", "SOC", "SOCType",
    "EdOpsCode", "EdOpsName", "EILCode", "EILName",
    "GSoffered", "GSserved",
    "Virtual", "Magnet", "YearRound",
    "FederalDFCDistrictID", "Latitude", "Longitude",
    "AdmFName", "AdmLName",
    "LastUpDate", "Multilingual",
]

In [4]:
# Priority workbooks are named Pr<number>2024.xlsx; only these seven priority
# numbers are loaded, in the same order as the names they are bound to.
p1, p2, p3, p6, p7, p9, p10 = (
    pd.read_excel(PRIORITIES / f"Pr{number}2024.xlsx")
    for number in (1, 2, 3, 6, 7, 9, 10)
)

# So many junk rows in this dataset: skip the first five spreadsheet rows, apply
# our own column names, and drop the final parsed row.
pub_schools = pd.read_excel(
    DATA / "pubschls.xlsx",
    skiprows=5,
    names=pub_schools_columns,
)[:-1]

# Tab-separated enrollment file.
cde = pd.read_csv(DATA / "cdenroll2324-v2.txt", sep="\t")

#### Create lookup tables whose primary-key ids serve as foreign-key targets, so that other tables can reference County, District, and School names by id.

In [5]:
if False:  # Don't need to run this again!
    # One row per distinct (code, name) pair, renamed to snake_case
    # (not a fan of PascalCase for column names).
    county_codes = (
        cde[["CountyCode", "CountyName"]]
        .drop_duplicates()
        .rename(columns={"CountyCode": "county_id", "CountyName": "county_name"})
    )
    # Rows are appended to the existing `Counties` table; the frame index is not written.
    county_codes.to_sql("Counties", con, if_exists="append", index=False)

In [6]:
if False:  # Guarded so a full re-run does not append to `Districts` again.
    # One row per distinct (code, name) pair, renamed to snake_case.
    district_codes = (
        cde[["DistrictCode", "DistrictName"]]
        .drop_duplicates()
        .rename(columns={"DistrictCode": "district_id", "DistrictName": "district_name"})
    )
    # Rows are appended to the existing `Districts` table; the frame index is not written.
    district_codes.to_sql("Districts", con, if_exists="append", index=False)

In [7]:
if False:  # Guarded so a full re-run does not append to `Schools` again.
    # One row per distinct (code, name) pair.
    school_codes = cde[["SchoolCode", "SchoolName"]].drop_duplicates()
    # Keep only rows that actually carry a SchoolCode, then rename to snake_case.
    has_code = school_codes["SchoolCode"].notnull()
    school_codes_unique = school_codes[has_code].rename(
        columns={"SchoolCode": "school_id", "SchoolName": "school_name"}
    )
    # Rows are appended to the existing `Schools` table; the frame index is not written.
    school_codes_unique.to_sql("Schools", con, if_exists="append", index=False)

Every dataset except `cdenroll2324-v2.txt` uses a single amalgamated `CDSCode` rather than independent county, district, and school code columns (e.g. `CountyCode`). We should therefore drop the `CDSCode` column in favor of separate, easier-to-work-with columns that can be used as foreign keys into the `Counties`, `Districts`, and `Schools` tables. To do that, we parse the relevant values out of `CDSCode` before placing each dataset in its respective table.

In [8]:
def get_countyCode(cds_code: int) -> int:
    """Return the county portion of a CDS code.

    The county code is whatever remains after removing the 12 low-order
    digits (the district and school portions).

    Args:
        cds_code: Non-negative CDS code. Anything `int()` accepts works, so a
            zero-padded string such as "01100170109835" is also accepted.

    Returns:
        The county code as an int.

    Raises:
        ValueError: If `cds_code` cannot be converted to an integer.
    """
    # Exact integer arithmetic: no float division, and string codes are coerced first.
    low_order_digits = 10 ** 12
    return int(cds_code) // low_order_digits

def get_districtCode(cds_code: int) -> int:
    """Return the district portion of a CDS code.

    The district code is the five digits that sit above the 7 low-order
    (school) digits and below the county digits.

    Args:
        cds_code: Non-negative CDS code. Anything `int()` accepts works, so a
            zero-padded string such as "01100170109835" is also accepted.

    Returns:
        The district code as an int.

    Raises:
        ValueError: If `cds_code` cannot be converted to an integer.
    """
    school_span = 10 ** 7    # the 7 low-order digits hold the school code
    county_span = 10 ** 12   # everything from the 13th digit up is the county code
    # Strip the county digits, then shift the school digits away, all in exact
    # integer arithmetic.
    return (int(cds_code) % county_span) // school_span

def get_schoolCode(cds_code: int) -> int:
    """Return the school portion of a CDS code.

    The school code is the 7 low-order digits of the CDS code.

    Args:
        cds_code: Non-negative CDS code. Anything `int()` accepts works, so a
            zero-padded string such as "01100170109835" is also accepted.

    Returns:
        The school code as an int (0 when the low-order digits are all zero).

    Raises:
        ValueError: If `cds_code` cannot be converted to an integer.
    """
    # Exact integer modulo: no float remainder, and string codes are coerced first.
    school_span = 10 ** 7
    return int(cds_code) % school_span

In [9]:
# pub_schools.CDSCode.apply(lambda x: print(f"{int(x)} -> {get_schoolCode(int(x))}"))

### **NOTE(josh): It seems that there are some Schools not accounted for in the `Schools` table (e.g., FAME Public Charter doesn't exist in the table!)**

In [10]:
# Show each school's CDS code next to its name.
pub_schools.loc[:, ["CDSCode", "School"]]

Unnamed: 0,CDSCode,School
0,01100170000000,No Data
1,01100170109835,FAME Public Charter
2,01100170112607,Envision Academy for Arts & Technology
3,01100170118489,Aspire California College Preparatory Academy
4,01100170123968,Community School for Creative Education
...,...,...
18359,58727690133751,Edward P. Duplex
18360,58727695830039,Wheatland Continuation
18361,58727695830070,Wheatland Alternative Education
18362,58727695830120,Academy for Career Education Charter


#### Note that every priority table has these two columns, which appear to hold the same value on every row and are therefore useless. **Remember to drop them!**

In [11]:
# Plain-text dump of the two columns under inspection.
print(p1.loc[:, ["priorityNumber", "year"]])

      priorityNumber  year
0                  1  2024
1                  1  2024
2                  1  2024
3                  1  2024
4                  1  2024
...              ...   ...
2257               1  2024
2258               1  2024
2259               1  2024
2260               1  2024
2261               1  2024

[2262 rows x 2 columns]


In [12]:
# Release the SQLite connection created alongside the constants; `cur` belongs
# to this connection and must not be used after this point.
con.close()