## This notebook prepares the demographic data for the modeling phase of the analysis.
Specifically, it:

1. Loads in merged datasets from 
2. Identifies and organizes relevant demographic columns into defined buckets.
3. Provides a standardized function to extract these columns from the desired year's dataset.
---

In [26]:
import pandas as pd

1. Get merged datasets and the column key df

In [76]:
def load_data_from_github(year):
    """
    Loads district-level education data and corresponding column key from the HERC GitHub repository.

    Parameters:
    -----------
    year : int or str
        The year for which to load the data (e.g., 2020).

    Returns:
    --------
    df : pandas.DataFrame
        The cleaned district-level dataset for the specified year. Charter schools are filtered out,
        and negative values in numeric columns are replaced with NaN.

    column_key : pandas.DataFrame
        The column key DataFrame from the 'distprof' sheet in the corresponding Excel file, 
        used for understanding column meanings in `df`.

    Notes:
    ------
    - The data is sourced from the RiceD2KLab/HERC_Sp25/ GitHub repository.
    - Assumes the structure of files follows the naming convention: 
      'merged_<year>.csv' and 'TAPR_district_adv_<year>.xlsx' located at:
      https://github.com/RiceD2KLab/HERC_Sp25/tree/main/5_Dashboard_Development/data /<year>
    """
    base_url = f"https://raw.githubusercontent.com/RiceD2KLab/HERC_Sp25/refs/heads/main/5_Dashboard_Development/data/{year}"
    csv_url = f"{base_url}/merged_{year}.csv"
    key_url = f"{base_url}/column_key_{year}.csv"
    print(csv_url)
    try:
        df = pd.read_csv(csv_url)
        print(df)
        column_key = pd.read_csv(key_url)
    except Exception as e:
        print(f"Data for the year {year} does not exist or cannot be accessed")
        return None, None

    if 'Charter School (Y/N)' in df.columns:
        df = df[df['Charter School (Y/N)'] == 'N']

    numeric_cols = df.select_dtypes(include='number').columns
    df[numeric_cols] = df[numeric_cols].mask(df[numeric_cols] < 0, pd.NA)

    return df, column_key


df_2024, key_2024 = load_data_from_github(2024)

https://raw.githubusercontent.com/RiceD2KLab/HERC_Sp25/refs/heads/main/5_Dashboard_Development/data/2024/merged_2024.csv
      DISTRICT_id           DISTNAME  \
0            1902         CAYUGA ISD   
1            1903        ELKHART ISD   
2            1904      FRANKSTON ISD   
3            1906         NECHES ISD   
4            1907      PALESTINE ISD   
...           ...                ...   
1202       252902      NEWCASTLE ISD   
1203       252903          OLNEY ISD   
1204       253901  ZAPATA COUNTY ISD   
1205       254901   CRYSTAL CITY ISD   
1206       254902       LA PRYOR ISD   

      District 2023 Graduates: All Students Count (excluding FHSP/Texas First-DLA)  \
0                                                   NaN                              
1                                                   NaN                              
2                                                   NaN                              
3                                                   Na

In [77]:
df24, key24 = load_data_from_github(2024)
df24.head(5)
key24.head(5)

https://raw.githubusercontent.com/RiceD2KLab/HERC_Sp25/refs/heads/main/5_Dashboard_Development/data/2024/merged_2024.csv
      DISTRICT_id           DISTNAME  \
0            1902         CAYUGA ISD   
1            1903        ELKHART ISD   
2            1904      FRANKSTON ISD   
3            1906         NECHES ISD   
4            1907      PALESTINE ISD   
...           ...                ...   
1202       252902      NEWCASTLE ISD   
1203       252903          OLNEY ISD   
1204       253901  ZAPATA COUNTY ISD   
1205       254901   CRYSTAL CITY ISD   
1206       254902       LA PRYOR ISD   

      District 2023 Graduates: All Students Count (excluding FHSP/Texas First-DLA)  \
0                                                   NaN                              
1                                                   NaN                              
2                                                   NaN                              
3                                                   Na

Unnamed: 0.1,Unnamed: 0,NAME,LABEL
0,0,DISTRICT,District Number
1,1,DPETGEEC,District 2024 Student Membership: EE Count
2,2,DPETGPKC,District 2024 Student Membership: PK Count
3,3,DPETGKNC,District 2024 Student Membership: KG Count
4,4,DPETG01C,District 2024 Student Membership: 01 Count


2. Identifying the relevant demographic columns and placing it into defined buckets.


In [78]:
demographic_buckets = {'student_teacher_ratio': ['DPSTKIDR'],
 'student_count': ['DPNTALLC'],
 'staff_count': ['DPSATOFC'],
 'race_ethnicity_percent': ['DPNTBLAP',
  'DPNTINDP',
  'DPNTASIP',
  'DPNTHISP',
  'DPNTPCIP',
  'DPNTTWOP',
  'DPNTWHIP'],
 'economically_disadvantaged': ['DPNTECOP', 'DPNTTT1P'],
 'special_ed_504': ['DPNT504P', 'DPNTSPEP'],
 'language_education_percent': ['DPNTBILP', 'DPNTLEPP'],
 'special_populations_percent': ['DPNTFOSP',
  'DPNTHOMP',
  'DPNTIMMP',
  'DPNTMIGP',
  'DPNTMLCP'],
 'gifted_students': ['DPNTGIFP'],
 'district_identifiers': ['DISTRICT_id',
  'TEA District Type',
  'TEA Description',
  'NCES District Type',
  'NCES Description',
  'Charter School (Y/N)',
  'COUNTY',
  'REGION',
  'DISTRICT',
  'DISTNAME',
  'CNTYNAME',
  'DFLCHART',
  'DFLALTED',
  'ASVAB_STATUS']}

3. Get real column names using key df.

In [None]:
def get_labels_from_variable_name_dict(name_dict, key_df):
    """
    Given a dictionary of COLUMN ID values, return a dictionary mapping each key to a list of COLUMN LABEL Values
    from the key DataFrame. For the 'district_identifiers' key, include its values without modification.

    Args:
        name_dict (dict): Dictionary with string keys and list of COLUMN IDs as values.
        key_df (pd.DataFrame): DataFrame with 'NAME' and 'LABEL' columns. The NAME LABEL mapping file

    Returns:
        dict: Dictionary with the same keys and list of corresponding LABELs as values.
    """
    result = {}
    for key, name_list in name_dict.items():
        if key == "district_identifiers":
            # Leave district identifiers untouched
            result[key] = name_list
        else:
            # Map NAMEs to LABELs using the key DataFrame
            result[key] = key_df[key_df['NAME'].isin(name_list)]['LABEL'].tolist()
    return result


In [81]:
demo_bucket_names = get_labels_from_variable_name_dict(demographic_buckets, key24)
demo_bucket_names

{'student_teacher_ratio': ['District 2024 Staff: Teacher Student Ratio'],
 'student_count': ['District 2024 Student Enrollment: All Students Count'],
 'staff_count': ['District 2024 Staff: All Staff Total Full Time Equiv Count'],
 'race_ethnicity_percent': ['District 2024 Student Enrollment: African American Percent',
  'District 2024 Student Enrollment: Hispanic Percent',
  'District 2024 Student Enrollment: White Percent',
  'District 2024 Student Enrollment: American Indian Percent',
  'District 2024 Student Enrollment: Asian Percent',
  'District 2024 Student Enrollment: Pacific Islander Percent',
  'District 2024 Student Enrollment: Two or More Races Percent'],
 'economically_disadvantaged': ['District 2024 Student Enrollment: Econ Disadv Percent',
  'District 2024 Student Enrollment: Title I Percent'],
 'special_ed_504': ['District 2024 Student Enrollment: Section 504 Percent',
  'District 2024 Student Enrollment: Special Ed Percent'],
 'language_education_percent': ['District 20