In [None]:
import sqlite3
import sys
from pathlib import Path

# A notebook has no real module path, so the working directory stands in for
# it. Path.cwd() returns the same directory as the IPython-only `%pwd` magic but
# is plain Python, so this cell also survives export to a script and linting.
__file__ = str(Path.cwd())
DATA = Path(__file__).parent.parent / "data"
PRIORITIES = DATA / "Priorities"
DATABASE = DATA / "databases" / "exploration.db"

# Add the parent directory of 'modeling' to the Python path. Guarded so that
# re-running the cell does not append the same entry again.
if str(DATA.parent) not in sys.path:
    sys.path.append(str(DATA.parent))

# sqlite3.connect() creates an empty database file when the path does not
# exist, which would only surface later as "no such table" errors. Fail early
# with a clear message instead.
if not DATABASE.is_file():
    raise FileNotFoundError(f"SQLite database not found: {DATABASE}")

con = sqlite3.connect(DATABASE)
cur = con.cursor()

In [None]:
import pandas as pd
import numpy as np


def filter_academic_year(df, year):
    """Return the rows of ``df`` whose ``AcademicYear`` equals ``year``.

    The ``AcademicYear`` column itself is removed from the result. The input
    frame is left untouched and the original row index is preserved.
    """
    year_mask = df["AcademicYear"] == year
    rows_for_year = df.loc[year_mask]
    return rows_for_year.drop(columns=["AcademicYear"])


def standarized_columns(acgr_df, mapping):
    """Return a new frame with dotted column suffixes translated via ``mapping``.

    A column is renamed only when its name splits on "." into exactly two
    parts and the second part is a key of ``mapping``; the second part is then
    replaced by the mapped value (e.g. ``"X.RE_B"`` -> ``"X.RB"`` for
    ``{"RE_B": "RB"}``). Every other column keeps its name. The input frame is
    not modified.
    """
    renames = {}
    for name in acgr_df.columns:
        pieces = name.split(".")
        if len(pieces) != 2:
            continue
        prefix, suffix = pieces
        if suffix in mapping:
            renames[name] = f"{prefix}.{mapping[suffix]}"

    return acgr_df.rename(columns=renames)


# Lookup from the census-style subgroup code (key) to the ACGR-style subgroup
# code (value). Used as the `mapping` argument of standarized_columns().
census_to_acgr_mapping = dict(
    # Race/Ethnicity
    RE_B="RB",  # African American
    RE_I="RI",  # American Indian or Alaska Native
    RE_A="RA",  # Asian
    RE_F="RF",  # Filipino
    RE_H="RH",  # Hispanic or Latino
    RE_D="RD",  # Not Reported
    RE_P="RP",  # Pacific Islander
    RE_T="RT",  # Two or More Races
    RE_W="RW",  # White
    # Gender
    GN_M="GM",  # Male
    GN_F="GF",  # Female
    GN_X="GX",  # Non-Binary
    GN_Z="GZ",  # Missing Gender
    # Student Groups
    SG_EL="SE",  # English Learners
    SG_DS="SD",  # Students with Disabilities
    SG_SD="SS",  # Socioeconomically Disadvantaged
    SG_MG="SM",  # Migrant
    SG_FS="SF",  # Foster
    SG_HM="SH",  # Homeless
    # Total
    TA="TA",  # Total
)

In [None]:
from functools import reduce

# Column-name pattern used to narrow the PublicSchools table below.
pattern = "Virtual|Magnet|YearRound|Multilingual"

# Get tables: each one is read in full from the open SQLite connection.
census_day = pd.read_sql_query("SELECT * FROM CensusDayWide", con)
# Census columns with their subgroup suffixes translated through the mapping.
# NOTE(review): no later cell visible here reads census_day_standardized --
# confirm whether the merge was meant to start from it instead of census_day.
census_day_standardized = standarized_columns(census_day, census_to_acgr_mapping)

acgr = pd.read_sql_query("SELECT * FROM ACGRWide", con)

# Keep only the PublicSchools columns whose name matches `pattern`.
public_schools = pd.read_sql_query("SELECT * FROM PublicSchools", con).filter(
    regex=pattern
)


def add_suffix_except(df, suffix, exclude=("SchoolCode", "AcademicYear")):
    """Return a copy of ``df`` with ``suffix`` appended to its column names.

    Columns listed in ``exclude`` keep their original names. The input frame
    is not modified.
    """
    renames = {}
    for name in df.columns:
        if name in exclude:
            continue
        renames[name] = f"{name}{suffix}"
    return df.rename(columns=renames)


# Reduce columns: drop the district/county identifier columns (plus
# AggregateLevel on the census table). errors="ignore" tolerates tables that
# do not contain some of these columns.
census_day_reduced = census_day.drop(
    columns=["AggregateLevel", "DistrictCode", "CountyCode"], errors="ignore"
)
acgr_reduced = acgr.drop(columns=["DistrictCode", "CountyCode"], errors="ignore")

# Merge DataFrames
dfs = [
    census_day_reduced,
    acgr_reduced,
]

# SchoolCode is always part of the join key. When every table also carries
# AcademicYear it has to be part of the key as well: joining on SchoolCode
# alone would pair each school's rows across all years and leave the result
# with AcademicYear_x / AcademicYear_y instead of the single AcademicYear
# column the per-year filtering relies on. If only one table has the column,
# the join is identical to a plain SchoolCode merge.
merge_keys = ["SchoolCode"]
if all("AcademicYear" in frame.columns for frame in dfs):
    merge_keys.append("AcademicYear")

merged = reduce(
    lambda left, right: pd.merge(left, right, on=merge_keys, how="inner"), dfs
)
# Preview only; rendering the full merged frame bloats the notebook output.
merged.head()

In [None]:
# Coerce every column whose name contains one of the markers below to a
# numeric dtype; values that cannot be parsed become NaN (errors="coerce").
for col in merged.columns:
    if not any(marker in col for marker in ("Rate", "Percent", "ENR", "GR_")):
        continue
    merged[col] = pd.to_numeric(merged[col], errors="coerce")

# Academic-year labels used to split the merged frame.
academic_years = [
    "2017-18",
    "2018-19",
    "2019-20",
    "2020-21",
    "2021-22",
    "2022-23",
    "2023-24",
]

# Every column whose name starts with one of these prefixes is collected here.
columns_to_drop = [
    name
    for name in merged.columns
    if name.startswith(("RegHSDiploma", "Dropout", "UniReqs", "Other", "GR_"))
]

# One frame per academic year, keyed by the year label.
year_df_mapping = {}
for year in academic_years:
    year_df_mapping[year] = filter_academic_year(merged, year)

## XGBoost


### Grad Rate


In [None]:
from modeling.utils.xgboost import cross_validate_xgboost, plot_cv_feature_importance
from modeling.utils.xgboost import (
    plot_top_k_features,
    plot_feature_avg_variance,
    plot_feature_time_series,
)

target_column = "RegHSDiplomaRate.TA"

# Cross-validated XGBoost result for each academic year, keyed by year label.
cv_xgb_models = {}
for year in year_df_mapping:
    cv_xgb_models[year] = cross_validate_xgboost(
        df=year_df_mapping[year],
        target_column=target_column,
        # The target is listed explicitly in addition to the shared drop list.
        columns_to_drop=[target_column, *columns_to_drop],
        n_splits=5,
        print_results=True,
        n_jobs=-1,
    )

In [None]:
# Feature-importance plots for the per-year results held in cv_xgb_models.
# The value returned by plot_cv_feature_importance is kept in `top_features`
# and passed as the last argument of plot_feature_time_series below.
# NOTE(review): "gain" is presumably the importance type and "XGBoost" a label
# for the plots -- confirm against modeling.utils.xgboost.
top_features = plot_cv_feature_importance(cv_xgb_models, top_n=5)
plot_top_k_features(cv_xgb_models, "XGBoost", "gain", target_column, k=5)
plot_feature_avg_variance(cv_xgb_models, "XGBoost", "gain", target_column)
plot_feature_time_series(cv_xgb_models, "XGBoost", "gain", target_column, top_features)

### College Readiness


In [None]:
from modeling.utils.xgboost import cross_validate_xgboost, plot_cv_feature_importance
from modeling.utils.xgboost import (
    plot_top_k_features,
    plot_feature_avg_variance,
    plot_feature_time_series,
)

target_column = "UniReqsPercent.TA"

# Cross-validated XGBoost result for each academic year, keyed by year label.
cv_xgb_models = {}
for year in year_df_mapping:
    cv_xgb_models[year] = cross_validate_xgboost(
        df=year_df_mapping[year],
        target_column=target_column,
        # The target is listed explicitly in addition to the shared drop list.
        columns_to_drop=[target_column, *columns_to_drop],
        n_splits=5,
        print_results=True,
        n_jobs=-1,
    )

In [None]:
# Feature-importance plots for the per-year results held in cv_xgb_models.
# The value returned by plot_cv_feature_importance is kept in `top_features`
# and passed as the last argument of plot_feature_time_series below.
# NOTE(review): "gain" is presumably the importance type and "XGBoost" a label
# for the plots -- confirm against modeling.utils.xgboost.
top_features = plot_cv_feature_importance(cv_xgb_models, top_n=5)
plot_top_k_features(cv_xgb_models, "XGBoost", "gain", target_column, k=5)
plot_feature_avg_variance(cv_xgb_models, "XGBoost", "gain", target_column)
plot_feature_time_series(cv_xgb_models, "XGBoost", "gain", target_column, top_features)

## Random Forest


### Grad Rate


In [None]:
from modeling.utils.randomforest import (
    cross_validate_random_forest,
    plot_cv_feature_importance_rf,
    plot_top_k_features_rf,
    plot_feature_avg_variance_rf,
    plot_feature_time_series_rf,
)

target_variable = "RegHSDiplomaRate.TA"
years = list(year_df_mapping)

# Cross-validated Random Forest result for each academic year, keyed by year.
cv_models = {}
for year, year_df in year_df_mapping.items():
    cv_models[year] = cross_validate_random_forest(
        df=year_df,
        target_column=target_variable,
        columns_to_drop=columns_to_drop,
        n_splits=5,
        print_results=True,
        n_jobs=-1,
    )

In [None]:
# Feature-importance plots for the per-year results held in cv_models.
# The value returned by plot_cv_feature_importance_rf is kept in
# `all_top_features` and handed to the variance and time-series plots as
# `top_k_features`.
all_top_features = plot_cv_feature_importance_rf(cv_models, top_n=5)

plot_top_k_features_rf(cv_models, target_variable=target_variable, k=5)
plot_feature_avg_variance_rf(
    cv_models, target_variable=target_variable, top_k_features=all_top_features
)
plot_feature_time_series_rf(
    cv_models, target_variable=target_variable, top_k_features=all_top_features
)

### College Readiness


In [None]:
from modeling.utils.randomforest import (
    cross_validate_random_forest,
    train_random_forest_model,
    plot_cv_feature_importance_rf,
    plot_top_k_features_rf,
    plot_feature_avg_variance_rf,
    plot_feature_time_series_rf,
)

cv_models = {}

target_variable = "UniReqsPercent.TA"
years = list(year_df_mapping.keys())

# Run the cross-validation helper for every academic year. This cell passes
# n_splits and stores its results in cv_models for the *_cv_* plotting helpers,
# exactly like the Random Forest grad-rate cell, so it uses the same
# cross_validate_random_forest entry point (it previously called
# train_random_forest_model with these cross-validation arguments).
# NOTE(review): confirm against modeling.utils.randomforest that
# train_random_forest_model was not intended here.
for year in years:
    df = year_df_mapping[year]

    cv_result = cross_validate_random_forest(
        df=df,
        target_column=target_variable,
        columns_to_drop=columns_to_drop,
        n_splits=5,
        print_results=True,
        n_jobs=-1,
    )

    cv_models[year] = cv_result

In [None]:
# Feature-importance plots for the per-year results held in cv_models.
# The value returned by plot_cv_feature_importance_rf is kept in
# `all_top_features` and handed to the variance and time-series plots as
# `top_k_features`.
all_top_features = plot_cv_feature_importance_rf(cv_models, top_n=5)

plot_top_k_features_rf(cv_models, target_variable=target_variable, k=5)
plot_feature_avg_variance_rf(
    cv_models, target_variable=target_variable, top_k_features=all_top_features
)
plot_feature_time_series_rf(
    cv_models, target_variable=target_variable, top_k_features=all_top_features
)

### Exit


In [None]:
# Release the SQLite connection opened by sqlite3.connect() in the first cell.
con.close()