In [None]:
import sqlite3
from pathlib import Path

__file__ = %pwd
DATA = Path(__file__).parent.parent / "data"
PRIORITIES = DATA / "Priorities"
DATABASE = DATA / "databases" / "exploration.db"

import sys
sys.path.append(str(DATA.parent))  # Add the parent directory of 'modeling' to the Python path

con = sqlite3.connect(DATABASE)
cur = con.cursor()

In [None]:
import pandas as pd
import numpy as np

# Academic years to load, in chronological order.
ACADEMIC_YEARS = (
    "2016-17",
    "2017-18",
    "2018-19",
    "2019-20",
    "2020-21",
    "2021-22",
    "2022-23",
    "2023-24",
)

# One statement for every year; only the Year value is bound per call.
# The 'No ' literals (with trailing space) are matched verbatim, exactly as
# the per-year queries did -- presumably that is how the flags are stored;
# TODO confirm against the ACGR table.
ACGR_YEAR_QUERY = (
    "SELECT * FROM ACGR WHERE Year = ? and CharterSchool = 'No ' and DASS = 'No '"
)


def load_acgr_year(academic_year, connection):
    """Load the ACGR rows for a single academic year.

    Parameters
    ----------
    academic_year : str
        Value compared against the ``Year`` column, e.g. ``"2016-17"``.
    connection : sqlite3.Connection
        Open connection to the database that holds the ``ACGR`` table.

    Returns
    -------
    pandas.DataFrame
        All columns of the rows whose ``Year`` equals ``academic_year`` and
        whose ``CharterSchool`` and ``DASS`` both equal ``'No '``.
    """
    return pd.read_sql_query(ACGR_YEAR_QUERY, connection, params=(academic_year,))


# Keyed by the starting calendar year of each academic year (2016 for "2016-17").
year_df_mapping = {
    int(academic_year.split("-")[0]): load_acgr_year(academic_year, con)
    for academic_year in ACADEMIC_YEARS
}

# Per-year names kept as aliases so code that refers to a single year still works.
(
    df_2016_17,
    df_2017_18,
    df_2018_19,
    df_2019_20,
    df_2020_21,
    df_2021_22,
    df_2022_23,
    df_2023_24,
) = year_df_mapping.values()

# Columns excluded when modeling the graduation rate: every Dropout* and
# RegHSDiploma* column found in the first year's frame.
grad_rate_columns_to_drop = [
    col
    for col in df_2016_17.columns
    if col.startswith("Dropout") or col.startswith("RegHSDiploma")
]
# Columns excluded when modeling CSU/UC readiness: every UniReqs* column
# found in the first year's frame.
csu_rate_columns_to_drop = [
    col for col in df_2016_17.columns if col.startswith("UniReqs")
]

## XGBoost


### Cross-Validated Grad Rate


In [None]:
from modeling.utils.xgboost import cross_validate_xgboost, plot_cv_feature_importance
from modeling.utils.xgboost import (
    plot_top_k_features,
    plot_feature_avg_variance,
    plot_feature_time_series,
)

# Cross-validation results for the graduation-rate target, keyed by year.
cv_xgb_models = {}

target_column = "RegHSDiplomaRate"

for year, year_df in year_df_mapping.items():

    # Build a cleaned copy rather than editing the shared frame held in
    # year_df_mapping, so re-running this cell (or running the other model
    # cells) always starts from the data as loaded.
    #   1. "*" cells become NaN.
    #   2. Year keeps only the part before the dash ("2016-17" -> "2016").
    #   3. Every column is coerced to numeric; unparseable values become NaN.
    # Step 3 already covers the target column, so no separate coercion of
    # the target is needed.
    year_numeric = (
        year_df.replace("*", np.nan)
        .assign(Year=lambda frame: frame["Year"].astype(str).str.split("-").str[0])
        .apply(pd.to_numeric, errors="coerce")
    )

    # NOTE(review): grad_rate_columns_to_drop is built from every
    # RegHSDiploma* column, so the target is probably listed twice here;
    # left as-is because the helper's handling of the list is not visible.
    cv_result = cross_validate_xgboost(
        df=year_numeric,
        target_column=target_column,
        columns_to_drop=[target_column] + grad_rate_columns_to_drop,
        n_splits=5,
        print_results=True,
        n_jobs=-1,
    )

    cv_xgb_models[year] = cv_result

top_features = plot_cv_feature_importance(cv_xgb_models, top_n=7)

In [None]:
# Leading positional arguments shared by the three plotting helpers:
# the per-year CV results, the model label, the string "gain"
# (presumably the importance type -- confirm in modeling.utils.xgboost),
# and the name of the target being modeled.
xgb_plot_args = (cv_xgb_models, "XGBoost", "gain", target_column)

plot_top_k_features(*xgb_plot_args, k=7)
plot_feature_avg_variance(*xgb_plot_args, k=7)
plot_feature_time_series(*xgb_plot_args, top_features)

### Cross-Validated CSU/UC Readiness Rate


In [None]:
from modeling.utils.xgboost import cross_validate_xgboost, plot_cv_feature_importance
from modeling.utils.xgboost import (
    plot_top_k_features,
    plot_feature_avg_variance,
    plot_feature_time_series,
)

# Cross-validation results for the CSU/UC readiness target, keyed by year.
cv_xgb_models = {}

target_column = "UniReqsPercent"


for year, year_df in year_df_mapping.items():

    # Build a cleaned copy rather than editing the shared frame held in
    # year_df_mapping, so re-running this cell (or running the other model
    # cells) always starts from the data as loaded.
    #   1. "*" cells become NaN.
    #   2. Year keeps only the part before the dash ("2016-17" -> "2016").
    #   3. Every column is coerced to numeric; unparseable values become NaN.
    # Step 3 already covers the target column, so no separate coercion of
    # the target is needed.
    year_numeric = (
        year_df.replace("*", np.nan)
        .assign(Year=lambda frame: frame["Year"].astype(str).str.split("-").str[0])
        .apply(pd.to_numeric, errors="coerce")
    )

    cv_result = cross_validate_xgboost(
        df=year_numeric,
        target_column=target_column,
        columns_to_drop=csu_rate_columns_to_drop,
        n_splits=5,
        print_results=True,
        n_jobs=-1,
    )

    cv_xgb_models[year] = cv_result

In [None]:
# Feature-importance summary first; its return value feeds the time-series plot.
top_features = plot_cv_feature_importance(cv_xgb_models, top_n=5)

# Leading positional arguments shared by the remaining plotting helpers:
# the per-year CV results, the model label, the string "gain"
# (presumably the importance type -- confirm in modeling.utils.xgboost),
# and the name of the target being modeled.
xgb_plot_args = (cv_xgb_models, "XGBoost", "gain", target_column)

plot_feature_avg_variance(*xgb_plot_args)
plot_feature_time_series(*xgb_plot_args, top_features)

## Random Forest


### Cross-Validated Grad Rate


In [None]:
from modeling.utils.randomforest import (
    cross_validate_random_forest,
    plot_cv_feature_importance_rf,
    plot_top_k_features_rf,
    plot_feature_avg_variance_rf,
    plot_feature_time_series_rf,
)

# Cross-validation results for the graduation-rate target, keyed by year.
cv_models = {}

target_variable = "RegHSDiplomaRate"

for year, year_df in year_df_mapping.items():

    # Build a cleaned copy rather than editing the shared frame held in
    # year_df_mapping, so re-running this cell (or running the other model
    # cells) always starts from the data as loaded.
    #   1. "*" cells become NaN.
    #   2. Year keeps only the part before the dash ("2016-17" -> "2016").
    #   3. Every column is coerced to numeric; unparseable values become NaN.
    # Step 3 already covers the target, so the former per-column coercion is
    # dropped. It also read `target_column`, a name this cell never defines,
    # which made the cell depend on the XGBoost cells having been run.
    year_numeric = (
        year_df.replace("*", np.nan)
        .assign(Year=lambda frame: frame["Year"].astype(str).str.split("-").str[0])
        .apply(pd.to_numeric, errors="coerce")
    )

    cv_result = cross_validate_random_forest(
        df=year_numeric,
        target_column=target_variable,
        columns_to_drop=grad_rate_columns_to_drop,
        n_splits=5,
        print_results=True,
        n_jobs=8,
    )

    cv_models[year] = cv_result

In [None]:
# Feature-importance summary first; its return value drives the last two plots.
all_top_features = plot_cv_feature_importance_rf(cv_models, top_n=5)

plot_top_k_features_rf(cv_models, target_variable=target_variable, k=5)

# Keyword arguments shared by the variance and time-series plots.
rf_plot_kwargs = {
    "target_variable": target_variable,
    "top_k_features": all_top_features,
}
plot_feature_avg_variance_rf(cv_models, **rf_plot_kwargs)
plot_feature_time_series_rf(cv_models, **rf_plot_kwargs)

### Cross-Validated CSU/UC Readiness Rate


In [None]:
from modeling.utils.randomforest import (
    cross_validate_random_forest,
    plot_cv_feature_importance_rf,
    plot_top_k_features_rf,
    plot_feature_avg_variance_rf,
    plot_feature_time_series_rf,
)

# Cross-validation results for the CSU/UC readiness target, keyed by year.
cv_models = {}

target_variable = "UniReqsPercent"

for year, year_df in year_df_mapping.items():

    # Build a cleaned copy rather than editing the shared frame held in
    # year_df_mapping, so re-running this cell (or running the other model
    # cells) always starts from the data as loaded.
    #   1. "*" cells become NaN.
    #   2. Year keeps only the part before the dash ("2016-17" -> "2016").
    #   3. Every column is coerced to numeric; unparseable values become NaN.
    # Step 3 already covers the target, so the former per-column coercion is
    # dropped. It also read `target_column`, a name this cell never defines,
    # which made the cell depend on the XGBoost cells having been run.
    year_numeric = (
        year_df.replace("*", np.nan)
        .assign(Year=lambda frame: frame["Year"].astype(str).str.split("-").str[0])
        .apply(pd.to_numeric, errors="coerce")
    )

    cv_result = cross_validate_random_forest(
        df=year_numeric,
        target_column=target_variable,
        columns_to_drop=csu_rate_columns_to_drop,
        n_splits=5,
        print_results=False,
    )

    cv_models[year] = cv_result

In [None]:
# Feature-importance summary first; its return value drives the last two plots.
all_top_features = plot_cv_feature_importance_rf(cv_models, top_n=5)

plot_top_k_features_rf(cv_models, target_variable=target_variable, k=5)

# Keyword arguments shared by the variance and time-series plots.
rf_plot_kwargs = {
    "target_variable": target_variable,
    "top_k_features": all_top_features,
}
plot_feature_avg_variance_rf(cv_models, **rf_plot_kwargs)
plot_feature_time_series_rf(cv_models, **rf_plot_kwargs)

In [None]:
# Release the SQLite connection opened in the setup cell; any cell that
# queries `con` must be run before this one.
con.close()