In [1]:
import sqlite3
import sys
from pathlib import Path

# Working directory of the kernel. Path.cwd() stands in for the IPython-only
# `%pwd` magic so the cell is also valid plain Python. The value is still bound
# to `__file__` (even though it is a directory, not a file) in case other cells
# read that name.
# NOTE(review): shadowing `__file__` is fragile; consider a dedicated name.
__file__ = str(Path.cwd())

# Data locations are resolved two levels above the working directory.
DATA = Path(__file__).parent.parent / "data"
PRIORITIES = DATA / "Priorities"
DATABASE = DATA / "databases" / "exploration.db"

# sqlite3.connect() creates a new, empty database when the file is missing,
# which would only surface later as a "no such table" error. Fail fast with a
# message that points at the actual problem (usually a wrong working directory).
if not DATABASE.is_file():
    raise FileNotFoundError(
        f"SQLite database not found at {DATABASE}; "
        "check the directory the notebook kernel was started from."
    )

con = sqlite3.connect(DATABASE)
cur = con.cursor()

# Make the directory that holds `data` importable (the original comment says
# this is the parent directory of the 'modeling' package). Guarded so that
# re-running the cell does not append duplicate sys.path entries.
if str(DATA.parent) not in sys.path:
    sys.path.append(str(DATA.parent))

In [2]:
import pandas as pd
import numpy as np

# Read both source tables in full through the open SQLite connection `con`.
census_day = pd.read_sql_query("SELECT * FROM CensusDay", con)
acgr = pd.read_sql_query("SELECT * FROM ACGR", con)

# The two tables name their year column differently, so the join keys are
# listed per side: (SchoolCode, AcademicYear) on CensusDay matches
# (SchoolCode, Year) on ACGR.
census_keys = ['SchoolCode', 'AcademicYear']
acgr_keys = ['SchoolCode', 'Year']

# Inner join: only school/year combinations present in both tables are kept.
merged = census_day.merge(
    acgr,
    how='inner',
    left_on=census_keys,
    right_on=acgr_keys,
)

: 

In [None]:

import modeling.utils.lasso as lasso_utils

# Column the Lasso model is fitted against.
target_column = "RegHSDiplomaRate.TA"

# Columns whose names begin with any of these prefixes are handed to
# fit_lasso_model as columns to drop.
# NOTE(review): the target column itself matches the "RegHSDiploma" prefix;
# presumably fit_lasso_model pulls the target out before dropping - confirm.
outcome_prefixes = ("UniReqs", "RegHSDiploma", "Dropout")
columns_to_drop = [
    name for name in merged.columns if name.startswith(outcome_prefixes)
]

# First pass: fit via the project helper and plot the resulting coefficients.
X_numeric, y, sorted_pairs = lasso_utils.fit_lasso_model(
    merged,
    target_column,
    columns_to_drop,
)
lasso_utils.plot_lasso_coefficients(target_column, sorted_pairs)

# Second pass: cross-validated Lasso with a fixed seed, trained and scored
# through the project helper.
# NOTE(review): LassoCV is reached through the lasso_utils namespace, so this
# depends on that module exposing the name - confirm it is intentional.
lasso_model = lasso_utils.LassoCV(
    max_iter=10_000,
    random_state=42,
    cv=5,
)
lasso_results = lasso_utils.train_and_evaluate_lasso_model(
    X_numeric, y, lasso_model
)

# Report the metrics returned by the evaluation helper.
print(f"R^2 Score: {lasso_results['r2']}")
print(f"Mean Squared Error: {lasso_results['mse']}")
print(f"Root Mean Squared Error: {lasso_results['rmse']}")

# Diagnostic plots built from the test values and predictions in the results.
observed = lasso_results["y_test"]
predicted = lasso_results["y_pred"]
lasso_utils.plot_lasso_residuals(observed, predicted, target_column)
lasso_utils.plot_actual_predicted_lasso(observed, predicted, target_column)