In [1]:
# this code trains on all out of school students and provides code for production and results in risk profiles for all students
# it outputs the numbers of students in each risk category for repeater risk and credit risk who are still in school

In [2]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display, Markdown

from scipy.stats import spearmanr

from sklearn.feature_selection import mutual_info_classif, mutual_info_regression

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report,
    roc_auc_score,
    confusion_matrix,
    ConfusionMatrixDisplay
)

In [3]:
# load the df - just be careful here as i had combined_data.csv in a different folder
data = pd.read_csv("final_data/combined_data.csv") 


## Process the data

In [5]:
# drop some columns we just won't use

def repeats_credits_data_processing(data):
    data.drop(columns="MaskedStudentPersonKey", inplace=True) # drops the student ID for the second time

    data.drop(columns=['total_present', 'total_absent', 
                       'total_days', 'total_tardy', 'chronic_absent_10pct', # drop attendance data this is biasing the data
                      #'graduated?', # drop graduated used for training/testing
                      'test_score_2023', 'test_score_2024', 'test_score_2025', 'test_score_2026', # missing anyways
                      'GradeLevel', # dropped bc leaky - indicates most recent level we have data for - which doesnt feel useful
                      'present_pct' # just so much missing here
                      ], inplace=True, errors='ignore')  



    # combine the discipline data and drop the individual discpline cols
    # combine discipline columns by taking the mean
    discipline_cols = ['num_discipline_2023', 'num_discipline_2024',
                       'num_discipline_2025', 'num_discipline_2026']

    # set all discipline not explicitly given 0 instead of NaN - the data supports this as only students with discipline events have any numbers - students
    # without discipline incidents have NaNS all the way across
    data[discipline_cols] = data[discipline_cols].fillna(0)

    data['mean_discipline'] = data[discipline_cols].mean(axis=1, skipna=True)

    # drop the individual discipline columns
    data = data.drop(columns=discipline_cols)

    # get rid of MS columns
    # drop all columns that start with 'MS_'
    cols_to_drop = [col for col in data.columns if col.startswith('MS_')]
    data_clean = data.drop(columns=cols_to_drop)
    
    # students no longer in school
    out_of_school_df = data_clean[(data_clean['DropOut'] == 1) | (data_clean['graduated?'] == 1)].reset_index(drop=True)

    # students still in school 
    in_school_df = data_clean[(data_clean['DropOut'] != 1) & (data_clean['graduated?'] != 1)]
    
    return out_of_school_df, in_school_df

repeats_credits_outschool_df, repeats_credits_inschool_df = repeats_credits_data_processing(data=data)

In [6]:
# create a dataframe for binned credits

def make_filtered_credits_repeats_df(df):
    # create a new dataframe for binning credits earned - for training/testing later
    binned_credits_earned_df = df.copy()
    binned_credits_earned_df['Not all credits'] = (binned_credits_earned_df['HS_PctEarned'] < 0.8).astype(int) #0.999999
    # display(binned_credits_earned_df)
    
    binned_credits_clean = binned_credits_earned_df.dropna(subset=['mean_discipline', 'MAP_TestRITScore', 'HS_SenseofBelonging',
                                                              ]).reset_index(drop=True)
    
    repeat_clean = df.dropna(subset=['HS_PctEarned', 'mean_discipline', 'MAP_TestRITScore', 'HS_SenseofBelonging']).reset_index(drop=True)
    
    return binned_credits_clean, repeat_clean

binned_credits_clean, repeat_clean = make_filtered_credits_repeats_df(repeats_credits_outschool_df)

## Production

In [8]:
def credits_repeats_train_logistic_regression_full(df, target_col, feature_cols, random_state=42):
    # select features and target
    X = df[feature_cols]
    y = df[target_col]
    
    # scale
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)  # on all training data

    # train
    clf = LogisticRegression(max_iter=1000, random_state=random_state, class_weight='balanced')
    clf.fit(X_scaled, y)

    # print("Model trained on all available data.")
    return clf, scaler

def credits_repeats_apply_logistic_model(clf, scaler, new_df, feature_cols):
    # apply the same scaler (don't fit again!)
    X_new_scaled = scaler.transform(new_df[feature_cols])
    
    # predict probabilities
    new_df['pred_proba'] = clf.predict_proba(X_new_scaled)[:, 1]
    new_df['pred_label'] = clf.predict(X_new_scaled)
    
    return new_df

def credits_repeats_assign_risk_tiers_by_percentile(df, prob_col='pred_proba'):
    # sort by probability
    df = df.sort_values(prob_col, ascending=True).reset_index(drop=True)
    
    # define percentile cut points (quintiles)
    quantiles = df[prob_col].quantile([0.2, 0.4, 0.6, 0.8]).values
    
    # assign tiers
    def get_tier(prob):
        if prob <= 0.2:
            return "Low Risk"
        elif prob <= 0.4:
            return "Moderately Low Risk"
        elif prob <= 0.6:
            return "Moderate Risk"
        elif prob <= 0.8:
            return "Moderately High Risk"
        else:
            return "High Risk"
    
    df['risk_tier'] = df[prob_col].apply(get_tier)
    
    # summary: number of students in each tier, sorted Low to High
    tier_order = ["Low Risk", "Moderately Low Risk", "Moderate Risk", 
                  "Moderately High Risk", "High Risk"]
    summary = df['risk_tier'].value_counts().reindex(tier_order).fillna(0).astype(int)
    
    return df, summary

In [9]:
# REPEATER PRODUCTION
# 1. train on full data
repeat_clf, repeat_scaler = credits_repeats_train_logistic_regression_full(
    df=repeat_clean,
    target_col='HS_IsRepeater',
    feature_cols=['HS_PctEarned', 'mean_discipline', 'MAP_TestRITScore', 'HS_SenseofBelonging']
)

# 2. apply to unseen data
repeat_predicted_new_data = credits_repeats_apply_logistic_model(
    repeat_clf,
    repeat_scaler,
    new_df=repeats_credits_inschool_df.dropna(subset=['HS_PctEarned', 'mean_discipline', 'MAP_TestRITScore', 'HS_SenseofBelonging']).reset_index(drop=True),
    feature_cols=['HS_PctEarned', 'mean_discipline', 'MAP_TestRITScore', 'HS_SenseofBelonging']
)

# apply to data
repeat_df_with_tiers, repeat_risk_summary = credits_repeats_assign_risk_tiers_by_percentile(repeat_predicted_new_data, prob_col='pred_proba')
print(repeat_risk_summary)

risk_tier
Low Risk                12107
Moderately Low Risk      3571
Moderate Risk            1602
Moderately High Risk     1221
High Risk                2180
Name: count, dtype: int64


In [10]:
# CREDITS PRODUCTION
# 1. train on full data
credit_clf, credit_scaler = credits_repeats_train_logistic_regression_full(
    df=binned_credits_clean,    
    target_col='Not all credits',
    feature_cols=['mean_discipline', 'MAP_TestRITScore', 'HS_SenseofBelonging']
)

# 2. apply to unseen data
credits_predicted_new_data = credits_repeats_apply_logistic_model(
    credit_clf,
    credit_scaler,
    new_df=repeats_credits_inschool_df.dropna(subset=['mean_discipline', 'MAP_TestRITScore', 'HS_SenseofBelonging',
                ]).reset_index(drop=True),
    feature_cols=['mean_discipline', 'MAP_TestRITScore', 'HS_SenseofBelonging']
)

# apply to data
credit_df_with_tiers, credit_risk_summary = credits_repeats_assign_risk_tiers_by_percentile(credits_predicted_new_data, prob_col='pred_proba')
print(credit_risk_summary)

risk_tier
Low Risk                3316
Moderately Low Risk     6759
Moderate Risk           5101
Moderately High Risk    3755
High Risk               2434
Name: count, dtype: int64
