In [None]:
# %% [markdown]
# # Model Development Pipeline for Behavioral Disruption Prediction

# %%
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, mean_squared_error, r2_score

# %% [markdown]
# ## Data Preparation (Using provided preprocessed dataset)

# %%
# Read the model-ready student-week table and keep only the rows whose
# classification target is present.
student_weeks = pd.read_csv("model_ready_student_weeks.csv")
student_weeks = student_weeks.dropna(subset=['referral_next_week'])

# Predictor columns handed to the models below.
features = [
    'weekly_referrals',
    'weekly_bus_incidents',
    'Grade_Level',
    'Gender',
    'Ethnicity',
    'LunchStatus',
    'temp',
    'humidity',
    'precip',
    'sealevelpressure',
    'windgust',
]

X = student_weeks[features]
y_classification = student_weeks['referral_next_week']
# Regression target (severity/frequency).
# NOTE(review): 'weekly_referrals' is also listed in `features`, so this target
# is present among the columns of X — confirm that is intended.
y_regression = student_weeks['weekly_referrals']

# A single 80/20 split with a fixed seed, so both targets share the same rows.
(X_train, X_test,
 y_clf_train, y_clf_test,
 y_reg_train, y_reg_test) = train_test_split(
    X, y_classification, y_regression, test_size=0.2, random_state=42)

# %% [markdown]
# ## Preprocessing Pipeline

# %%
# Column groups: numeric columns are standardized, categorical columns are
# one-hot encoded.
numeric_features = [
    'weekly_referrals', 'weekly_bus_incidents', 'Grade_Level',
    'temp', 'humidity', 'precip', 'sealevelpressure', 'windgust',
]
categorical_features = ['Gender', 'Ethnicity', 'LunchStatus']

numeric_transformer = Pipeline([('scaler', StandardScaler())])

# handle_unknown='ignore' lets transform() accept categories that were not
# seen during fit instead of raising.
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# %% [markdown]
# ## Logistic Regression (Classification)

# %%
# Baseline classifier: the shared preprocessing step followed by logistic regression.
clf_steps = [
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42)),
]
clf_pipeline = Pipeline(clf_steps)
clf_pipeline.fit(X_train, y_clf_train)

# Predict on the held-out rows and summarize per-class metrics.
y_clf_pred = clf_pipeline.predict(X_test)
clf_report = classification_report(y_clf_test, y_clf_pred)
print("Classification Report:\n", clf_report)

# %% [markdown]
# ## Linear Regression (Regression)

# %%
# Linear regression on the weekly referral count.
#
# The regression target is itself one of the columns of X (it is listed in
# `numeric_features`), so fitting with the shared `preprocessor` would let the
# model read the answer straight from its inputs and report a meaningless
# near-perfect score. Build a regression-specific preprocessor that leaves the
# target column out; ColumnTransformer drops any column that is not listed
# (remainder='drop' is the default).
reg_numeric_features = [col for col in numeric_features if col != y_reg_train.name]
reg_preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, reg_numeric_features),
    ('cat', categorical_transformer, categorical_features)])

reg_pipeline = Pipeline(steps=[('preprocessor', reg_preprocessor),
                               ('regressor', LinearRegression())])

reg_pipeline.fit(X_train, y_reg_train)
y_reg_pred = reg_pipeline.predict(X_test)

# RMSE is computed as the square root of the MSE instead of passing
# `squared=False`, which was deprecated in scikit-learn 1.4 and removed in 1.6
# (where it raises TypeError). This form works on every version.
reg_rmse = mean_squared_error(y_reg_test, y_reg_pred) ** 0.5
print("Linear Regression RMSE:", reg_rmse)
print("Linear Regression R²:", r2_score(y_reg_test, y_reg_pred))

# %% [markdown]
# ## Advanced Model - Random Forest (Classification)

# %%
# Random forest classifier (100 trees, fixed seed) behind the shared preprocessing step.
rf_steps = [
    ('preprocessor', preprocessor),
    ('random_forest', RandomForestClassifier(n_estimators=100, random_state=42)),
]
rf_pipeline = Pipeline(rf_steps)
rf_pipeline.fit(X_train, y_clf_train)

# Predict on the held-out rows and summarize per-class metrics.
y_rf_pred = rf_pipeline.predict(X_test)
rf_report = classification_report(y_clf_test, y_rf_pred)
print("Random Forest Classification Report:\n", rf_report)
