In [1]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    accuracy_score,
    f1_score,
    confusion_matrix
)
from sklearn.model_selection import KFold
from tqdm import tqdm
from xgboost import XGBRegressor
tqdm.pandas() 

# Random seed constant.
RANDOM_STATE = 42

# Column names: regression target, raw essay text, categorical attributes.
TARGET_COL = "holistic_essay_score"
TEXT_COL = "text"
CATEGORICAL_COLS = [
    "gender",
    "grade_level",
    "race_ethnicity",
    "economically_disadvantaged",
]

# Input files: the essay table (CSV) and the embedding matrix (.npy).
DF_HIGH = "../data/full/data_full_high.csv"
EMB_HIGH = "../embeddings/embeddings_high.npy"

# Output location: directory for this run and the name of the scored CSV.
SAVE_DIR = "../model/run_01/high"
SAVE_NAME = "data_high_scored.csv"

In [2]:
# Load the essay table and the embedding matrix from the configured paths.
df = pd.read_csv(DF_HIGH)
X_emb_metrics = np.load(EMB_HIGH)

# The embeddings are later stacked column-wise next to dataframe columns with
# np.hstack, which requires equal row counts. Fail here with a clear message
# instead of an opaque shape error further down.
if X_emb_metrics.shape[0] != len(df):
    raise ValueError(
        f"Row mismatch: {DF_HIGH} has {len(df)} rows but {EMB_HIGH} has "
        f"{X_emb_metrics.shape[0]}"
    )

In [3]:
# Model inputs: every column except the raw text, the target and the prompt name.
# DataFrame.drop returns a new frame, so no explicit copy of df is needed first.
df_input = df.drop(columns=[TEXT_COL, TARGET_COL, "prompt_name"])

In [4]:
def _columns_starting_with(prefix):
    """Return the df_input column names that start with `prefix` (str or tuple of str)."""
    return [name for name in df_input.columns if name.startswith(prefix)]


# Columns whose names start with one of the categorical-attribute prefixes.
ohe_cols = _columns_starting_with(
    ("gender", "grade_level", "race_", "economically_disadvantaged")
)

# One column group per feature-name prefix.
taaled_cols = _columns_starting_with("taaled_")
taaco_cols = _columns_starting_with("taaco_")
taassc_cols = _columns_starting_with("taassc_")

In [5]:
# Regression target.
y = df[TARGET_COL]

# Feature matrices for the ablation. np.hstack converts each frame to an array
# and concatenates the pieces column-wise.
x1 = np.hstack([df_input, X_emb_metrics])  # full (OHE + Emb + All Style)
x2 = df_input.to_numpy()  # Style (OHE + Style); nothing to stack, so convert directly
x3 = np.hstack([df_input[ohe_cols], X_emb_metrics])  # Embedding (OHE + Emb)
x4 = np.hstack([df_input[ohe_cols], df_input[taaled_cols]])  # TAALED (OHE + TAALED)
x5 = np.hstack([df_input[ohe_cols], df_input[taaco_cols]])  # TAACO (OHE + TAACO)
x6 = np.hstack([df_input[ohe_cols], df_input[taassc_cols]])  # TAASSC (OHE + TAASSC)


In [None]:
# Record which columns feed each feature set, so the saved models can be
# matched to their inputs later.

# Base style features (everything in df_input)
style_cols = list(df_input.columns)

# OHE subset used in emb + all others. ohe_cols already holds the names, so the
# frame does not need to be sliced just to read them back.
ohe_cols_list = list(ohe_cols)

taaled_feature_cols = ohe_cols_list + list(taaled_cols)
taaco_feature_cols = ohe_cols_list + list(taaco_cols)
taassc_feature_cols = ohe_cols_list + list(taassc_cols)

feature_meta = {
    "style": style_cols,
    "emb_ohe": ohe_cols_list,
    "taaled": taaled_feature_cols,
    "taaco": taaco_feature_cols,
    "taassc": taassc_feature_cols,
    # Number of embedding columns (second axis of the embedding matrix).
    "emb_dim": int(X_emb_metrics.shape[1]),
}

# Write the metadata next to the models; json and os come from the import cell.
os.makedirs(SAVE_DIR, exist_ok=True)
with open(os.path.join(SAVE_DIR, "feature_meta.json"), "w", encoding="utf-8") as f:
    json.dump(feature_meta, f, indent=2)


In [None]:
# Feature matrices keyed by tag; the tag names the model sub-directory and the
# OOF prediction column for that set.
feature_sets = dict(
    x_full=x1,
    x_style=x2,
    x_emb=x3,
    x_taaled=x4,
    x_taaco=x5,
    x_taassc=x6,
)

# Every matrix must have the same number of rows; the first one is the reference.
row_counts = [len(matrix) for matrix in feature_sets.values()]
n_rows = row_counts[0]
assert row_counts.count(n_rows) == len(row_counts), "All X sets must have same # rows"

# Target as a plain numpy array (falls back to np.asarray for non-pandas input).
y_arr = np.asarray(y) if not hasattr(y, "to_numpy") else y.to_numpy()

# One shuffled 5-fold splitter with a fixed seed, reused for every feature set.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Validation-fold id per row; -1 until it is filled in during the first CV run.
fold_ids = np.full(shape=n_rows, fill_value=-1, dtype=np.int16)

os.makedirs(SAVE_DIR, exist_ok=True)

def to_numpy(X):
    """Return X itself when it is already an ndarray, otherwise X.to_numpy()."""
    if isinstance(X, np.ndarray):
        return X
    return X.to_numpy()

# Run K-fold CV for each feature set: fit one XGBoost regressor per fold, save
# it, and collect out-of-fold (OOF) predictions as a new column of df.
first_tag = next(iter(feature_sets))  # fold ids are recorded on this pass only
n_splits = kf.get_n_splits()

for tag, X_any in feature_sets.items():
    X = to_numpy(X_any).astype(np.float32, copy=False)

    # One model directory per feature set; created once, not once per fold.
    m_dir = os.path.join(SAVE_DIR, tag)  # no leading slash
    os.makedirs(m_dir, exist_ok=True)

    oof_pred = np.full(n_rows, np.nan, dtype=np.float32)

    for fold, (trn_idx, val_idx) in enumerate(
        tqdm(kf.split(X), total=n_splits, desc=f"{n_splits}-Fold CV ({tag})"),
        start=1,
    ):
        model = XGBRegressor(
            n_estimators=1000,
            learning_rate=0.05,
            max_depth=6,
            subsample=0.8,
            colsample_bytree=0.8,
            objective="reg:squarederror",
            random_state=RANDOM_STATE,
            n_jobs=-1,
            tree_method="hist"
        )
        model.fit(X[trn_idx], y_arr[trn_idx])
        model.save_model(os.path.join(m_dir, f"xgb_fold{fold}.json"))

        # store fold id once, using the first feature set
        if tag == first_tag:
            fold_ids[val_idx] = fold

        # OOF predictions for this feature set
        oof_pred[val_idx] = model.predict(X[val_idx]).astype(np.float32)

    # attach OOF column for this set
    df[f"xgb_oof_pred_{tag}"] = oof_pred

    # quick OOF RMSE for this set: mean_squared_error returns the MSE, so take
    # the square root to report a true RMSE in score units.
    rmse = float(np.sqrt(mean_squared_error(y_arr, oof_pred)))
    print(f"[{tag}] OOF RMSE: {rmse:.5f}")

# attach cv fold (1..n_splits) once
df["cv_fold"] = fold_ids

In [None]:
# Turn each continuous OOF prediction into an integer score column.
for tag in ("x_full", "x_style", "x_emb", "x_taaled", "x_taaco", "x_taassc"):
    nearest = np.rint(df[f"xgb_oof_pred_{tag}"])   # round to nearest int
    bounded = nearest.clip(lower=1, upper=6)       # keep within 1–6
    df[f"xgb_oof_pred_{tag}_int"] = bounded.astype(np.int16)  # store as int16

In [None]:
# Regression metrics on the continuous OOF predictions of every feature set.
y_true = df["holistic_essay_score"].to_numpy()

# Map tags to descriptions
tag_labels = {
    "x_full":  "x_full (OHE + EMB + Style)",
    "x_style": "x_style (OHE + Style)",
    "x_emb": "x_emb (OHE + EMB)",
    "x_taaled": "x_taaled (OHE + TAALED)",
    "x_taaco": "x_taaco (OHE + TAACO)",
    "x_taassc": "x_taassc (OHE + TAASSC)",
}

for tag, label in tag_labels.items():
    y_pred = df[f"xgb_oof_pred_{tag}"].to_numpy()

    # mean_squared_error returns the MSE; the square root gives the RMSE that
    # the printed label promises.
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    mae  = mean_absolute_error(y_true, y_pred)
    r2   = r2_score(y_true, y_pred)

    print(f"{label}: RMSE={rmse:.4f}, MAE={mae:.4f}, R2={r2:.4f}")

In [None]:
# Classification-style metrics on the rounded (integer) OOF predictions.
y_true = df["holistic_essay_score"]

for tag in tag_labels:
    label = tag_labels[tag]
    y_pred = df[f"xgb_oof_pred_{tag}_int"]

    # Exact-match rate and unweighted mean of per-class F1.
    acc, f1 = (
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, average="macro"),
    )

    print(f"{label}: Accuracy={acc:.4f}, F1-macro={f1:.4f}")

In [None]:
# Row-normalised confusion matrix for the full feature set, saved as a PNG.
# y_true is defined here so the cell does not depend on a variable left over
# from an earlier cell.
y_true = df["holistic_essay_score"]
y_pred = df["xgb_oof_pred_x_full_int"]

score_labels = [1, 2, 3, 4, 5, 6]

# confusion matrix
cm = confusion_matrix(y_true, y_pred, labels=score_labels)

# normalize row-wise (%); a score with no true samples would divide by zero,
# so those rows are left at 0 instead of becoming NaN.
row_totals = cm.sum(axis=1, keepdims=True)
cm_percent = np.divide(
    cm.astype("float"),
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals > 0,
) * 100

# plot
plt.figure(figsize=(8,6))
im = plt.imshow(cm_percent, interpolation="nearest", cmap="YlGnBu")
plt.colorbar(im, fraction=0.046, pad=0.04)

# annotate with %; switch to white text on the darker (>= 50%) cells
for i in range(cm_percent.shape[0]):
    for j in range(cm_percent.shape[1]):
        plt.text(
            j, i, f"{cm_percent[i, j]:.1f}%",
            ha="center", va="center",
            color="black" if cm_percent[i, j] < 50 else "white"
        )

plt.xticks(ticks=np.arange(len(score_labels)), labels=score_labels)
plt.yticks(ticks=np.arange(len(score_labels)), labels=score_labels)
plt.xlabel("Predicted essay score")
plt.ylabel("True essay score")

# savefig does not create missing directories, so make sure the target exists.
fig_path = "../tables/sat_2/confusion_matrix_high.png"
os.makedirs(os.path.dirname(fig_path), exist_ok=True)
plt.savefig(fig_path, dpi=300)
plt.show()

In [None]:
SCORE_LABELS = [1, 2, 3, 4, 5, 6]


def _plot_confusion_percent(y_true, y_pred, title):
    """Show a row-normalised (%) confusion matrix heatmap over SCORE_LABELS.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted integer scores, passed straight to confusion_matrix.
    title : str
        Figure title.
    """
    cm = confusion_matrix(y_true, y_pred, labels=SCORE_LABELS)

    # normalize row-wise (%); rows with no true samples stay at 0 rather than
    # turning into NaN through a division by zero.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_percent = np.divide(
        cm.astype("float"),
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals > 0,
    ) * 100

    plt.figure(figsize=(8,6))
    im = plt.imshow(cm_percent, interpolation="nearest", cmap="YlGnBu")
    plt.colorbar(im, fraction=0.046, pad=0.04)

    # annotate with %; switch to white text on the darker (>= 50%) cells
    for i in range(cm_percent.shape[0]):
        for j in range(cm_percent.shape[1]):
            plt.text(
                j, i, f"{cm_percent[i, j]:.1f}%",
                ha="center", va="center",
                color="black" if cm_percent[i, j] < 50 else "white"
            )

    plt.xticks(ticks=np.arange(len(SCORE_LABELS)), labels=SCORE_LABELS)
    plt.yticks(ticks=np.arange(len(SCORE_LABELS)), labels=SCORE_LABELS)
    plt.xlabel("Predicted essay score")
    plt.ylabel("True essay score")
    plt.title(title)
    plt.show()


y_true = df["holistic_essay_score"]

# One confusion matrix per feature set, built from the rounded OOF predictions.
for tag, label in tag_labels.items():
    y_pred = df[f"xgb_oof_pred_{tag}_int"]
    _plot_confusion_percent(
        y_true, y_pred, f"Confusion Matrix - High SES Ranker — {label}"
    )


In [None]:
# Write the scored table into SAVE_DIR. Plain concatenation of SAVE_DIR and
# SAVE_NAME has no separator and would write "<...>/highdata_high_scored.csv"
# one level above the run directory, so join the two as path components.
os.makedirs(SAVE_DIR, exist_ok=True)
df.to_csv(os.path.join(SAVE_DIR, SAVE_NAME), index=False)