# SCORING NOTEBOOK
- Requires previous low_scorer and high_scorer runs.

In [None]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
tqdm.pandas()

# Fixed seed value kept for reproducibility.
RANDOM_STATE = 42

# Column names: the score column, the essay text column, and the
# categorical attribute columns.
TARGET_COL = "holistic_essay_score"
TEXT_COL = "text"
CATEGORICAL_COLS = [
    "gender",
    "grade_level",
    "race_ethnicity",
    "economically_disadvantaged",
]

# Where the combined results CSV is written (directory prefix + file name).
SAVE_DIR = "../data/results/"
SAVE_NAME = "data_no_desc_scored_final.csv"

# Rewrite mode selector.
REWRITE = "paraphrase"  # "paraphrase" or "random"

In [None]:
# academic = pd.read_csv("../cleaned_data/rewrites/prompted/academic_full.csv")
# good_student = pd.read_csv("../cleaned_data/rewrites/prompted/good_student_full.csv")
# neutral = pd.read_csv("../cleaned_data/rewrites/prompted/neutral_full.csv")
# no_descriptions = pd.read_csv("../cleaned_data/rewrites/prompted/no_descriptions_full.csv")
# simple_english = pd.read_csv("../cleaned_data/rewrites/prompted/simple_english_full.csv")
# original = pd.read_csv("../cleaned_data/rewrites/prompted/original_full_fold.csv")

# academic_emb = np.load("../embeddings/scorer_input/embeddings_academic.npy")
# good_student_emb = np.load("../embeddings/scorer_input/embeddings_good_student.npy")
# neutral_emb = np.load("../embeddings/scorer_input/embeddings_neutral.npy")
# no_descriptions_emb = np.load("../embeddings/scorer_input/embeddings_no_descriptions.npy")
# simple_english_emb = np.load("../embeddings/scorer_input/embeddings_simple_english.npy")
# original_emb = np.load("../embeddings/scorer_input/embeddings_original.npy")

In [None]:
# Tabular data: the un-preprocessed originals followed by the six cleaned
# "no_desc" rewrite tables (rewrite indices 0..5).
original = pd.read_csv("../data/full/data_full_no_preproc.csv")
rew_0, rew_1, rew_2, rew_3, rew_4, rew_5 = (
    pd.read_csv(f"../data/rewrites/no_desc/no_desc_rewritten_{idx}_cleaned.csv")
    for idx in range(6)
)

# Pre-computed embedding arrays, one per table loaded above.
original_emb = np.load("../embeddings/embeddings_original_full.npy")
rew_0_emb, rew_1_emb, rew_2_emb, rew_3_emb, rew_4_emb, rew_5_emb = (
    np.load(f"../embeddings/no_desc/embeddings_rewritten_{idx}.npy")
    for idx in range(6)
)

# Both lookups share the same keys, in the same order: "original", then 0..5.
_DATASET_KEYS = ("original", 0, 1, 2, 3, 4, 5)

DATASET_REW = dict(
    zip(_DATASET_KEYS, (original, rew_0, rew_1, rew_2, rew_3, rew_4, rew_5))
)

DATASET_EMB = dict(
    zip(
        _DATASET_KEYS,
        (
            original_emb,
            rew_0_emb,
            rew_1_emb,
            rew_2_emb,
            rew_3_emb,
            rew_4_emb,
            rew_5_emb,
        ),
    )
)

In [None]:
import uuid

# 1) Sanity check: every table, and its embedding array, must have exactly as
#    many rows as `original`. Rows are matched purely by position further down
#    (shared paper IDs, hstack with the embeddings), so a mismatch must stop
#    the run here. Raised explicitly because `assert` is stripped under -O.
n = original.shape[0]
for key_, df_ in DATASET_REW.items():
    if len(df_) != n:
        raise ValueError(
            f"Lengths differ between original and rewrites: dataset {key_!r} "
            f"has {len(df_)} rows, expected {n}."
        )
    if DATASET_EMB[key_].shape[0] != n:
        raise ValueError(
            f"Embeddings for dataset {key_!r} have {DATASET_EMB[key_].shape[0]} "
            f"rows, expected {n}."
        )

# 2) Generate one UUID per paper (row position).
#    NOTE: uuid4 is random, so the IDs differ from one run to the next.
paper_ids = [str(uuid.uuid4()) for _ in range(n)]

# 3) Give every table the same per-row paper ID and the fold assignment of
#    `original`. The fold values are taken as a plain array so they are copied
#    by position rather than by index alignment.
fold_values = original["cv_fold"].to_numpy()
for df_ in DATASET_REW.values():
    df_["paper_id"] = paper_ids
    df_["cv_fold"] = fold_values


In [None]:
def load_xgb(path: Path) -> XGBRegressor:
    """Create an ``XGBRegressor`` and load the saved model at *path* into it.

    Args:
        path: Location of the saved model file; it is converted to ``str``
            before being handed to ``load_model``.

    Returns:
        The regressor after ``load_model`` has been called on it.
    """
    # NOTE(review): XGBRegressor is not imported in any cell visible here --
    # confirm it is imported before this cell runs.
    regressor = XGBRegressor()
    regressor.load_model(str(path))
    return regressor

# Logical scorer-group name -> model sub-folder name; every folder is the
# group name prefixed with "x_".
SUBFOLDERS = {
    group_name: f"x_{group_name}"
    for group_name in ("full", "style", "emb", "taassc", "taaco", "taaled")
}

def build_models(root="../cleaned_data", folds=5):
    """Load every per-fold model for each scorer group and side.

    Models are read from ``<root>/model_<side>/<subfolder>/xgb_fold<i>.json``
    for each group in ``SUBFOLDERS``, each side in ("high", "low") and each
    fold ``i`` from 1 to *folds* inclusive.

    Args:
        root: Directory that contains the ``model_high`` / ``model_low`` folders.
        folds: Number of folds to load, numbered starting at 1.

    Returns:
        Nested dict indexed as ``result[group][side][fold]``.
    """
    base_dir = Path(root)
    loaded = {}
    for group, sub in SUBFOLDERS.items():
        per_side = {}
        for side in ("high", "low"):
            model_dir = base_dir / f"model_{side}" / sub
            per_side[side] = {
                i: load_xgb(model_dir / f"xgb_fold{i}.json")
                for i in range(1, folds + 1)
            }
        loaded[group] = per_side
    return loaded

# Load all scorer models once, using build_models' default root and fold count;
# the result is indexed as models[group][side][fold].
models = build_models()

In [None]:
# Dataset key -> integer written to the `k` column of the results:
# "original" maps to 0 and rewrite index i maps to i + 1.
name_to_k = {"original": 0}
name_to_k.update({idx: idx + 1 for idx in range(6)})

In [None]:
# Scorer groups to evaluate; each group has a "high" and a "low" model per fold.
groups = ["full", "style", "emb", "taaled", "taaco", "taassc"]

# One result DataFrame per dataset ("original" + each rewrite), concatenated
# into `results_df` once the loop is done.
rows = []

for name, df_ in DATASET_REW.items():

    # --- Build feature DF x (keep df_ unchanged): drop the raw-text columns,
    #     plus the rewrite-specific columns for rewritten datasets.
    #     errors="ignore" tolerates columns that are absent.
    drop_cols = ["full_text", "text_tokens", "text"]
    if name != "original":
        drop_cols += [f"content_preserved_{name}", f"rewritten_text_{name}"]
    x = df_.drop(columns=drop_cols, errors="ignore").copy()

    # --- Keep references for output
    true_score = x["holistic_essay_score"].copy()
    paper_ids  = x["paper_id"].copy()
    low_ses    = x["economically_disadvantaged_1"].copy()
    race       = x["race_ethnicity_White"].copy()
    gender     = x["gender_M"].copy()
    prompt     = x["prompt_name"].copy()
    cv_fold    = x["cv_fold"].astype(int).copy()

    # --- Remove label/meta cols from features
    x = x.drop(columns=["paper_id", "cv_fold", "holistic_essay_score", "prompt_name"], errors="ignore")

    # --- Column lists (derived from *x*, not global), selected by name prefix.
    #     `ohe_cols` are presumably the one-hot encoded categorical columns --
    #     TODO confirm against the preprocessing step.
    ohe_cols     = [c for c in x.columns if c.startswith(("gender", "grade_level", "race_", "economically_disadvantaged"))]
    taaled_cols  = [c for c in x.columns if c.startswith("taaled_")]
    taaco_cols   = [c for c in x.columns if c.startswith("taaco_")]
    taassc_cols  = [c for c in x.columns if c.startswith("taassc_")]

    # --- Build feature matrices for each scorer.
    #     "style" uses every remaining column of x; "full" appends the
    #     embeddings to those; the others use ohe_cols plus one feature family.
    X_emb = DATASET_EMB[name]  # embedding matrix aligned row-wise to df_
    X_by_group = {
        "full":    np.hstack([x.to_numpy(), X_emb]),
        "style":   x.to_numpy(),
        "emb":     np.hstack([x[ohe_cols].to_numpy(), X_emb]),
        "taaled":  x[ohe_cols + taaled_cols].to_numpy(),
        "taaco":   x[ohe_cols + taaco_cols].to_numpy(),
        "taassc":  x[ohe_cols + taassc_cols].to_numpy(),
    }

    # --- Prepare output arrays for all groups/sides.
    #     NaN-filled rather than np.empty: a row whose cv_fold matches none of
    #     the folds below then shows up as NaN instead of uninitialised memory.
    preds = {
        g: {"high": np.full(len(x), np.nan), "low": np.full(len(x), np.nan)}
        for g in groups
    }

    # --- Predict per fold for each group: rows with cv_fold == f are scored
    #     by the fold-f models.
    for f in (1, 2, 3, 4, 5):
        # Plain boolean array so NumPy indexing does not depend on the Series index.
        mask = (cv_fold == f).to_numpy()
        if not mask.any():
            continue
        for g in groups:
            Xf = X_by_group[g][mask]
            preds[g]["high"][mask] = models[g]["high"][f].predict(Xf)
            preds[g]["low"][mask]  = models[g]["low"][f].predict(Xf)

    # --- Assemble row block: metadata plus 12 score columns
    #     (6 groups x high/low).
    out = {
        "essay_id": paper_ids.values,
        "k": np.full(len(paper_ids), name_to_k[name], dtype=int),
        "true_score": true_score.values,
        "low_SES": low_ses.values,
        "race_white": race.values,
        "gender_male": gender.values,
        "prompt_name": prompt.values,
        "cv_fold": cv_fold.values,
    }
    for g in groups:
        out[f"score_high_{g}"] = preds[g]["high"]
        out[f"score_low_{g}"]  = preds[g]["low"]

    rows.append(pd.DataFrame(out))

# Stack the per-dataset blocks into a single table with a fresh 0..N-1 index.
results_df = pd.concat(rows, ignore_index=True)


In [None]:
# Write the combined results to SAVE_DIR/SAVE_NAME without the index column.
# The output directory is created first so a missing folder cannot make the
# write fail after the whole scoring run has finished.
save_path = Path(SAVE_DIR) / SAVE_NAME
save_path.parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(save_path, index=False)

In [None]:
# Preview the first rows of the combined results table.
results_df.head()