# SCORING NOTEBOOK
- Requires previous low_scorer and high_scorer runs (this notebook loads saved fold models and feature metadata from `MODEL_DIR`).

In [1]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import json
tqdm.pandas()

from xgboost import XGBRegressor

# Reproducibility
RANDOM_STATE = 42

# Target & features setup
TARGET_COL = "holistic_essay_score"
TEXT_COL = "text"
# Raw categorical columns that get one-hot encoded in the rewrite frames.
CATEGORICAL_COLS = ['gender', 'grade_level', 'race_ethnicity', 'economically_disadvantaged']

# NOTE(review): neither path below is read by any cell visible in this notebook
# (feature metadata is loaded through META_KEY_PATH). LOW_META_PATH also lacks
# the "../" prefix that HIGH_META_PATH has — confirm whether that is intended.
HIGH_META_PATH = "../feature_meta_high.json"
LOW_META_PATH  = "feature_meta_low.json"

# Rewrite CSVs; the file-name template is formatted with the rewrite index.
REWRITES_DIR = "../data/processed/sat/"
REWRITES_PREFIX = "rewrite_{}.csv"

# Embedding matrices; the file-name template is formatted with the rewrite index.
EMB_DIR = "../embeddings/sat/"
EMB_PREFIX = "embeddings_rewrite_{}.npy"

# Destination of the scored results table.
SAVE_DIR = "../data/results/sat/"
SAVE_NAME = "data_sat_scored.csv"

MODEL_DIR = "../model/run_01/"

# MODEL_DIR already ends with "/", so the suffix must not start with another
# separator (the previous form produced ".../run_01//high/feature_meta.json").
META_KEY_PATH = MODEL_DIR + "high/feature_meta.json"

# Rewrites
REWRITE = "paraphrase"  # "paraphrase" or "random"

In [2]:
# Original essays: remove every column whose name contains "xgb_" before use.
original = pd.read_csv(f"{REWRITES_DIR}original.csv")
xgb_named_cols = original.filter(like="xgb_").columns
original = original.drop(columns=xgb_named_cols)
original_emb = np.load(f"{EMB_DIR}embeddings_original.npy")

# Both registries share the same keys: "original" plus the integer rewrite
# indices 1..6, so a frame and its embedding matrix can be looked up together.
DATASET_REW = {"original": original}
DATASET_EMB = {"original": original_emb}

for i in range(1, 7):
    rewrite_csv = REWRITES_DIR + REWRITES_PREFIX.format(i)
    rewrite_npy = EMB_DIR + EMB_PREFIX.format(i)
    DATASET_REW[i] = pd.read_csv(rewrite_csv)
    DATASET_EMB[i] = np.load(rewrite_npy)

In [3]:
# Feature metadata saved next to the models: per-group column lists and "emb_dim".
with open(META_KEY_PATH) as meta_file:
    feature_meta = json.load(meta_file)

# Which metadata entry supplies the *tabular* column order of each group.
_META_KEY_BY_GROUP = {
    "style":  "style",    # full style block (OHE + TAAL/TAACO/TAASSC)
    "emb":    "emb_ohe",  # OHE-only block used in embedding model
    "taaled": "taaled",
    "taaco":  "taaco",
    "taassc": "taassc",
    "full":   "style",    # full = style cols + embeddings
}

# Column order for the *tabular* part of each group
feature_cols_by_group = {
    group: feature_meta[meta_key]
    for group, meta_key in _META_KEY_BY_GROUP.items()
}

emb_dim = feature_meta["emb_dim"]

In [4]:
import uuid

# 1) sanity check: all DFs must have same length as `original`, because the
#    paper ids below are assigned to rows purely by position.
n = DATASET_REW['original'].shape[0]
for key, df_ in DATASET_REW.items():
    if len(df_) != n:
        raise ValueError(
            f"Dataset '{key}' has {len(df_)} rows but 'original' has {n}; "
            "rows must line up one-to-one to share paper ids."
        )

# 2) generate one UUID per paper (random: ids differ between runs)
paper_ids = [str(uuid.uuid4()) for _ in range(n)]

# 3) assign the same ids to every DF (cv_fold is left as loaded)
for df_ in list(DATASET_REW.values()):
    df_.loc[:, "paper_id"] = paper_ids

# NOTE(review): cv_lookup is not read by any cell visible in this notebook.
cv_lookup = dict(zip(original["text"], original["cv_fold"]))

# One-hot encode the raw categorical columns of each rewrite frame.
# `original` is skipped, exactly as before.
for key, df_ in list(DATASET_REW.items()):
    if key == "original":
        continue

    # Only encode columns that are still present, so re-running this cell on
    # already-encoded frames is a no-op instead of a KeyError.
    cols_to_encode = [c for c in CATEGORICAL_COLS if c in df_.columns]
    if not cols_to_encode:
        continue

    df_enc = pd.get_dummies(df_, columns=cols_to_encode, drop_first=False)

    DATASET_REW[key] = df_enc

In [5]:
def load_xgb(path: Path) -> XGBRegressor:
    """Create a fresh ``XGBRegressor`` and load the model file at ``path`` into it.

    Args:
        path: Location of the saved model; it is converted with ``str()``
            before being handed to ``load_model``.

    Returns:
        The regressor after ``load_model`` has been called on it.
    """
    regressor = XGBRegressor()
    model_file = str(path)
    regressor.load_model(model_file)
    return regressor

# Logical group name -> folder name; every folder is the group name prefixed
# with "x_". Tuple order fixes the dict's iteration order.
SUBFOLDERS = {
    group: f"x_{group}"
    for group in ("full", "style", "emb", "taassc", "taaco", "taaled")
}

def build_models(root=None, folds=5):
    """Load every per-fold XGBoost model for each feature group and side.

    Model files are read from ``<root>/<side>/<subfolder>/xgb_fold<i>.json``,
    where ``side`` is "high" or "low", ``subfolder`` comes from ``SUBFOLDERS``
    and ``i`` runs from 1 to ``folds`` inclusive.

    Args:
        root: Directory containing the "high" and "low" model trees. When
            omitted (``None``) the notebook-level ``MODEL_DIR`` is used;
            previously the ``None`` default failed inside ``Path(None)``.
        folds: Number of folds to load.

    Returns:
        Nested dict indexed as ``models[group][side][fold]``.
    """
    if root is None:
        root = MODEL_DIR
    root = Path(root)
    return {
        group: {
            side: {
                i: load_xgb(root / side / sub / f"xgb_fold{i}.json")
                for i in range(1, folds + 1)
            }
            for side in ("high", "low")
        }
        for group, sub in SUBFOLDERS.items()
    }

# Load all fold models under MODEL_DIR; indexed as models[group][side][fold].
models = build_models(MODEL_DIR)

In [6]:
# Dataset key -> value written to the "k" column of the results:
# 0 for the original essays, the rewrite index itself for rewrites 1..6.
name_to_k = {"original": 0}
name_to_k.update({rewrite_idx: rewrite_idx for rewrite_idx in range(1, 7)})

In [7]:
from utils.compare_df import compare_columns

# Report the column differences between the original frame ('og') and rewrite 1 ('rew').
compare_columns(DATASET_REW['original'], DATASET_REW[1], 'og', 'rew')

Columns only in og (0): []
Columns only in rew (2): ['content_preserved', 'rewritten_text']
Common columns (376). Example: ['text', 'holistic_essay_score', 'prompt_name', 'gender_F', 'gender_M', 'grade_level_6.0', 'grade_level_8.0', 'grade_level_9.0', 'grade_level_10.0', 'grade_level_11.0']


In [None]:
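# NOTE: this commented-out version is superseded by the next cell, which selects
# feature columns using the saved feature_meta order instead of the frame's own column order.
# groups = ["full", "style", "emb", "taaled", "taaco", "taassc"]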

# rows = []

# for name, df_ in DATASET_REW.items():

#     # --- Build feature DF x (keep df_ unchanged)
#     if name != "original":
#         x = df_.drop(columns=["full_text","text_tokens",f"content_preserved",f"rewritten_text","text"], errors="ignore").copy()
#     else:
#         x = df_.drop(columns=["full_text","text_tokens","text"], errors="ignore").copy()

#     # --- Keep references for output
#     true_score = x["holistic_essay_score"].copy()
#     paper_ids = x["paper_id"].copy()
#     low_ses = x["economically_disadvantaged_1"].copy()
#     race = x["race_ethnicity_White"].copy()
#     gender = x["gender_M"].copy()
#     prompt = x["prompt_name"].copy()

#     grade_cols = [c for c in x.columns if c.startswith("grade_level_")]
#     grade = x[grade_cols].idxmax(axis=1).str.replace("grade_level_", "").astype(float)

#     cv_fold = x["cv_fold"].astype(int).copy()

#     # --- Remove label/meta cols from features
#     x = x.drop(columns=["paper_id","cv_fold","holistic_essay_score","prompt_name"], errors="ignore")

#     # --- Column lists (derived from *x*, not global)
#     ohe_cols = [c for c in x.columns if c.startswith("gender") or c.startswith("grade_level") or c.startswith("race_") or c.startswith("economically_disadvantaged")]
#     taaled_cols = [c for c in x.columns if c.startswith("taaled_")]
#     taaco_cols = [c for c in x.columns if c.startswith("taaco_")]
#     taassc_cols = [c for c in x.columns if c.startswith("taassc_")]

#     # --- Build feature matrices for each scorer
#     X_emb = DATASET_EMB[name]  # embedding matrix aligned row-wise to df_
#     X_by_group = {
#         "full": np.hstack([x.to_numpy(), X_emb]),
#         "style": x.to_numpy(),
#         "emb": np.hstack([x[ohe_cols].to_numpy(), X_emb]),
#         "taaled": x[ohe_cols + taaled_cols].to_numpy(),
#         "taaco": x[ohe_cols + taaco_cols].to_numpy(),
#         "taassc": x[ohe_cols + taassc_cols].to_numpy(),
#     }

#     # --- Prepare output arrays for all groups/sides
#     preds = {g: {"high": np.empty(len(x), float), "low": np.empty(len(x), float)} for g in groups}

#     # --- Predict per fold for each group
#     for f in (1, 2, 3, 4, 5):
#         mask = (cv_fold == f)
#         if not np.any(mask):
#             continue
#         for g in groups:
#             Xf = X_by_group[g][mask]
#             preds[g]["high"][mask] = models[g]["high"][f].predict(Xf)
#             preds[g]["low"][mask]  = models[g]["low"][f].predict(Xf)

#     # --- Assemble row block with 14 score columns
#     out = {
#         "essay_id": paper_ids.values,
#         "k": np.full(len(paper_ids), name_to_k[name], dtype=int),
#         "true_score": true_score.values,
#         "low_SES": low_ses.values,
#         "race_white": race.values,
#         "gender_male": gender.values,
#         "prompt_name": prompt.values,
#         "grade_level": grade.values,
#         "cv_fold": cv_fold.values,
#     }
#     for g in groups:
#         out[f"score_high_{g}"] = preds[g]["high"]
#         out[f"score_low_{g}"]  = preds[g]["low"]

#     rows.append(pd.DataFrame(out))

# results_df = pd.concat(rows, ignore_index=True)

In [8]:
groups = ["full", "style", "emb", "taaled", "taaco", "taassc"]

rows = []

for name, df_ in DATASET_REW.items():

    # --- Build feature DF x (drop() returns a new frame, so df_ stays unchanged).
    #     errors="ignore" lets one column list serve both the original frame and
    #     the rewrites (only the rewrites carry content_preserved / rewritten_text).
    x = df_.drop(
        columns=[
            "full_text", "text_tokens", "content_preserved",
            "rewritten_text", "text"
        ],
        errors="ignore"
    )

    # --- Embeddings are paired with rows purely by position, so fail early and
    #     clearly if the two inputs do not have the same number of rows.
    X_emb = DATASET_EMB[name]
    if X_emb.shape[0] != len(x):
        raise ValueError(
            f"Embedding rows ({X_emb.shape[0]}) do not match data rows "
            f"({len(x)}) for dataset '{name}'"
        )

    # --- Keep references for output
    true_score = x["holistic_essay_score"].copy()
    paper_ids  = x["paper_id"].copy()
    low_ses    = x["economically_disadvantaged_1"].copy()
    race       = x["race_ethnicity_White"].copy()
    gender     = x["gender_M"].copy()
    prompt     = x["prompt_name"].copy()

    # Recover the grade from its one-hot columns: take the name of the column
    # holding the row maximum and strip the "grade_level_" prefix.
    grade_cols = [c for c in x.columns if c.startswith("grade_level_")]
    grade      = (
        x[grade_cols]
        .idxmax(axis=1)
        .str.replace("grade_level_", "", regex=False)
        .astype(float)
    )

    cv_fold = x["cv_fold"].astype(int).copy()

    # --- Remove label/meta cols from features
    x = x.drop(
        columns=["paper_id", "cv_fold", "holistic_essay_score", "prompt_name"],
        errors="ignore"
    )

    # --- Sanity check: every column required by a scored group must exist
    x_cols = set(x.columns)
    for g in groups:
        missing = set(feature_cols_by_group[g]) - x_cols
        if missing:
            raise ValueError(f"Missing columns for group '{g}' in dataset '{name}': {missing}")

    # --- Build feature matrices for each scorer using saved column order.
    #     An explicit float dtype avoids the object-dtype array pandas produces
    #     when bool one-hot columns are mixed with float feature columns.
    def _tabular(group):
        return x[feature_cols_by_group[group]].to_numpy(dtype=float)

    X_by_group = {
        "style":  _tabular("style"),
        "full":   np.hstack([_tabular("full"), X_emb]),
        "emb":    np.hstack([_tabular("emb"), X_emb]),
        "taaled": _tabular("taaled"),
        "taaco":  _tabular("taaco"),
        "taassc": _tabular("taassc"),
    }

    # --- Prepare output arrays for all groups/sides.
    #     NaN-initialised: rows whose cv_fold is not covered by the fold loop
    #     below stay NaN instead of holding uninitialised memory from np.empty.
    preds = {
        g: {
            "high": np.full(len(x), np.nan, dtype=float),
            "low":  np.full(len(x), np.nan, dtype=float),
        }
        for g in groups
    }

    # --- Predict per fold for each group: each row is scored by the models
    #     stored under its own cv_fold number.
    for f in (1, 2, 3, 4, 5):
        mask = (cv_fold == f).to_numpy()  # plain boolean array for positional indexing
        if not mask.any():
            continue
        for g in groups:
            Xf = X_by_group[g][mask]
            preds[g]["high"][mask] = models[g]["high"][f].predict(Xf)
            preds[g]["low"][mask]  = models[g]["low"][f].predict(Xf)

    # --- Assemble row block with 12 score columns (6 groups x high/low)
    out = {
        "essay_id":    paper_ids.values,
        "k":           np.full(len(paper_ids), name_to_k[name], dtype=int),
        "true_score":  true_score.values,
        "low_SES":     low_ses.values,
        "race_white":  race.values,
        "gender_male": gender.values,
        "prompt_name": prompt.values,
        "grade_level": grade.values,
        "cv_fold":     cv_fold.values,
    }
    for g in groups:
        out[f"score_high_{g}"] = preds[g]["high"]
        out[f"score_low_{g}"]  = preds[g]["low"]

    rows.append(pd.DataFrame(out))

# One block of rows per dataset, stacked with a fresh 0..N-1 index.
results_df = pd.concat(rows, ignore_index=True)


In [11]:
# Create the output directory first so the export does not fail when it is missing.
# The row index is still written, as before, so the file layout is unchanged.
os.makedirs(SAVE_DIR, exist_ok=True)
results_df.to_csv(os.path.join(SAVE_DIR, SAVE_NAME))

In [9]:
# Preview the first rows of the combined results table.
results_df.head()

Unnamed: 0,essay_id,k,true_score,low_SES,race_white,gender_male,prompt_name,grade_level,cv_fold,score_high_full,...,score_high_style,score_low_style,score_high_emb,score_low_emb,score_high_taaled,score_low_taaled,score_high_taaco,score_low_taaco,score_high_taassc,score_low_taassc
0,67570e05-9010-4f23-bfb3-6727d0bdbf52,0,1,False,False,True,Facial action coding system,10.0,2,1.510331,...,1.688369,1.720933,1.423283,1.185211,1.322788,1.858505,2.065959,1.877284,1.580307,1.833736
1,34de3178-d2c8-41c5-b4f8-23515c4c5415,0,1,False,True,True,"""A Cowboy Who Rode the Waves""",6.0,1,1.508305,...,1.614392,2.262538,2.039971,1.834972,1.788552,2.075867,1.570694,2.026517,1.969121,2.191933
2,6ae767e9-a9e2-4e92-b58e-4ae9e5782be5,0,1,False,False,False,Driverless cars,10.0,2,1.971364,...,1.910707,1.811026,2.347559,1.867283,1.886353,1.98302,2.266404,1.746567,1.942335,1.612309
3,8ce81be3-31a4-4371-96a0-20b5edaab65c,0,1,False,True,True,"""A Cowboy Who Rode the Waves""",6.0,1,1.606889,...,1.553044,1.558396,1.265636,1.413,1.576103,1.500084,1.52812,2.039864,1.472583,1.523446
4,473a5d1a-b132-453f-aa79-67a54819b277,0,1,False,True,True,The Face on Mars,8.0,3,1.569121,...,2.065023,1.738528,1.895425,1.553763,2.277539,1.968696,2.279782,1.967162,1.939634,2.037477
