In [None]:
# Core
import os
import numpy as np
import pandas as pd
import sys

# Embedding
from transformers import BertTokenizerFast, BertModel
import torch

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import spacy.util as _su
import tempfile

# Progress bars
from tqdm import tqdm
tqdm.pandas()  # apply/progress_apply ready

# Custom preprocessing & TAASSC
from utils.preproc import Preprocessor
import TAASSC_215_dev as tdev
from TAASSC_215_dev import LGR_Analysis, index_list

# Reproducibility seed (not referenced by the cells visible in this notebook)
RANDOM_STATE = 42

# Column names used downstream; CATEGORICAL_COLS are one-hot encoded after loading
TARGET_COL = "holistic_essay_score"
TEXT_COL = "text"
CATEGORICAL_COLS = [
    'gender',
    'grade_level',
    'race_ethnicity',
    'economically_disadvantaged',
]

# Directory the feature-augmented CSVs are written to
SAVE_DIR = "../data/rewrites/"

# Working directory at start-up; later cells chdir back to it after running the tools
orig_cwd = os.getcwd()

---- 

In [None]:
# Read the six cleaned input CSVs, in the order listed, into one frame each.
_input_csvs = (
    "../data/input/academic_cleaned.csv",
    "../data/input/good_student_cleaned.csv",
    "../data/input/neutral_cleaned.csv",
    "../data/input/no_descriptions_cleaned.csv",
    "../data/input/simple_english_cleaned.csv",
    "../data/input/original_cleaned_prompt.csv",
)
(
    academic,
    good_student,
    neutral,
    no_descriptions,
    simple_english,
    original,
) = map(pd.read_csv, _input_csvs)

In [None]:
# Registry of all frames keyed by dataset name; every later cell iterates over it.
dfs = dict(
    academic=academic,
    good_student=good_student,
    neutral=neutral,
    no_descriptions=no_descriptions,
    simple_english=simple_english,
    original=original,
)

In [None]:
# One-hot encode the categorical columns of every frame, keeping all levels
# and adding no indicator column for missing values.
for key in list(dfs):
    dfs[key] = pd.get_dummies(
        dfs[key],
        columns=CATEGORICAL_COLS,
        drop_first=False,
        dummy_na=False,
    )

----

### TAALED

In [None]:
# Put the local TAALED directory on the import path, then import the module.
sys.path.append("TAALED_1_4_1_Py3")
import TAALED_1_4_1 as TAALED


class _Root:
    """Stub object assigned to `TAALED.root`; its only method does nothing.

    NOTE(review): presumably stands in for the GUI root window TAALED would
    otherwise call `update_idletasks()` on -- confirm against TAALED's source.
    """

    def update_idletasks(self):
        """No-op."""
        return None


TAALED.root = _Root()
# NOTE(review): "L" looks like a platform flag -- confirm against TAALED's source.
TAALED.system = "L"

# 3) SpaCy compatibility shim: if spacy.util has no `set_data_path`,
#    install a do-nothing replacement that accepts any arguments.
try:
    _su.set_data_path
except AttributeError:
    _su.set_data_path = lambda *a, **k: None

# 4) Configure which indices to run; this dict is passed unchanged to TAALED.main
var_dict = dict(
    # word-list selections
    aw=1,
    cw=1,
    fw=1,
    # type-token ratio variants
    simple_ttr=1,
    root_ttr=1,
    log_ttr=1,
    maas_ttr=1,
    # windowed / sampled diversity indices
    mattr=1,
    msttr=1,
    hdd=1,
    # MTLD variants (key spellings are kept exactly as given)
    mltd=1,
    mltd_ma=1,
    mtld_wrap=1,
    # the only flag switched off
    indout=0,
)

def _detect_filename_col(res: pd.DataFrame) -> str:
    candidates = {"filename","file","file_name","textname","doc","document","name"}
    for c in res.columns:
        if c.lower() in candidates:
            return c
    return res.columns[0]

def _add_prefix_once(df: pd.DataFrame, prefix: str, exclude: set) -> pd.DataFrame:
    """Prefix columns unless they already start with the prefix; leave 'exclude' alone."""
    rename_map = {}
    for c in df.columns:
        if c in exclude:
            continue
        if not c.startswith(prefix):
            rename_map[c] = f"{prefix}{c}"
    return df.rename(columns=rename_map)

def _run_taaled_on_df(df: pd.DataFrame, text_col: str = "text") -> pd.DataFrame:
    """Run TAALED on every text in `df` and return `df` with the metrics attached.

    Each row's text is written to ``<index label>.txt`` in a temporary
    directory, ``TAALED.main`` is run on that directory with ``var_dict``, and
    the CSV it writes is left-joined back onto the rows by index label. Metric
    columns carry the ``taaled_`` prefix.

    Args:
        df: One text per row. The frame is not modified.
        text_col: Name of the column holding the text. Non-string values
            (e.g. NaN) are written as empty files.

    Returns:
        A new DataFrame holding the columns of `df` (minus any ``taaled_``
        columns left by an earlier run) followed by the fresh TAALED metrics.
        As with any ``DataFrame.merge`` result, the index is a new 0..n-1 range.

    Raises:
        KeyError: If `text_col` is not a column of `df`.
        ValueError: If two rows share the same index label; their text files
            would overwrite each other and the join would be ambiguous.
    """
    prefix = "taaled_"

    # The same string keys name the files and drive the join back.
    row_keys = df.index.astype(str)
    if not row_keys.is_unique:
        raise ValueError("TAALED needs unique index labels to name the per-text files.")

    with tempfile.TemporaryDirectory() as tmp_in:
        # write each text as {index}.txt
        for key, txt in zip(row_keys, df[text_col]):
            with open(os.path.join(tmp_in, f"{key}.txt"), "w", encoding="utf-8") as f:
                f.write(txt if isinstance(txt, str) else "")

        out_csv = os.path.join(tmp_in, "taaled_out.csv")
        TAALED.main(tmp_in, out_csv, var_dict)

        # load results before the temporary directory is removed
        res = pd.read_csv(out_csv)

    # normalize filename -> index key; strip only a trailing ".txt"
    fn_col = _detect_filename_col(res)
    res = res.assign(
        __idx__=res[fn_col].astype(str).str.replace(r"\.txt$", "", regex=True)
    )

    # ensure prefix 'taaled_' on all metric columns
    res = _add_prefix_once(res, prefix=prefix, exclude={fn_col, "__idx__"}).drop(columns=[fn_col])

    # On a re-run, discard metrics from the previous pass so the merge does not
    # produce suffixed duplicate columns.
    stale = [c for c in df.columns if isinstance(c, str) and c.startswith(prefix)]

    # merge back on a copy so the caller's frame never gains the helper column
    return (
        df.drop(columns=stale)
        .assign(__idx__=row_keys)
        .merge(res, how="left", on="__idx__")
        .drop(columns="__idx__")
    )


In [None]:
# ---------- UPDATE THE ORIGINAL `dfs` IN PLACE ----------
print("Running TAALED on multiple DataFrames (updating `dfs` in place):")
try:
    # Iterate over a snapshot because the dict's values are replaced inside the loop.
    for name, d in list(dfs.items()):
        print(f"  • {name}: {len(d)} texts")
        dfs[name] = _run_taaled_on_df(d)
finally:
    # Restore the start-up working directory even when a run fails, so the
    # relative paths used by later cells keep resolving.
    os.chdir(orig_cwd)

print("All DataFrames in `dfs` now include TAALED metrics with the 'taaled_' prefix (no duplicates).")

In [None]:
# Checkpoint: write every frame (now carrying TAALED metrics) to SAVE_DIR.
# The directory is created first because to_csv fails on a missing folder.
os.makedirs(SAVE_DIR, exist_ok=True)
for name, x in dfs.items():
    x.to_csv(f"{SAVE_DIR}{name}_taaled.csv", index=False)

------

### TAACO

In [None]:
# Add the local "TAACO" directory to the import path so that the
# TAACOnoGUI module (which provides runTAACO) can be imported from it.
sys.path.append("TAACO")
from TAACOnoGUI import runTAACO

# TAACO options (customize as needed); passed unchanged to runTAACO
opts = dict(
    # source-text comparisons: all off
    sourceKeyOverlap=False,
    sourceLSA=False,
    sourceLDA=False,
    sourceWord2vec=False,
    # word classes
    wordsAll=True,
    wordsContent=True,
    wordsFunction=True,
    wordsNoun=True,
    wordsPronoun=True,
    wordsArgument=True,
    wordsVerb=True,
    wordsAdjective=True,
    wordsAdverb=True,
    # overlap scopes
    overlapSentence=True,
    overlapParagraph=True,
    overlapAdjacent=True,
    overlapAdjacent2=True,
    # other indices
    otherTTR=True,
    otherConnectives=True,
    otherGivenness=True,
    # overlap methods
    overlapLSA=True,
    overlapLDA=True,
    overlapWord2vec=True,
    overlapSynonym=True,
    overlapNgrams=True,
    # extra outputs: all off
    outputTagged=False,
    outputDiagnostic=False,
)

# Helper: detect TAACO filename column
def _detect_filename_column(df_out):
    for c in df_out.columns:
        if c.lower() in {"filename","file","file_name","textname","doc","document","name"}:
            return c
    for c in df_out.columns:
        if df_out[c].astype(str).str.endswith(".txt").any():
            return c
    return df_out.columns[0]

def _add_prefix_once(df: pd.DataFrame, prefix: str, exclude: set) -> pd.DataFrame:
    """Prefix columns unless already prefixed."""
    rename_map = {}
    for c in df.columns:
        if c in exclude:
            continue
        if not c.startswith(prefix):
            rename_map[c] = f"{prefix}{c}"
    return df.rename(columns=rename_map)

# Core function: run TAACO on one df and merge results
def _run_taaco_on_df(df: pd.DataFrame, text_col: str = "text") -> pd.DataFrame:
    """Run TAACO on every text in `df` and return `df` with the metrics attached.

    Each row's text is written to ``<index label>.txt`` in a temporary
    directory, ``runTAACO`` is run on that directory with ``opts``, and the CSV
    it writes is left-joined back onto the rows by index label. Metric columns
    carry the ``taaco_`` prefix.

    Args:
        df: One text per row. The frame is not modified.
        text_col: Name of the column holding the text. Non-string values
            (e.g. NaN) are written as empty files.

    Returns:
        A new DataFrame holding the columns of `df` (minus any ``taaco_``
        columns left by an earlier run) followed by the fresh TAACO metrics.
        As with any ``DataFrame.merge`` result, the index is a new 0..n-1 range.

    Raises:
        KeyError: If `text_col` is not a column of `df`.
        ValueError: If two rows share the same index label; their text files
            would overwrite each other and the join would be ambiguous.
    """
    prefix = "taaco_"

    # The same string keys name the files and drive the join back.
    row_keys = df.index.astype(str)
    if not row_keys.is_unique:
        raise ValueError("TAACO needs unique index labels to name the per-text files.")

    with tempfile.TemporaryDirectory() as tmp_in:
        # write each essay/text to temp folder
        for key, txt in zip(row_keys, df[text_col]):
            with open(os.path.join(tmp_in, f"{key}.txt"), "w", encoding="utf-8") as f:
                f.write(txt if isinstance(txt, str) else "")

        out_csv = os.path.join(tmp_in, "taaco_out.csv")
        runTAACO(tmp_in, out_csv, opts)

        # read TAACO results before the temporary directory is removed
        res = pd.read_csv(out_csv)

    # normalize filename -> index key; strip only a trailing ".txt"
    fn_col = _detect_filename_column(res)
    res = res.assign(
        __idx__=res[fn_col].astype(str).str.replace(r"\.txt$", "", regex=True)
    )

    # ensure prefix 'taaco_' on all metric columns
    res = _add_prefix_once(res, prefix=prefix, exclude={fn_col, "__idx__"}).drop(columns=[fn_col])

    # On a re-run, discard metrics from the previous pass so the merge does not
    # produce suffixed duplicate columns.
    stale = [c for c in df.columns if isinstance(c, str) and c.startswith(prefix)]

    # merge back on a copy so the caller's frame never gains the helper column
    return (
        df.drop(columns=stale)
        .assign(__idx__=row_keys)
        .merge(res, how="left", on="__idx__")
        .drop(columns="__idx__")
    )

In [None]:

print("Running TAACO on multiple DataFrames (updating `dfs` in place):")
try:
    # Iterate over a snapshot because the dict's values are replaced inside the loop.
    for name, d in list(dfs.items()):
        print(f"  • {name}: {len(d)} texts")
        dfs[name] = _run_taaco_on_df(d)
finally:
    # Restore the start-up working directory even when a run fails, so the
    # relative paths used by later cells keep resolving.
    os.chdir(orig_cwd)

print("All DataFrames in `dfs` now include TAACO metrics with the 'taaco_' prefix (no duplicates).")

In [None]:
# Checkpoint: write every frame (now carrying TAALED and TAACO metrics) to SAVE_DIR.
# The directory is created first because to_csv fails on a missing folder.
os.makedirs(SAVE_DIR, exist_ok=True)
for name, x in dfs.items():
    x.to_csv(f"{SAVE_DIR}{name}_taaled_taaco.csv", index=False)

----

### TAASSC

In [None]:
# Prefix given to every TAASSC metric column
PREFIX = "taassc_"

print("Running TAASSC on multiple DataFrames (updating `dfs` in place):")

for name, df in dfs.items():
    # Fail fast when the text column is absent.
    if TEXT_COL not in df.columns:
        raise KeyError(f"{name}: expected a '{TEXT_COL}' column. Got: {list(df.columns)[:12]}...")

    print(f"\nComputing TAASSC metrics for: {name} (n={len(df)})")

    # On a re-run, discard metric columns left over from the previous pass.
    stale_cols = [col for col in df.columns if col.startswith(PREFIX)]
    if stale_cols:
        df = df.drop(columns=stale_cols)

    # One dict of metrics per text; a failed analysis yields an all-NaN row
    # so the row count always matches the frame.
    rows = []
    texts = df[TEXT_COL].fillna("")
    for txt in tqdm(texts, desc=f"TAASSC → {name}", total=len(df)):
        try:
            analysis = LGR_Analysis(txt)
            row = {metr: analysis.get(metr, np.nan) for metr in index_list}
        except Exception as e:
            print(f"[{name}] Error processing text: {str(txt)[:100]}... -> {e}")
            row = dict.fromkeys(index_list, np.nan)
        rows.append(row)

    # Metrics frame aligned on the source index, columns renamed with the prefix.
    taassc_metrics = pd.DataFrame.from_records(rows, index=df.index)
    taassc_metrics.columns = [f"{PREFIX}{metr}" for metr in index_list]

    # Attach the metrics column-wise and store the result back under the same key.
    df_out = pd.concat([df, taassc_metrics], axis=1)
    dfs[name] = df_out

    print(f" {name}: TAASSC metrics added ({len(index_list)} features). New shape: {df_out.shape}")

print("\n All DataFrames in `dfs` updated with TAASSC metrics (prefix='taassc_', TEXT_COL='text').")

In [None]:
# Final checkpoint: write every frame with all computed metrics to SAVE_DIR.
# The directory is created first because to_csv fails on a missing folder.
os.makedirs(SAVE_DIR, exist_ok=True)
for name, x in dfs.items():
    x.to_csv(f"{SAVE_DIR}{name}_full.csv", index=False)

-----

### Embeddings

In [None]:
# Device preference: Apple MPS first, then CUDA, otherwise CPU.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Tokenizer and encoder from the same pretrained checkpoint; the encoder is
# switched to eval mode and moved to the chosen device.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()
model = model.to(device)

def get_embeddings(text: str) -> np.ndarray:
    """
    Run one text through BERT, return the [CLS] embedding as a numpy vector.

    Args:
        text: The text to embed. Anything that is not a string (e.g. the NaN
            pandas yields for an empty CSV cell) is embedded as the empty
            string, matching how the other feature sections of this notebook
            treat missing text. Input is truncated to 512 tokens.

    Returns:
        1-D array: the last-layer hidden state of the first token.
    """
    # The tokenizer rejects non-string input, so coerce missing values first.
    if not isinstance(text, str):
        text = ""

    inputs = tokenizer(text,
                       return_tensors='pt',
                       padding=True,
                       truncation=True,
                       max_length=512)
    # move to device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    # [batch=1, seq, dim] → pick CLS token embedding
    cls_emb = outputs.last_hidden_state[0, 0, :]
    return cls_emb.cpu().numpy()

In [None]:
# Output folder for the per-dataset embedding matrices
os.makedirs("../embeddings/", exist_ok=True)

for i, df_i in dfs.items():
    # One vector per text, stacked row-wise and saved as a single .npy file.
    vectors = [
        get_embeddings(txt)
        for txt in tqdm(df_i["text"], desc=f"Rewritten {i}")
    ]
    stacked = np.vstack(vectors)
    np.save(f"../embeddings/embeddings_{i}.npy", stacked)

print("All rewrite embeddings computed and saved.")