In [1]:
# # CERAS — Data exploration & cleaning

# This notebook:
# - Inspects schemas and basic stats
# - Cleans and extracts behavioral features from **MEU-Mobile KSD**
# - Parses reasoning traces from **REVEAL** (open + eval splits)
# - Produces processed CSVs into `data/processed/`
# - Saves some simple visualizations (histograms, counts)


In [2]:
import pandas as pd
import numpy as np

# Load the three raw inputs from ./raw: the MEU-Mobile KSD workbook and the
# two REVEAL CSV splits (open first, then eval).
meu_raw = pd.read_excel(io='./raw/MEU-Mobile KSD 2016.xlsx')
reveal_raw, reveal_eval_raw = (
    pd.read_csv(csv_path)
    for csv_path in ('./raw/reveal_open.csv', './raw/reveal_eval.csv')
)

In [4]:
# Sanity check after loading: report the container type of the MEU data and
# the (rows, columns) shape of each raw input.
# getattr() falls back to None if `meu_raw` has no `.shape` attribute.
print("MEU dataframe:", type(meu_raw), "shape:", getattr(meu_raw, "shape", None))
for caption, frame in (
    ("REVEAL open shape:", reveal_raw),
    ("REVEAL eval shape:", reveal_eval_raw),
):
    print(caption, frame.shape)


MEU dataframe: <class 'pandas.core.frame.DataFrame'> shape: (2911, 72)
REVEAL open shape: (1146, 50)
REVEAL eval shape: (4956, 50)


In [5]:
# Preview the first five rows of the raw MEU frame to inspect its column layout.
meu_raw.head(n=5)

Unnamed: 0,Subject,Hold .,Hold t,Hold i,Hold e,Hold Shift,Hold 5,Hold Shift.1,Hold Caps,Hold r,...,Size Caps,Size r,Size o,Size a,Size n,Size l,Size Enter,AvH,AvP,AvA
0,1.0,89.0,92.0,64.0,85.0,123.0,82.0,70.0,101.0,84.0,...,0.225806,0.225806,0.322581,0.290323,0.225806,0.354839,0.258065,88.071429,0.190971,0.288018
1,1.0,90.0,88.0,99.0,83.0,123.0,101.0,81.0,94.0,88.0,...,0.225806,0.225806,0.322581,0.322581,0.258065,0.387097,0.322581,92.071429,0.186514,0.274194
2,1.0,87.0,90.0,83.0,65.0,79.0,73.0,96.0,62.0,64.0,...,0.225806,0.193548,0.290323,0.322581,0.225806,0.258065,0.225806,83.571429,0.186171,0.271889
3,1.0,71.0,81.0,62.0,72.0,83.0,94.0,89.0,104.0,73.0,...,0.225806,0.225806,0.258065,0.290323,0.225806,0.322581,0.322581,81.428571,0.177943,0.278802
4,1.0,89.0,72.0,82.0,82.0,62.0,89.0,68.0,88.0,69.0,...,0.290323,0.225806,0.322581,0.258065,0.225806,0.290323,0.387097,77.428571,0.1632,0.28341


In [6]:
# Work on an independent (deep) copy so that `meu_raw` stays untouched by cleaning.
meu = meu_raw.copy(deep=True)

In [8]:
# Discard columns in which every value is missing.
meu = meu.dropna(axis="columns", how="all")

In [9]:
# Remember the column list as it stands before coercion, then convert every
# column to a numeric dtype. Values that cannot be parsed as numbers become NaN
# (errors="coerce") instead of raising.
original_meu_cols = meu.columns.tolist()
meu = meu[original_meu_cols].apply(pd.to_numeric, errors="coerce")

In [10]:
# Keep only columns holding at least `thresh` non-missing values, where
# `thresh` is 10% of the current row count (truncated to an integer).
thresh = int(len(meu) * 0.1)
meu = meu.dropna(axis="columns", thresh=thresh)

In [11]:
# Renumber rows 0..n-1 and expose that position as an explicit
# `meu_row_index` column.
meu = meu.reset_index(drop=True).assign(meu_row_index=lambda frame: frame.index)

In [12]:
# Drop rows that are entirely NaN (if any).
# The synthetic `meu_row_index` column is populated for every row, so it has to
# be left out of the emptiness check: with it included, `how="all"` can never
# match a row and this step silently does nothing.
# Surviving rows keep the `meu_row_index` they were given before the drop, so
# gaps in that column mark the positions of removed rows.
meu = meu.dropna(
    axis=0,
    how="all",
    subset=[c for c in meu.columns if c != "meu_row_index"],
).reset_index(drop=True)

In [13]:
# Show the (rows, columns) shape of `meu` at this point in the cleaning sequence.
meu.shape

(2911, 73)

In [20]:
# Work on a copy of the open split and record which of the known text-heavy
# columns are actually present. This cell only identifies them; it does not
# drop anything from `reveal`.
reveal = reveal_raw.copy(deep=True)
text_fields = [
    name
    for name in ('question', 'full_answer', 'step', 'decontextualized_step', 'answer')
    if name in reveal.columns
]

In [21]:
# Keep small identifiers + numeric behavioral columns
keep_cols = ['question_id', 'answer_model', 'step_idx']
numeric_cols = reveal.select_dtypes(include=[np.number]).columns.tolist()
reveal_keep = [c for c in keep_cols if c in reveal.columns] + numeric_cols

In [22]:
# Remove duplicate column names while preserving first-occurrence order
# (dict keys are unique and keep insertion order). `seen` ends up holding the
# set of distinct names.
reveal_keep = list(dict.fromkeys(reveal_keep))
seen = set(reveal_keep)

In [23]:
# Work on a copy of the eval split and record which of the known text-heavy
# columns are actually present. This cell only identifies them; it does not
# drop anything from `reveal_eval`.
# NOTE: this rebinds `text_fields`, replacing the list built for the open split.
reveal_eval = reveal_eval_raw.copy(deep=True)
text_fields = [
    name
    for name in ('question', 'full_answer', 'step', 'decontextualized_step', 'answer')
    if name in reveal_eval.columns
]

In [24]:
# Keep small identifiers + numeric behavioral columns
keep_cols = ['question_id', 'answer_model', 'step_idx']
numeric_cols = reveal_eval.select_dtypes(include=[np.number]).columns.tolist()
reveal_keep_eval = [c for c in keep_cols if c in reveal_eval.columns] + numeric_cols

In [25]:
# Remove duplicate column names while preserving first-occurrence order
# (dict keys are unique and keep insertion order). `seen_eval` ends up holding
# the set of distinct names.
reveal_keep_eval = list(dict.fromkeys(reveal_keep_eval))
seen_eval = set(reveal_keep_eval)