In [3]:
# --- CELDA 1: RUTAS Y LIBRERÍAS ---

from pathlib import Path
import pandas as pd, numpy as np, re

# Locate the project root so the notebook also works when launched from /notebooks.
def find_root(start=None, max_up=6):
    """Return the closest directory, at or above ``start``, that contains ``data/raw``.

    Parameters
    ----------
    start : str or pathlib.Path, optional
        Directory where the search begins; defaults to the current working directory.
    max_up : int, default 6
        Number of directories inspected: ``start`` itself, then its successive parents.

    Returns
    -------
    pathlib.Path
        The first inspected directory holding a ``data/raw`` entry.

    Raises
    ------
    FileNotFoundError
        If none of the inspected directories contains ``data/raw``.
    """
    p = Path.cwd() if start is None else Path(start)
    for _ in range(max_up):
        if (p / "data" / "raw").exists():
            return p
        p = p.parent
    # The message was stored with mis-decoded characters; it is restored to readable Spanish.
    raise FileNotFoundError("No se encontró la carpeta 'data/raw'.")

# Project directories, all derived from the detected root.
ROOT      = find_root()
DATA_RAW  = ROOT / "data" / "raw"
DATA_PROC = ROOT / "data" / "processed"
REPORTS   = ROOT / "reports" / "tables"

# Create the output folders if they are missing; exist_ok makes this safe to re-run.
for d in (DATA_PROC, REPORTS):
    d.mkdir(parents=True, exist_ok=True)

# The check-mark emoji was stored mis-decoded; it is restored here.
print("Rutas configuradas correctamente ✅")


Rutas configuradas correctamente ‚úÖ


In [4]:
# --- CELDA 2: CARGA DEL EXCEL PERÚ ---

# Raw workbook expected inside data/raw; stop immediately if it is not there.
EXCEL_FILE = DATA_RAW / "PERU_PISA2022.xlsx"
assert EXCEL_FILE.exists(), f"No se encontró {EXCEL_FILE}"

per = pd.read_excel(EXCEL_FILE, engine="openpyxl")
# Upper-case and trim the headers so later lookups use one spelling.
# str() keeps this from crashing when a header is not text (e.g. a numeric header).
per.columns = [str(c).upper().strip() for c in per.columns]
print("Archivo cargado:", EXCEL_FILE.name)
print("Shape:", per.shape)


Archivo cargado: PERU_PISA2022.xlsx
Shape: (6968, 1278)


In [5]:
# --- CELDA 3: DETECTAR COLUMNAS PV DE MATH ---

def detect_pv_cols(columns, domain="MATH", k=10):
    """List the plausible-value column names for ``domain``.

    Names matching ``PV<number><domain>`` (case-insensitive, anchored at both ends)
    are returned ordered by their number. When no name matches that pattern, the
    result is the alphabetically sorted names that start with ``"PV"`` and contain
    ``domain``.

    Parameters
    ----------
    columns : iterable of str
        Candidate column names.
    domain : str, default "MATH"
        Domain suffix, interpolated into the regular expression as-is.
    k : int, default 10
        Accepted for interface compatibility; the body does not use it.

    Returns
    -------
    list of str
    """
    pv_regex = re.compile(rf"PV(\d+){domain}$", re.IGNORECASE)

    numbered = []
    for name in columns:
        hit = pv_regex.match(name)
        if hit:
            numbered.append((int(hit.group(1)), name))

    if not numbered:
        # Fallback: loose prefix/substring match, plain lexicographic order.
        return sorted(name for name in columns if name.startswith("PV") and domain in name)

    # Tuples sort by PV number first and by name on ties.
    return [name for _, name in sorted(numbered)]

# Collect the mathematics plausible-value columns; at least five are required to continue.
pv_math = detect_pv_cols(per.columns, domain="MATH", k=10)
assert 5 <= len(pv_math), f"No se detectaron suficientes PVs de MATH. Detectados: {pv_math}"
print("PVs detectados:", pv_math[0:3], "...")


PVs detectados: ['PV1MATH', 'PV2MATH', 'PV3MATH'] ...


In [8]:
# --- Reemplaza tus funciones/umbrales por estos ---

# PISA 2022 cut scores (mathematics), ascending. Note: 606.99 is the correct cut between levels 4 and 5.
# NOTE(review): every score below the first cut is labelled "1c"; confirm that no separate
# "below 1c" band is wanted.
THR8 = [295.47, 357.77, 420.07, 482.38, 544.68, 606.99, 669.30]

def score_to_level8(s):
    """Translate a score into one of the eight labels "1c", "1b", "1a", "2", ..., "6".

    A score belongs to the first band whose upper cut it is strictly below, so a
    score equal to a cut falls in the higher band. Scores at or above the last cut
    are "6". Missing scores (as judged by ``pd.isna``) yield ``np.nan``.
    """
    if pd.isna(s):
        return np.nan
    # Walk the cuts in ascending order; the first one the score falls below decides the label.
    for upper_cut, label in zip(THR8, ("1c", "1b", "1a", "2", "3", "4", "5")):
        if s < upper_cut:
            return label
    return "6"

def level8_to_3(label):
    """Collapse an eight-level label into "LOW", "MED" or "HIGH".

    "1c", "1b" and "1a" map to "LOW"; "2", "3" and "4" to "MED"; "5" and "6" to
    "HIGH". Missing or unrecognised labels yield ``np.nan``.
    """
    if pd.isna(label):
        return np.nan
    band_of = {
        "1c": "LOW", "1b": "LOW", "1a": "LOW",
        "2": "MED", "3": "MED", "4": "MED",
        "5": "HIGH", "6": "HIGH",
    }
    return band_of.get(label, np.nan)

# Student-level score: mean of the detected plausible values, then the 8-label and 3-band levels.
per["MATH_MEAN"]       = per[pv_math].mean(axis=1)
per["MATH_LEVEL_8LAB"] = per["MATH_MEAN"].apply(score_to_level8)
per["MATH_LEVEL_3CAT"] = per["MATH_LEVEL_8LAB"].apply(level8_to_3)

# Numeric 0-6 level. Later cells read per["MATH_LEVEL_0_6"], but no cell in the notebook
# created it, so a fresh Restart & Run All failed with KeyError. "1c" and "1b" collapse
# into level 0, which matches the counts recorded in the saved output
# (level 0 = 2413 = 508 + 1905; level 1 = 2177 = "1a").
LEVEL8_TO_0_6 = {"1c": 0, "1b": 0, "1a": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6}
per["MATH_LEVEL_0_6"] = per["MATH_LEVEL_8LAB"].map(LEVEL8_TO_0_6)

# Printed counts only (nothing is saved here). Reindexing shows empty levels as 0.
orden = ["1c","1b","1a","2","3","4","5","6"]
print("Muestras por nivel (1c–6):")
print(per["MATH_LEVEL_8LAB"].value_counts().reindex(orden).fillna(0).astype(int))

print("\nMuestras por categoría (LOW/MED/HIGH):")
print(per["MATH_LEVEL_3CAT"].value_counts())


Muestras por nivel (1c‚Äì6):
MATH_LEVEL_8LAB
1c     508
1b    1905
1a    2177
2     1486
3      687
4      179
5       25
6        1
Name: count, dtype: int64

Muestras por categor√≠a (LOW/MED/HIGH):
MATH_LEVEL_3CAT
LOW     4590
MED     2352
HIGH      26
Name: count, dtype: int64


In [9]:
# --- Simple counts ---
# This cell reads a column that must have been created earlier in the session.
# Fail with an explicit message instead of a bare KeyError when it is missing.
assert "MATH_LEVEL_0_6" in per.columns, (
    "Falta la columna MATH_LEVEL_0_6 en `per`; créala antes de ejecutar esta celda."
)

# The emoji, dash and accents below were stored mis-decoded; they are restored here.
print("📊 Muestras por nivel (0–6):")
print(per["MATH_LEVEL_0_6"].value_counts().sort_index())

print("\n📊 Muestras por categoría (LOW / MED / HIGH):")
print(per["MATH_LEVEL_3CAT"].value_counts())


üìä Muestras por nivel (0‚Äì6):
MATH_LEVEL_0_6
0    2413
1    2177
2    1486
3     687
4     179
5      25
6       1
Name: count, dtype: int64

üìä Muestras por categor√≠a (LOW / MED / HIGH):
MATH_LEVEL_3CAT
LOW     4590
MED     2352
HIGH      26
Name: count, dtype: int64


In [10]:
# Candidate predictor columns, grouped by the tables they are listed under.
FEATURES_PAPER = [
    # Table 5
    "ST001D01T", "ST004D01T",
    "ST250Q01JA", "ST250Q02JA", "ST250Q03JA", "ST250Q04JA", "ST250Q05JA",
    "ST251Q01JA", "ST251Q02JA", "ST251Q03JA", "ST251Q04JA", "ST251Q06JA", "ST251Q07JA",
    "ST253Q01JA",
    "ST254Q01JA", "ST254Q02JA", "ST254Q03JA", "ST254Q04JA", "ST254Q05JA", "ST254Q06JA",
    "ST255Q01JA", "ST230Q01JA", "ST259Q01JA",
    "ST019AQ01T", "ST022Q01TA", "ST226Q01JA", "ST125Q01NA",
    "ST062Q01TA", "ST062Q02TA", "ST062Q03TA",
    # Table 6
    "ST038Q03NA", "ST038Q04NA", "ST038Q05NA", "ST038Q06NA", "ST038Q07NA", "ST038Q08NA",
    "ST038Q09JA", "ST038Q10JA", "ST038Q11JA",
    "ST294Q01JA", "ST294Q02JA", "ST294Q03JA", "ST294Q04JA", "ST294Q05JA",
    "ST295Q01JA", "ST295Q02JA", "ST295Q03JA", "ST295Q04JA", "ST295Q05JA",
    "ST016Q01NA",
    "ST297Q01JA", "ST297Q03JA", "ST297Q05JA", "ST297Q06JA", "ST297Q07JA", "ST297Q09JA",
    "REPEAT", "LANGN",
]

# Keep only the listed columns that exist in the loaded frame, in list order.
feat_avail = [name for name in FEATURES_PAPER if name in per.columns]
X0 = per.loc[:, feat_avail].copy()
print("Variables seleccionadas:", X0.shape[1])


Variables seleccionadas: 58


In [11]:
# --- CELDA 6: LIMPIEZA BÁSICA ---
# Single threshold for both filters: a column or row is dropped once its share of
# missing values reaches this fraction.
MAX_NA_FRAC = 0.95

# 1) Drop columns that are almost entirely missing.
na_frac_col = X0.isna().mean()
cols_keep = [c for c in X0.columns if na_frac_col[c] < MAX_NA_FRAC]
X1 = X0[cols_keep].copy()

# 2) Drop columns with a single distinct non-missing value.
nunique_eff = X1.nunique(dropna=True)
cols_keep2 = [c for c in X1.columns if nunique_eff[c] > 1]
X2 = X1[cols_keep2].copy()

# 3) Drop rows that are almost entirely missing.
row_na_frac = X2.isna().mean(axis=1)
mask_rows = row_na_frac < MAX_NA_FRAC
X3 = X2.loc[mask_rows].reset_index(drop=True)

# Targets filtered with the same row mask and re-indexed the same way, so X3 and
# y_aligned stay row-aligned. All three target columns must already exist in `per`.
y_aligned = per.loc[mask_rows, ["MATH_MEAN","MATH_LEVEL_0_6","MATH_LEVEL_3CAT"]].reset_index(drop=True)
assert len(X3) == len(y_aligned), "X3 e y_aligned tienen distinto número de filas."
print("Después de limpieza → X3:", X3.shape, " y:", y_aligned.shape)


Despu√©s de limpieza ‚Üí X3: (6968, 58)  y: (6968, 3)


In [12]:
# --- CELDA 7: TIPOS, IMPUTACIÓN Y CODIFICACIÓN (LANGN con OHE) ---

import pandas as pd
import numpy as np
from pandas.api.types import is_numeric_dtype

# Work on a copy so X3 stays untouched.
X_proc = X3.copy()
cols = list(X_proc.columns)

# 1) Variable lists by type (a name is kept only if it exists in X3)
BIN_VARS = [name for name in (
    "ST004D01T", "REPEAT",
    "ST250Q01JA", "ST250Q02JA", "ST250Q03JA", "ST250Q04JA", "ST250Q05JA",
    "ST297Q01JA", "ST297Q03JA", "ST297Q05JA", "ST297Q06JA", "ST297Q07JA", "ST297Q09JA",
) if name in cols]

SCALAR_VARS = [name for name in ("ST259Q01JA", "ST016Q01NA") if name in cols]

# Change made here: LANGN is included among the one-hot encoded variables.
OHE_VARS = [name for name in ("ST001D01T", "ST019AQ01T", "ST022Q01TA", "LANGN") if name in cols]

# No frequency encoding is applied.
HIGH_CARD_NOM = []

ORD_VARS = [name for name in (
    "ST251Q01JA", "ST251Q02JA", "ST251Q03JA", "ST251Q04JA", "ST251Q06JA", "ST251Q07JA",
    "ST253Q01JA",
    "ST254Q01JA", "ST254Q02JA", "ST254Q03JA", "ST254Q04JA", "ST254Q05JA", "ST254Q06JA",
    "ST255Q01JA",
    "ST230Q01JA",
    "ST062Q01TA", "ST062Q02TA", "ST062Q03TA",
    "ST038Q03NA", "ST038Q04NA", "ST038Q05NA", "ST038Q06NA", "ST038Q07NA", "ST038Q08NA",
    "ST038Q09JA", "ST038Q10JA", "ST038Q11JA",
    "ST294Q01JA", "ST294Q02JA", "ST294Q03JA", "ST294Q04JA", "ST294Q05JA",
    "ST295Q01JA", "ST295Q02JA", "ST295Q03JA", "ST295Q04JA", "ST295Q05JA",
    "ST226Q01JA", "ST125Q01NA",
) if name in cols]

# Whatever was not assigned to a list above is treated as "other categorical".
already = {*BIN_VARS, *SCALAR_VARS, *OHE_VARS, *HIGH_CARD_NOM, *ORD_VARS}
OTHER_CAT = [name for name in X_proc.columns if name not in already]

# 2) Imputation
# Binary + nominal columns (OHE_VARS and OTHER_CAT included) -> most frequent value.
mode_cols = BIN_VARS + OHE_VARS + OTHER_CAT
for col in mode_cols:
    modes = X_proc[col].mode(dropna=True)
    # A column with no observed values has no mode; 0 is used as the filler then.
    fill_value = 0 if modes.empty else modes.iloc[0]
    X_proc[col] = X_proc[col].fillna(fill_value)

# Ordinal columns -> median.
for col in ORD_VARS:
    col_median = X_proc[col].median()
    X_proc[col] = X_proc[col].fillna(col_median)

# Scalar columns -> mean.
for col in SCALAR_VARS:
    col_mean = X_proc[col].mean()
    X_proc[col] = X_proc[col].fillna(col_mean)

# 3) Encoding
# One-hot encode the nominal columns after casting them to string;
# drop_first=True removes one dummy per variable to avoid collinearity.
# With no nominal columns, an empty frame sharing X_proc's index is used instead.
X_ohe = (
    pd.get_dummies(X_proc[OHE_VARS].astype('string'), prefix=OHE_VARS, drop_first=True)
    if OHE_VARS
    else pd.DataFrame(index=X_proc.index)
)

# 4) Final assembly: existing numeric-like columns followed by the one-hot columns.
# NOTE(review): OTHER_CAT columns are imputed earlier but are not part of num_like,
# so they do not reach X_full — confirm this is intended.
num_like = SCALAR_VARS + ORD_VARS + BIN_VARS  # no _freq columns
if num_like:
    base_df = X_proc[num_like]
else:
    base_df = pd.DataFrame(index=X_proc.index)
X_full = pd.concat([base_df, X_ohe], axis=1)

# Cast the binary columns to int.
for col in BIN_VARS:
    if col in X_full.columns:
        X_full[col] = pd.to_numeric(X_full[col], errors='coerce')
        # Coercion can introduce NaN; refill with the mode (0 when there is none).
        if X_full[col].isna().any():
            modes = X_full[col].mode(dropna=True)
            X_full[col] = X_full[col].fillna(0 if modes.empty else modes.iloc[0])
        observed = set(pd.unique(X_full[col]))
        # Columns coded only with 1/2 are shifted to 0/1.
        if observed <= {1, 2}:
            X_full[col] = X_full[col] - 1
        X_full[col] = X_full[col].round().astype(int)

# Summary of how many source variables fell into each type and the final matrix shape.
# The arrow in the one-hot line was stored mis-decoded; it is restored here.
print("Resumen:")
print(f"  Binarias:        {len(BIN_VARS)}")
print(f"  Ordinales:       {len(ORD_VARS)}")
print(f"  Escalares:       {len(SCALAR_VARS)}")
print(f"  OHE nominales:   {len(OHE_VARS)} → columnas nuevas: {X_ohe.shape[1]} (incluye LANGN)")
print(f"  Otras nominales: {len(OTHER_CAT)} (imputadas por moda, sin OHE)")
print("X_full listo:", X_full.shape)


Resumen:
  Binarias:        13
  Ordinales:       39
  Escalares:       2
  OHE nominales:   4 ‚Üí columnas nuevas: 14 (incluye LANGN)
  Otras nominales: 0 (imputadas por moda, sin OHE)
X_full listo: (6968, 68)


In [37]:
# --- CELDA 7.9: GUARDAR DATASET IMPUTADO Y CODIFICADO ---
from pathlib import Path
import pandas as pd

# Reuse the DATA_PROC folder resolved in the first cell. This cell used to redefine
# find_root with a looser rule (any folder named "data" instead of "data/raw") and
# recompute ROOT, so the output could land under a different directory than the one
# the raw data was read from.
DATA_PROC.mkdir(parents=True, exist_ok=True)

# Join X_full (already imputed and encoded) with the targets, side by side.
per_final = pd.concat([X_full, y_aligned], axis=1)

OUT_FILE = DATA_PROC / "PERU_PISA2022_FINAL_IMPUTADO.xlsx"
per_final.to_excel(OUT_FILE, index=False)

# The check-mark emoji was stored mis-decoded; it is restored here.
print(f"✅ Archivo completo guardado en: {OUT_FILE.name}")
print("Filas:", per_final.shape[0], "| Columnas:", per_final.shape[1])


‚úÖ Archivo completo guardado en: PERU_PISA2022_FINAL_IMPUTADO.xlsx
Filas: 6968 | Columnas: 71


In [38]:
# --- CELDA 8: GUARDADOS ROBUSTOS (X por partes, y y diccionario de tipos) ---
from pathlib import Path
import math
import pandas as pd
import numpy as np

# Reuse ROOT when it already exists, and look it up only when it does not.
# The previous form, globals().get("ROOT", find_root()), evaluated find_root() on every
# run, so it could raise FileNotFoundError even though ROOT was available. It also
# redefined find_root with a looser rule than the one used at the top of the notebook.
if "ROOT" not in globals():
    ROOT = find_root()
DATA_PROC = ROOT / "data" / "processed"
DATA_PROC.mkdir(parents=True, exist_ok=True)

# --- Minimal sanity checks: both inputs must already exist in the session.
for required, hint in (
    ('X_full', "Falta X_full (ejecuta la Celda 7)."),
    ('y_aligned', "Falta y_aligned (sale de la Celda 6)."),
):
    assert required in globals(), hint

# 1) Prepare a copy of X for saving, leaving the original untouched.
X_save = X_full.copy()

# Force only the binary columns to int (when the list and the columns exist).
if 'BIN_VARS' in globals():
    for col in [b for b in BIN_VARS if b in X_save.columns]:
        as_number = pd.to_numeric(X_save[col], errors="coerce")
        X_save[col] = as_number.fillna(0).astype(int)

# 2) Save X in parts, split by number of columns.
# NOTE(review): 16000 presumably keeps each file under the .xlsx column limit — confirm.
MAXC = 16000
ncols = X_save.shape[1]
# math.ceil already returns an int, so no extra cast is needed.
parts = math.ceil(ncols / MAXC) if ncols else 0

for i in range(parts):
    lo, hi = i * MAXC, min((i + 1) * MAXC, ncols)
    out_x = DATA_PROC / f"peru_X_full__part{i+1}.xlsx"
    # Write a column slice; X_save itself is not modified.
    X_save.iloc[:, lo:hi].to_excel(out_x, sheet_name=f"X_{lo+1}-{hi}", index=False)
    print(f"✅ Guardé X columnas {lo+1}-{hi} en: {out_x.name}")

if parts == 0:
    print("⚠️ X_full no tiene columnas para guardar.")

# 3) Save the aligned targets exactly as they were built in the cleaning cell.
out_y = DATA_PROC / "y_3niveles.xlsx"
y_aligned.to_excel(out_y, index=False)
# The check-mark and the accent were stored mis-decoded; they are restored here.
print(f"✅ Guardé y en: {out_y.name}  (shape={y_aligned.shape})")

# 4) Save the variable-type dictionary, based on the lists built in the encoding cell.
#    A list that does not exist in the session is treated as empty.
BIN_VARS       = globals().get("BIN_VARS", [])
ORD_VARS       = globals().get("ORD_VARS", [])
SCALAR_VARS    = globals().get("SCALAR_VARS", [])
OHE_VARS       = globals().get("OHE_VARS", [])
HIGH_CARD_NOM  = globals().get("HIGH_CARD_NOM", [])
OTHER_CAT      = globals().get("OTHER_CAT", [])

# One (column, type) row per source variable, in the same group order as before.
tagged_groups = [
    (BIN_VARS, "bin"),
    (ORD_VARS, "ordinal"),
    (SCALAR_VARS, "scalar"),
    (OHE_VARS, "nominal_ohe_source"),
    (HIGH_CARD_NOM, "nominal_highcard_source"),
    (OTHER_CAT, "nominal_mode_only"),
]
rows = [(c, tipo) for names, tipo in tagged_groups for c in names]

# If a high-cardinality variable was frequency-encoded, document the resulting column too.
rows += [(c + "_freq", "freq_encoding") for c in HIGH_CARD_NOM if c + "_freq" in X_save.columns]

dicc = pd.DataFrame(rows, columns=["col", "tipo"]).drop_duplicates().sort_values("col")

out_dic = DATA_PROC / "diccionario_vars.xlsx"
dicc.to_excel(out_dic, index=False)
# The check-mark and the accent were stored mis-decoded; they are restored here.
print(f"✅ Guardé diccionario en: {out_dic.name}")

# 5) Summary of what is in memory and what was written.
# The arrow after "Tipos" was stored mis-decoded; it is restored here.
print("\nResumen final:")
print("X_full (memoria):", X_full.shape)
print(f"Partes guardadas: {parts}")
print("y_aligned:", y_aligned.shape)
print("Tipos →",
      f"bin={len(BIN_VARS)} | ordinal={len(ORD_VARS)} | scalar={len(SCALAR_VARS)} |",
      f"ohe_src={len(OHE_VARS)} | highcard_src={len(HIGH_CARD_NOM)} | other_nominal={len(OTHER_CAT)}")
print("Directorio de salida:", DATA_PROC.resolve())


‚úÖ Guard√© X columnas 1-68 en: peru_X_full__part1.xlsx
‚úÖ Guard√© y en: y_3niveles.xlsx  (shape=(6968, 3))
‚úÖ Guard√© diccionario en: diccionario_vars.xlsx

Resumen final:
X_full (memoria): (6968, 68)
Partes guardadas: 1
y_aligned: (6968, 3)
Tipos ‚Üí bin=13 | ordinal=39 | scalar=2 | ohe_src=4 | highcard_src=0 | other_nominal=0
Directorio de salida: C:\Users\Gerson\Downloads\PISA-ML\data\processed
