# Filtro POLE/MMR y p53

Este notebook carga un **CSV** (o **Excel**) y conserva solo las filas que cumplen **todas** estas condiciones:

- `mut_pole == 2` (elimina NA y otros valores)
- `msh2 == 0`, `msh6 == 0`, `pms2 == 0`, `mlh1 == 0`
- (`p53_molecular == 2`) **o** (`p53_ihq == 1`) *(si cumple uno, ya es válido)*


In [1]:
# Si te faltan dependencias (por ejemplo para Excel):
# %pip install pandas openpyxl

from pathlib import Path

import pandas as pd


In [5]:
# Input file. Point this at your own CSV, or keep the sample Excel workbook from the repo.
DATA_PATH = Path("Dades/IQ_Cancer_Endometrio_merged_NMSP.xlsx")

# Worksheet to read; only used when the input is an Excel file.
SHEET_NAME = "IQ_Cancer_Endometrio_merged_NMS"

# The lower-cased extension decides which pandas reader is used.
suffix = DATA_PATH.suffix.lower()
if suffix not in {".csv", ".xlsx", ".xls"}:
    # Anything other than CSV/Excel is rejected up front.
    raise ValueError(f"Formato no soportado: {suffix}")

df = (
    pd.read_csv(DATA_PATH)
    if suffix == ".csv"
    else pd.read_excel(DATA_PATH, sheet_name=SHEET_NAME)
)

# Show (rows, columns) of the loaded table as the cell output.
df.shape


(163, 189)

In [6]:
# Columns the row filter relies on.
REQUIRED_COLS = [
    "mut_pole", "msh2", "msh6", "pms2", "mlh1",
    "p53_molecular", "p53_ihq",
]

# Fail early, listing every required column that the dataset lacks.
missing = [col for col in REQUIRED_COLS if col not in df.columns]
if len(missing) > 0:
    raise KeyError(f"Faltan columnas en el dataset: {missing}")

# Build a new frame in which the required columns are coerced to numeric;
# entries that cannot be parsed as numbers become NaN.
df = df.assign(
    **{col: pd.to_numeric(df[col], errors="coerce") for col in REQUIRED_COLS}
)

# Preview the converted columns.
df[REQUIRED_COLS].head()


Unnamed: 0,mut_pole,msh2,msh6,pms2,mlh1,p53_molecular,p53_ihq
0,2.0,0.0,0.0,0.0,0.0,2.0,1.0
1,3.0,2.0,2.0,2.0,2.0,3.0,3.0
2,2.0,0.0,0.0,0.0,0.0,3.0,1.0
3,3.0,0.0,0.0,0.0,0.0,2.0,1.0
4,2.0,0.0,0.0,0.0,0.0,3.0,1.0


In [7]:
# Each condition is a boolean Series aligned with df; NaN never compares
# equal, so rows with missing values in a tested column fail that test.

# mut_pole must equal 2.
pole_ok = df["mut_pole"].eq(2)

# msh2, msh6, pms2 and mlh1 must all equal 0.
mmr_ok = df[["msh2", "msh6", "pms2", "mlh1"]].eq(0).all(axis=1)

# Either p53 criterion is enough: p53_molecular == 2 or p53_ihq == 1.
p53_ok = df["p53_molecular"].eq(2) | df["p53_ihq"].eq(1)

mask = pole_ok & mmr_ok & p53_ok

# Keep the matching rows as an independent copy (original index labels kept).
df_filtrado = df[mask].copy()
df_filtrado.shape


(100, 189)

In [8]:
# Preview the first 10 kept rows, restricted to the columns used by the filter.
df_filtrado.loc[:, REQUIRED_COLS].head(10)


Unnamed: 0,mut_pole,msh2,msh6,pms2,mlh1,p53_molecular,p53_ihq
0,2.0,0.0,0.0,0.0,0.0,2.0,1.0
2,2.0,0.0,0.0,0.0,0.0,3.0,1.0
4,2.0,0.0,0.0,0.0,0.0,3.0,1.0
6,2.0,0.0,0.0,0.0,0.0,2.0,1.0
7,2.0,0.0,0.0,0.0,0.0,3.0,1.0
8,2.0,0.0,0.0,0.0,0.0,2.0,1.0
9,2.0,0.0,0.0,0.0,0.0,3.0,1.0
10,2.0,0.0,0.0,0.0,0.0,3.0,1.0
12,2.0,0.0,0.0,0.0,0.0,3.0,1.0
14,2.0,0.0,0.0,0.0,0.0,2.0,1.0


In [9]:
# Save the filtered rows as CSV (the DataFrame index is not written).
OUT_CSV = Path("Dades/filtrado.csv")

# to_csv does not create missing folders, so make sure the destination
# directory exists before writing (no-op when it is already there).
OUT_CSV.parent.mkdir(parents=True, exist_ok=True)

df_filtrado.to_csv(OUT_CSV, index=False)

# Display the output path as the cell result.
OUT_CSV


WindowsPath('Dades/filtrado.csv')