# Deduplicar maestro_picks

Carga `df_maestro/maestro_picks.pkl`, muestra duplicados y permite guardarlo limpio con backup.

In [1]:
import pandas as pd
from pathlib import Path

def resolve_maestro() -> Path | None:
    cwd = Path.cwd()
    for p in [cwd] + list(cwd.parents):
        cand = p / "/home/pablo/Documentos/footystats/df_maestro/maestro_picks.pkl"
        if cand.exists():
            return cand
    return None

MAESTRO_PATH = resolve_maestro()
if MAESTRO_PATH is None:
    raise FileNotFoundError(f"No se encontró maestro_picks.pkl desde {Path.cwd()} hacia arriba")

df = pd.read_pickle(MAESTRO_PATH)
df.shape, MAESTRO_PATH

((390, 47),
 PosixPath('/home/pablo/Documentos/footystats/df_maestro/maestro_picks.pkl'))

In [2]:
# Detectar duplicados con estas claves (ajusta si lo necesitas)
dedup_keys = ["Hora", "Match_id", "Mercado"]
dedup_keys = [k for k in dedup_keys if k in df.columns]

df_for_dups = df.copy()
dedup_subset = dedup_keys.copy()

if "Hora" in dedup_keys:
    # Normaliza Hora para comparar duplicados aunque tenga formatos distintos
    df_for_dups["_Hora_norm"] = pd.to_datetime(df_for_dups["Hora"], errors="coerce")
    dedup_subset = ["_Hora_norm" if k == "Hora" else k for k in dedup_subset]

if "Match_id" in dedup_keys:
    # Normaliza Match_id (string vs int)
    df_for_dups["_Match_id_norm"] = df_for_dups["Match_id"].astype("string").str.strip()
    dedup_subset = ["_Match_id_norm" if k == "Match_id" else k for k in dedup_subset]

if "Mercado" in dedup_keys:
    # Normaliza Mercado para evitar duplicados por espacios/case
    df_for_dups["_Mercado_norm"] = df_for_dups["Mercado"].astype("string").str.strip().str.upper()
    dedup_subset = ["_Mercado_norm" if k == "Mercado" else k for k in dedup_subset]

dup_mask = (
    df_for_dups.duplicated(subset=dedup_subset, keep=False)
    if dedup_subset
    else pd.Series(False, index=df.index)
)
dups = df.loc[dup_mask].sort_values(dedup_keys) if dedup_subset else pd.DataFrame()
print(f"Claves usadas: {dedup_keys}")
if "Hora" in dedup_keys:
    print("Nota: Hora se normaliza a timestamp para detectar duplicados.")
if "Match_id" in dedup_keys:
    print("Nota: Match_id se normaliza a string para evitar int vs str.")
if "Mercado" in dedup_keys:
    print("Nota: Mercado se normaliza con strip/upper para evitar espacios/case.")
print(f"Duplicados detectados: {dup_mask.sum()}")
print()
dups.head(50)


Claves usadas: ['Hora', 'Match_id', 'Mercado']
Nota: Hora se normaliza a timestamp para detectar duplicados.
Nota: Match_id se normaliza a string para evitar int vs str.
Nota: Mercado se normaliza con strip/upper para evitar espacios/case.
Duplicados detectados: 14



Unnamed: 0,Corners_total,Cuota_Justa,Estado,Estado_EXE,Fecha_ejecucion,GPT_valido,Goles_OTeam,Hora,ID_partido,Liga,...,awayGoals_today,awayID,away_id,competition_id,homeGoals_today,homeID,home_id,market_group,season_id,season_label
369,11.0,,VERDE,,2026-01-06,True,,2026-01-06 14:00,,England Premier League,...,,211.0,211,15050,,153.0,153,OVER,15050,
25,,,PENDIENTE,,2026-01-06,True,,2026-01-06 14:00,,England Premier League,...,,211.0,211,15050,,153.0,153,OVER,15050,
386,,,LIVE,LIVE,2026-01-07,True,,2026-01-07 11:30,,Italy Serie A,...,,473.0,473,15068,,74.0,74,GOLHT,15068,
389,,2.2,LIVE,PENDIENTE,2026-01-07,True,,2026-01-07 11:30,8238743.0,Italy Serie A,...,,,473,15068,,,74,TEAM_OVER,15068,2025/2026
379,,,LIVE,LIVE,2026-01-07,True,,2026-01-07 11:30,,Italy Serie A,...,,473.0,473,15068,,74.0,74,GOLHT,15068,
382,,2.2,PENDIENTE,PENDIENTE,2026-01-07,True,,2026-01-07 11:30,8238743.0,Italy Serie A,...,,,473,15068,,,74,TEAM_OVER,15068,2025/2026
383,,,PENDIENTE,PENDIENTE,2026-01-07,True,,2026-01-07 13:30,,England Premier League,...,,223.0,223,15050,,144.0,144,BTTS,15050,
384,,,PENDIENTE,PENDIENTE,2026-01-07,True,,2026-01-07 13:30,,England Premier League,...,,209.0,209,15050,,93.0,93,BTTS,15050,
388,,1.75,PENDIENTE,PENDIENTE,2026-01-07,True,,2026-01-07 13:30,8223561.0,England Premier League,...,,,209,15050,,,93,TEAM_OVER,15050,2025/2026
385,,,PENDIENTE,PENDIENTE,2026-01-07,True,,2026-01-07 13:30,,England Premier League,...,,152.0,152,15050,,162.0,162,BTTS,15050,


In [3]:
# Guardar sin duplicados (ajusta el switch SAVE_CHANGES)
SAVE_CHANGES = True  # pon True para escribir cambios
if SAVE_CHANGES and dedup_subset:
    backup = MAESTRO_PATH.with_suffix(MAESTRO_PATH.suffix + ".bak")
    MAESTRO_PATH.rename(backup)
    dup_keep_last = df_for_dups.duplicated(subset=dedup_subset, keep="last")
    df_clean = df.loc[~dup_keep_last]
    df_clean.to_pickle(MAESTRO_PATH)
    print(f"✅ Guardado sin duplicados: {len(df)} -> {len(df_clean)} filas. Backup: {backup}")
elif SAVE_CHANGES:
    print("No hay claves de dedupe en el maestro; no se guardó nada.")
else:
    print("Guardar desactivado (cambia SAVE_CHANGES=True para escribir).")


✅ Guardado sin duplicados: 390 -> 383 filas. Backup: /home/pablo/Documentos/footystats/df_maestro/maestro_picks.pkl.bak
