In [11]:
# Limpieza y estandarizaci√≥n de datasets crudos
# ===============================

from kedro.framework.session import KedroSession
import pandas as pd
import numpy as np
import os

# 1) Inicializar Kedro y cat√°logo
session = KedroSession.create("..")
context = session.load_context()
catalog = context.catalog
print("Kedro listo ‚úÖ")


Kedro listo ‚úÖ


In [12]:
# 2) Cargar crudos desde el cat√°logo
#    (usa las claves que definimos en conf/base/catalog.yml)
df_pc_countries = catalog.load("pib_per_capita_countries")
df_pc_orgs      = catalog.load("pib_per_capita_organizations")
df_life_raw     = catalog.load("life_expectancy")

print(
    f"pib_per_capita_countries: {df_pc_countries.shape} | "
    f"pib_per_capita_organizations: {df_pc_orgs.shape} | "
    f"life_expectancy: {df_life_raw.shape}"
)



pib_per_capita_countries: (13760, 10) | pib_per_capita_organizations: (3264, 9) | life_expectancy: (21565, 4)


In [13]:
# 3) Normalizar PIB per c√°pita (countries)
def clean_pib_per_capita_countries(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    # Renombrar a est√°ndar
    df = df.rename(columns={
        "country_code": "country_code",
        "country_name": "country_name",
        "region": "region",
        "sub_region": "sub_region",
        "intermediate_region": "intermediate_region",
        "year": "year",
        "gdp_per_capita": "gdp_per_capita",
        "gdp_variation": "gdp_variation",
    })

    # Tipos
    df["year"] = pd.to_numeric(df["year"], errors="coerce").astype("Int64")
    for col in ["gdp_per_capita", "gdp_variation"]:
        df[col] = pd.to_numeric(df[col], errors="coerce")

    # Normalizaci√≥n de pa√≠s
    df["country_name"] = df["country_name"].astype(str).str.strip()
    df["country_norm"] = df["country_name"].str.upper()

    # Filtrar filas v√°lidas
    df = df.dropna(subset=["year", "gdp_per_capita", "country_name"])

    # Columnas ordenadas
    keep_cols = [
        "country_code", "country_name", "country_norm",
        "region", "sub_region", "intermediate_region",
        "year", "gdp_per_capita", "gdp_variation"
    ]
    df = df[[c for c in keep_cols if c in df.columns]]

    return df

In [14]:
# 4) Normalizar PIB per c√°pita (organizations / agregados)
def clean_pib_per_capita_orgs(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    df = df.rename(columns={
        "country_name": "country_name",
        "country_code": "country_code",
        "region": "region",
        "income_group": "income_group",
        "year": "year",
        "gdp_per_capita": "gdp_per_capita",
        "gdp_variation": "gdp_variation",
    })

    df["year"] = pd.to_numeric(df["year"], errors="coerce").astype("Int64")
    for col in ["gdp_per_capita", "gdp_variation"]:
        df[col] = pd.to_numeric(df[col], errors="coerce")

    df["country_name"] = df["country_name"].astype(str).str.strip()
    df["country_norm"] = df["country_name"].str.upper()

    df = df.dropna(subset=["year", "gdp_per_capita", "country_name"])

    # Alinear columnas con countries; sub_region/intermediate pueden faltar
    for missing in ["sub_region", "intermediate_region"]:
        if missing not in df.columns:
            df[missing] = np.nan

    keep_cols = [
        "country_code", "country_name", "country_norm",
        "region", "sub_region", "intermediate_region",
        "year", "gdp_per_capita", "gdp_variation", "income_group"
    ]
    df = df[[c for c in keep_cols if c in df.columns]]

    return df


gdp_countries_clean = clean_pib_per_capita_countries(df_pc_countries)
gdp_orgs_clean      = clean_pib_per_capita_orgs(df_pc_orgs)

# Concatenar (pa√≠ses + agregados). Si no quieres agregados, comenta la l√≠nea del concat.
gdp_all = pd.concat([gdp_countries_clean, gdp_orgs_clean], ignore_index=True).drop_duplicates()

print("GDP per c√°pita limpio (all) ->", gdp_all.shape)
display(gdp_all.head(3))


GDP per c√°pita limpio (all) -> (17024, 10)


Unnamed: 0,country_code,country_name,country_norm,region,sub_region,intermediate_region,year,gdp_per_capita,gdp_variation,income_group
0,AFG,AFGANISTAN,AFGANISTAN,ASIA MERIDIONAL,SOUTHERN ASIA,,1960,0.0,0.0,
1,AFG,AFGANISTAN,AFGANISTAN,ASIA MERIDIONAL,SOUTHERN ASIA,,1961,0.0,0.0,
2,AFG,AFGANISTAN,AFGANISTAN,ASIA MERIDIONAL,SOUTHERN ASIA,,1962,0.0,0.0,


In [15]:
# 5) Normalizar Esperanza de Vida
def clean_life(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    df = df.rename(columns={
        "Entity": "country_life",
        "Code": "code",
        "Year": "year",
        "Period life expectancy at birth": "life_expectancy"
    })

    df["year"] = pd.to_numeric(df["year"], errors="coerce").astype("Int64")
    df["life_expectancy"] = pd.to_numeric(df["life_expectancy"], errors="coerce")

    df["country_life"] = df["country_life"].astype(str).str.strip()
    df["country_norm"] = df["country_life"].str.upper()

    df = df.dropna(subset=["year", "life_expectancy", "country_life"])

    keep_cols = ["country_life", "code", "country_norm", "year", "life_expectancy"]
    df = df[keep_cols]

    return df


life_clean = clean_life(df_life_raw)
print("Life expectancy limpio ->", life_clean.shape)
display(life_clean.head(3))



Life expectancy limpio -> (21565, 5)


Unnamed: 0,country_life,code,country_norm,year,life_expectancy
0,Afghanistan,AFG,AFGHANISTAN,1950,28.1563
1,Afghanistan,AFG,AFGHANISTAN,1951,28.5836
2,Afghanistan,AFG,AFGHANISTAN,1952,29.0138


In [19]:
# ==============================================
# 7Ô∏è‚É£ Crear merged_data (PIB per c√°pita + Esperanza de vida)
# ==============================================

import pandas as pd
import os

# 1Ô∏è‚É£ Cargar los datasets limpios de la fase anterior
gdp_clean = pd.read_csv("../data/02_intermediate/gdp_cleaned.csv")
life_clean = pd.read_csv("../data/02_intermediate/life_cleaned.csv")

print("üìä Columnas PIB limpio:", list(gdp_clean.columns))
print("üìä Columnas Vida limpio:", list(life_clean.columns))

# 2Ô∏è‚É£ Normalizar nombres para merge
# --- Para GDP (PIB) ---
if "country_name" in gdp_clean.columns:
    gdp_clean["country_norm"] = gdp_clean["country_name"].astype(str).str.strip().str.upper()
elif "country" in gdp_clean.columns:
    gdp_clean["country_norm"] = gdp_clean["country"].astype(str).str.strip().str.upper()
else:
    raise KeyError("No se encontr√≥ una columna de pa√≠s en el dataset de PIB.")

# --- Para Esperanza de vida ---
possible_cols = ["country", "country_name", "Entity", "country_life"]
col_country = next((c for c in possible_cols if c in life_clean.columns), None)
if col_country is None:
    raise KeyError("No se encontr√≥ una columna de pa√≠s en el dataset de Esperanza de Vida.")

life_clean["country_norm"] = life_clean[col_country].astype(str).str.strip().str.upper()

# 3Ô∏è‚É£ Hacer merge por pa√≠s y a√±o
merged = pd.merge(
    gdp_clean,
    life_clean,
    on=["country_norm", "year"],
    how="inner"
)

# 4Ô∏è‚É£ Crear columna est√°ndar de target
if "life_expectancy" in merged.columns:
    merged["period_life_expectancy_at_birth"] = merged["life_expectancy"]
elif "Period life expectancy at birth" in merged.columns:
    merged["period_life_expectancy_at_birth"] = merged["Period life expectancy at birth"]
else:
    raise KeyError("No se encontr√≥ la columna de esperanza de vida en el merge.")

# 5Ô∏è‚É£ Confirmar resultado
print("‚úÖ merged_data construido correctamente. Shape:", merged.shape)
display(merged.head())

# 6Ô∏è‚É£ Guardar en 03_primary
out_dir = "../data/03_primary"
os.makedirs(out_dir, exist_ok=True)
out_path = f"{out_dir}/merged_data.csv"
merged.to_csv(out_path, index=False)

print(f"üíæ Guardado en: {out_path}")


üìä Columnas PIB limpio: ['country_code', 'country_name', 'country_norm', 'region', 'sub_region', 'intermediate_region', 'year', 'gdp_per_capita', 'gdp_variation', 'income_group']
üìä Columnas Vida limpio: ['country_life', 'code', 'country_norm', 'year', 'life_expectancy']
‚úÖ merged_data construido correctamente. Shape: (6976, 14)


Unnamed: 0,country_code,country_name,country_norm,region,sub_region,intermediate_region,year,gdp_per_capita,gdp_variation,income_group,country_life,code,life_expectancy,period_life_expectancy_at_birth
0,ALB,ALBANIA,ALBANIA,EUROPA Y ASIA CENTRAL (EXCLUIDO ALTOS INGRESOS),SOUTHERN EUROPE,,1960,0.0,0.0,,Albania,ALB,56.4132,56.4132
1,ALB,ALBANIA,ALBANIA,EUROPA Y ASIA CENTRAL (EXCLUIDO ALTOS INGRESOS),SOUTHERN EUROPE,,1961,0.0,0.0,,Albania,ALB,57.488,57.488
2,ALB,ALBANIA,ALBANIA,EUROPA Y ASIA CENTRAL (EXCLUIDO ALTOS INGRESOS),SOUTHERN EUROPE,,1962,0.0,0.0,,Albania,ALB,58.4944,58.4944
3,ALB,ALBANIA,ALBANIA,EUROPA Y ASIA CENTRAL (EXCLUIDO ALTOS INGRESOS),SOUTHERN EUROPE,,1963,0.0,0.0,,Albania,ALB,59.4792,59.4792
4,ALB,ALBANIA,ALBANIA,EUROPA Y ASIA CENTRAL (EXCLUIDO ALTOS INGRESOS),SOUTHERN EUROPE,,1964,0.0,0.0,,Albania,ALB,60.4035,60.4035


üíæ Guardado en: ../data/03_primary/merged_data.csv
