In [4]:
pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.4.0-py3-none-any.whl (235 kB)
Installing collected packages: unidecode
Successfully installed unidecode-1.4.0
Note: you may need to restart the kernel to use updated packages.




In [5]:
import pandas as pd
import requests
import unidecode

# ================================================================
# 1️⃣ CARGAR DATOS DESDE LA API (igual que en cargar_datos.py)
# ================================================================
def load_data_from_api(limit: int = 50000) -> pd.DataFrame:
    api_url = f"https://www.datos.gov.co/resource/nudc-7mev.json?$limit={limit}"
    response = requests.get(api_url)
    response.raise_for_status()
    data = response.json()
    df = pd.DataFrame(data)
    return df

df_raw = load_data_from_api(5000)  # prueba con 5000 para no saturar
print(f"✅ Datos cargados: {df_raw.shape}")
display(df_raw.head())

# ================================================================
# 2️⃣ FUNCIONES DE LIMPIEZA
# ================================================================
def normalizar_texto(texto: str) -> str:
    if pd.isna(texto):
        return texto
    return unidecode.unidecode(texto.strip().lower())

def corregir_departamentos(df: pd.DataFrame) -> pd.DataFrame:
    reemplazos = {
        "bogota d.c": "bogota",
        "bogotá d.c.": "bogota",
        "bogota": "bogota",
        "valle del cauca": "valle",
        "san andres, providencia y santa catalina": "san andres",
        "archipielago de san andres": "san andres"
    }
    df["departamento"] = df["departamento"].apply(lambda x: reemplazos.get(x, x))
    return df

def limpiar_metricas(df: pd.DataFrame) -> pd.DataFrame:
    for col in ["tasa_matriculaci_n_5_16", "cobertura_neta", "cobertura_bruta"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")
            df.loc[df[col] < 0, col] = None
            df.loc[df[col] > 100, col] = 100
    return df

# ================================================================
# 3️⃣ LIMPIEZA Y NORMALIZACIÓN
# ================================================================
columnas_relevantes = [
    'a_o', 'departamento', 'municipio', 'c_digo_departamento',
    'poblaci_n_5_16', 'tasa_matriculaci_n_5_16',
    'cobertura_neta', 'cobertura_bruta'
]
df = df_raw[columnas_relevantes].dropna(how="all")

# Normalizar texto
for col in ["departamento", "municipio"]:
    df[col] = df[col].astype(str).apply(normalizar_texto)

df = corregir_departamentos(df)
df = limpiar_metricas(df)
df = df.drop_duplicates()

print("✅ Datos limpios:", df.shape)
display(df.head(10))

# ================================================================
# 4️⃣ DIMENSIONES Y TABLA DE HECHOS
# ================================================================
def crear_dimension(df, cols, nombre, sort_col=None):
    dim = df[cols].drop_duplicates()
    if sort_col:
        dim = dim.sort_values(by=sort_col)
    dim = dim.reset_index(drop=True)
    dim[f"id_{nombre}"] = dim.index + 1
    return dim[[f"id_{nombre}"] + cols]

dim_tiempo = crear_dimension(df, ['a_o'], 'tiempo', sort_col='a_o')
dim_geo = crear_dimension(df, ['c_digo_departamento', 'departamento', 'municipio'], 'geo', sort_col='c_digo_departamento')

print("Dim Tiempo:", dim_tiempo.shape)
display(dim_tiempo.head())

print("Dim Geografía:", dim_geo.shape)
display(dim_geo.head())

df_fact = df.merge(dim_tiempo, on='a_o') \
            .merge(dim_geo, on=['departamento', 'municipio', 'c_digo_departamento'], how='inner')

print("✅ Tabla de hechos:", df_fact.shape)
display(df_fact.head())


✅ Datos cargados: (5000, 39)


Unnamed: 0,a_o,c_digo_municipio,municipio,c_digo_departamento,departamento,c_digo_etc,etc,poblaci_n_5_16,tasa_matriculaci_n_5_16,cobertura_neta,...,reprobaci_n,reprobaci_n_transici_n,reprobaci_n_primaria,reprobaci_n_secundaria,reprobaci_n_media,repitencia,repitencia_transici_n,repitencia_primaria,repitencia_secundaria,repitencia_media
0,2023,11001,"Bogotá, D.C.",11,"Bogotá, D.C.",3766,"Bogotá, D.C. (ETC)",1141573,92.9,92.4,...,7.78,0.37,5.44,12.57,6.5,7.55,1.66,7.11,10.98,3.2
1,2023,19532,Patía,19,Cauca,3777,Cauca (ETC),7165,80.99,80.99,...,6.78,0.41,5.68,10.7,4.55,9.07,3.7,9.84,11.86,2.78
2,2023,47170,Chibolo,47,Magdalena,3794,Magdalena (ETC),5773,84.65,84.6,...,0.02,0.0,0.0,0.0,0.16,9.67,18.86,9.93,9.8,1.59
3,2023,68235,El Carmen de Chucurí,68,Santander,3808,Santander (ETC),4711,63.09,63.04,...,4.93,0.87,2.15,10.22,3.77,6.89,2.16,6.8,9.72,2.32
4,2023,63302,Génova,63,Quindio,3803,Quindio (ETC),1194,88.44,88.44,...,9.98,1.12,4.95,19.59,5.13,8.6,2.25,8.56,12.21,3.21


✅ Datos limpios: (5000, 8)


Unnamed: 0,a_o,departamento,municipio,c_digo_departamento,poblaci_n_5_16,tasa_matriculaci_n_5_16,cobertura_neta,cobertura_bruta
0,2023,"bogota, d.c.","bogota, d.c.",11,1141573,92.9,92.4,100.0
1,2023,cauca,patia,19,7165,80.99,80.99,93.2
2,2023,magdalena,chibolo,47,5773,84.65,84.6,100.0
3,2023,santander,el carmen de chucuri,68,4711,63.09,63.04,70.09
4,2023,quindio,genova,63,1194,88.44,88.44,99.58
5,2023,huila,la argentina,41,3302,84.65,84.65,95.46
6,2023,antioquia,peque,5,1900,75.32,75.32,83.11
7,2023,magdalena,cerro san antonio,47,2365,77.08,77.08,91.25
8,2023,cauca,almaguer,19,3445,60.58,60.58,75.85
9,2023,santander,guapota,68,458,75.98,75.98,84.72


Dim Tiempo: (5, 2)


Unnamed: 0,id_tiempo,a_o
0,1,2019
1,2,2020
2,3,2021
3,4,2022
4,5,2023


Dim Geografía: (1274, 4)


Unnamed: 0,id_geo,c_digo_departamento,departamento,municipio
0,1,0,nacional,nacional
1,2,5,antioquia,amalfi
2,3,5,antioquia,granada
3,4,5,antioquia,marinilla
4,5,5,antioquia,san pedro de uraba


✅ Tabla de hechos: (5000, 10)


Unnamed: 0,a_o,departamento,municipio,c_digo_departamento,poblaci_n_5_16,tasa_matriculaci_n_5_16,cobertura_neta,cobertura_bruta,id_tiempo,id_geo
0,2023,"bogota, d.c.","bogota, d.c.",11,1141573,92.9,92.4,100.0,5,151
1,2023,cauca,patia,19,7165,80.99,80.99,93.2,5,378
2,2023,magdalena,chibolo,47,5773,84.65,84.6,100.0,5,682
3,2023,santander,el carmen de chucuri,68,4711,63.09,63.04,70.09,5,1048
4,2023,quindio,genova,63,1194,88.44,88.44,99.58,5,948
