# Análisis e inserción de los datos económicos por país

### 1. Preparación del entorno y conexión


In [1]:
import pandas as pd
import numpy as np
import pymysql
from pymysql.constants import CLIENT
from dotenv import load_dotenv
import os

load_dotenv()
DB_HOST = os.getenv("DB_HOST")
DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_NAME = os.getenv("DB_NAME")

# Conexión a MySQL
conexion = pymysql.connect(
    host=DB_HOST,
    user=DB_USER,
    password=DB_PASSWORD,
    database=DB_NAME,
    client_flag=CLIENT.MULTI_STATEMENTS
)
cursor = conexion.cursor()


### 2. Carga del Excel y exploración


In [2]:
xls = pd.ExcelFile("../../data/P_Data_Extract_From_World_Development_Indicators.xlsx")
df_gdp = pd.read_excel("../../data/API_NY.GDP.MKTP.CD_DS2_en_excel_v2_85096.xls",skiprows=3)

# Cargamos la hoja 'Data'
df_raw = xls.parse("Data")
df_raw.head()


Unnamed: 0,Country Name,Country Code,Series Name,Series Code,1960 [YR1960],1961 [YR1961],1962 [YR1962],1963 [YR1963],1964 [YR1964],1965 [YR1965],...,2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020],2021 [YR2021],2022 [YR2022],2023 [YR2023]
0,Afghanistan,AFG,Carbon intensity of GDP (kg CO2e per 2021 PPP ...,EN.GHG.CO2.RT.GDP.PP.KD,..,..,..,..,..,..,...,0.083662,0.086847,0.077308,0.081098,0.076226,0.067211,0.050538,0.06575,0.070581,..
1,Afghanistan,AFG,Carbon intensity of GDP (kg CO2e per constant ...,EN.GHG.CO2.RT.GDP.KD,..,..,..,..,..,..,...,0.438997,0.455707,0.405653,0.425544,0.399975,0.352672,0.265188,0.345006,0.370357,..
2,Afghanistan,AFG,"Electricity production from oil, gas and coal ...",EG.ELC.FOSL.ZS,..,..,..,..,..,..,...,..,..,..,..,..,..,..,..,..,..
3,Afghanistan,AFG,"Electricity production from renewable sources,...",EG.ELC.RNWX.ZS,..,..,..,..,..,..,...,..,..,..,..,..,..,..,..,..,..
4,Afghanistan,AFG,"Electricity production from renewable sources,...",EG.ELC.RNWX.KH,..,..,..,..,..,..,...,..,..,..,..,..,..,..,..,..,..


### 3. Selección de los 6 indicadores clave


In [3]:
# Lista de Series Code que nos interesan
indic_codes = [
    "EN.GHG.CO2.RT.GDP.PP.KD",
    "EN.GHG.CO2.RT.GDP.KD",
    "EG.ELC.FOSL.ZS",
    "EG.ELC.RNWX.ZS",
    "EG.ELC.RNWX.KH",
    "EG.ELC.RNEW.ZS"
]

df_ind = df_raw[df_raw["Series Code"].isin(indic_codes)].copy()
df_ind.shape, df_ind["Series Code"].unique()

df_gdp = df_gdp.rename(columns={
    "Country Name": "Country",
    "Country Code": "CountryCode",
    "Indicator Name": "Indicator",
    "Indicator Code": "IndicatorCode"
})
year_cols = [c for c in df_gdp.columns if c.isdigit()]
print("Años detectados:", year_cols[:5], "...", year_cols[-5:])



Años detectados: ['1960', '1961', '1962', '1963', '1964'] ... ['2020', '2021', '2022', '2023', '2024']


### 4. Transformación a formato largo (un año por fila)


In [4]:
import re

# 4.1) Detectar los year_cols de df_ind (vienen con sufijo “[YR…]”)
year_cols_ind = [c for c in df_ind.columns if re.match(r"^\d{4} \[YR\d{4}\]$", c)]
print("Años en indicadores (formato df_ind):", year_cols_ind[:5], "...", year_cols_ind[-3:])

# 4.2) Melt para los indicadores co2, energía, etc.
df_l = df_ind.melt(
    id_vars=["Country Name", "Country Code", "Series Name", "Series Code"],
    value_vars=year_cols_ind,
    var_name="Year",
    value_name="Value"
)
# extraer solo el año numérico
df_l["Year"] = df_l["Year"].str.extract(r"(\d{4})").astype(int)
df_l.head(3)


# 4.3) Detectar los year_cols de df_gdp
# en este API vienen como columnas puramente numéricas "1960","1961",…
year_cols_gdp = [c for c in df_gdp.columns if re.match(r"^\d{4}$", c)]
print("Años en GDP (formato df_gdp):", year_cols_gdp[:5], "...", year_cols_gdp[-5:])

# 4.4) Melt para el GDP
df_gdp_long = df_gdp.melt(
    id_vars=["CountryCode", "Country", "IndicatorCode", "Indicator"],
    value_vars=year_cols_gdp,
    var_name="Year",
    value_name="GDP"
)
# convertir GDP a numérico y quitar nulos
df_gdp_long["GDP"] = pd.to_numeric(df_gdp_long["GDP"], errors="coerce")
df_gdp_long = df_gdp_long.dropna(subset=["GDP"])
df_gdp_long["Year"] = df_gdp_long["Year"].astype(int)
df_gdp_long.head(3)


Años en indicadores (formato df_ind): ['1960 [YR1960]', '1961 [YR1961]', '1962 [YR1962]', '1963 [YR1963]', '1964 [YR1964]'] ... ['2021 [YR2021]', '2022 [YR2022]', '2023 [YR2023]']
Años en GDP (formato df_gdp): ['1960', '1961', '1962', '1963', '1964'] ... ['2020', '2021', '2022', '2023', '2024']


Unnamed: 0,CountryCode,Country,IndicatorCode,Indicator,Year,GDP
1,AFE,Africa Eastern and Southern,NY.GDP.MKTP.CD,GDP (current US$),1960,24210630000.0
3,AFW,Africa Western and Central,NY.GDP.MKTP.CD,GDP (current US$),1960,11904950000.0
13,AUS,Australia,NY.GDP.MKTP.CD,GDP (current US$),1960,18607680000.0


### 5. Limpieza de los valores numéricos
En esta celda limpiamos ambas columnas de valor (`Value` para indicadores y `GDP` para PIB):
- Reemplazamos `".."` y `"-"` por cadena vacía.
- Convertimos a numérico, forzando `NaN` cuando no sea posible.
- Filtramos las filas sin valor.

In [5]:
# 5) Limpiar tanto df_l como df_gdp_long
for df, col in [(df_l, "Value"), (df_gdp_long, "GDP")]:
    # 5.1) Reemplazar marcadores de faltante
    df[col] = (
        df[col]
          .astype(str)
          .replace({"..": "", "-": ""}, regex=False)
    )
    # 5.2) Forzar a numérico
    df[col] = pd.to_numeric(df[col], errors="coerce")
    # 5.3) Eliminar filas sin valor
    n_missing = df[col].isna().sum()
    print(f"⚠️ {n_missing} filas sin valor en columna `{col}` → se descartan")
    df.dropna(subset=[col], inplace=True)


⚠️ 51052 filas sin valor en columna `Value` → se descartan
⚠️ 0 filas sin valor en columna `GDP` → se descartan


#### 6. Normalización del nombre de país y mapeo a `pais_id`  
En esta celda:
1. Creamos la columna `country_norm` en minúsculas y sin espacios extra.
2. Aplicamos nuestro diccionario de excepciones para coincidir con `nombre_en` en la base.
3. Hacemos `map` usando la dimensión de países cargada desde MySQL.
4. Listamos los que siguen sin mapeo y filtramos.


In [6]:
# 6.1) Normalizar nombre original de país
df_l["country_norm"] = (
    df_l["Country Name"]
       .str.strip()
       .str.lower()
)
df_gdp_long["country_norm"] = (
    df_gdp_long["Country"]
      .str.strip()
      .str.lower()
)

# 6.2) Cargar dimensión Paises (codigo, nombre_en)
cursor.execute("SELECT codigo, nombre_en FROM Paises;")
dim_p = {
    nombre_en.strip().lower(): codigo
    for codigo, nombre_en in cursor.fetchall()
}

# 6.3) Diccionario de excepciones para `country_norm`
exceptions = {
    'netherlands':                    'netherlands (kingdom of the)',
    'turkiye':                        'türkiye',
    'united kingdom':                 'united kingdom of great britain and northern ireland',
    'bahamas, the':                   'bahamas',
    'bolivia':                        'bolivia (plurinational state of)',
    'congo, dem. rep.':               'congo (the democratic republic of the)',
    'congo, rep.':                    'congo',
    "cote d'ivoire":                  "côte d'ivoire",
    'egypt, arab rep.':               'egypt',
    'gambia, the':                    'gambia',
    'hong kong sar, china':           'hong kong',
    'iran, islamic rep.':             'iran (islamic republic of)',
    'korea, rep.':                    'korea (the republic of)',
    'micronesia, fed. states of':     'micronesia (federated states of)',
    'st. vincent and the grenadines': 'saint vincent and the grenadines',
    'tanzania':                       'tanzania, the united republic of',
    'curacao':                        'curaçao',
    "korea, dem. people's rep.":     "korea (the democratic people's republic of)",
    'slovak republic':                'slovakia',
    'venezuela, rb':                  'venezuela (bolivarian republic of)',
    'yemen, rep.':                    'yemen',
    'st. kitts and nevis':            'saint kitts and nevis',
    'st. lucia':                      'saint lucia',
    'macao sar, china':               'macao',
    'lao pdr':                        "lao people's democratic republic",
    'kyrgyz republic':                'kyrgyzstan',
    'russian federation':             'russian federation',
    'moldova':                        'moldova (the republic of)',
    'united states':                  'united states of america',
    'st. martin (french part)':       'saint martin (french part)',
    'british virgin islands':         'virgin islands (british)',
}

# 6.4) Aplicar excepciones y crear columna intermedia
df_l["country_db"]       = df_l["country_norm"].map(lambda x: exceptions.get(x, x))
df_gdp_long["country_db"]= df_gdp_long["country_norm"].map(lambda x: exceptions.get(x, x))

# 6.5) Mapear a 'pais_id' y listar no mapeados
df_l["pais_id"]        = df_l["country_db"].map(dim_p)
df_gdp_long["pais_id"] = df_gdp_long["country_db"].map(dim_p)

no_map_l   = df_l.loc[df_l["pais_id"].isna(), "Country Name"].unique()
no_map_gdp = df_gdp_long.loc[df_gdp_long["pais_id"].isna(), "Country"].unique()
print("⚠️ Países sin código en indicadores:", no_map_l)
print("⚠️ Países sin código en GDP:", no_map_gdp)

# 6.6) Filtrar solo los que sí tienen pais_id
df_l        = df_l[df_l["pais_id"].notna()]
df_gdp_long = df_gdp_long[df_gdp_long["pais_id"].notna()]


⚠️ Países sin código en indicadores: ['Central Europe and the Baltics' 'Early-demographic dividend'
 'East Asia & Pacific' 'Euro area' 'Europe & Central Asia'
 'Europe & Central Asia (excluding high income)'
 'Europe & Central Asia (IDA & IBRD countries)' 'European Union'
 'High income' 'IBRD only' 'IDA & IBRD total' 'Late-demographic dividend'
 'Low & middle income' 'Middle income' 'North America' 'OECD members'
 'Other small states' 'Post-demographic dividend' 'Small states'
 'Upper middle income' 'World' 'Micronesia, Fed. Sts.'
 'Africa Eastern and Southern' 'Africa Western and Central' 'Arab World'
 'Caribbean small states' 'East Asia & Pacific (excluding high income)'
 'East Asia & Pacific (IDA & IBRD countries)'
 'Fragile and conflict affected situations'
 'Heavily indebted poor countries (HIPC)' 'IDA blend' 'IDA only'
 'IDA total' 'Latin America & Caribbean'
 'Latin America & Caribbean (excluding high income)'
 'Latin America & the Caribbean (IDA & IBRD countries)'
 'Least devel

### 9. Inserción de hechos económicos (indicadores y PIB)

En esta celda vamos a:
1. Preparar y ejecutar el batch‐insert de los hechos de **indicadores** a partir de `df_l`.  
2. Preparar y ejecutar el batch‐insert de los hechos de **PIB** a partir de `df_gdp_long`.  
3. Usar lotes de 1 000 filas para no sobrecargar la base de datos.
. Insertar en DimIndicadores los 6 indicadores


In [7]:
# —————————————————————————————————————————————————————————
# 9) Insertar todos los hechos en la tabla “Hechos”
# —————————————————————————————————————————————————————————

# 9.1) Leemos el mapping indicador code → indicador_id
cursor.execute("SELECT id, codigo FROM Indicadores;")
dim_ind_map = { code: _id for _id, code in cursor.fetchall() }

# 9.2) Preparar batch de indicadores climáticos (df_l)
batch_hechos = []
for _, row in df_l.iterrows():
    batch_hechos.append((
        row["pais_id"],                       # pais_id
        17,                                    # periodo_id (siempre anual)
        int(row["Year"]),                      # anio
        dim_ind_map[row["Series Code"]],       # indicador_id
        float(row["Value"])                    # valor
    ))

# 9.3) Preparar batch de PIB (df_gdp_long)
for _, row in df_gdp_long.iterrows():
    batch_hechos.append((
        row["pais_id"],                        # pais_id
        17,                                     # periodo_id
        int(row["Year"]),                       # anio
        dim_ind_map[row["IndicatorCode"]],      # indicador_id para PIB
        float(row["GDP"])                       # valor del PIB
    ))

# 9.4) Ejecutar INSERT en “Hechos” por lotes de 1000
sql_insert_hechos = """
INSERT INTO Hechos
  (pais_id, periodo_id, anio, indicador_id, valor)
VALUES (%s, %s, %s, %s, %s);
"""
batch_size = 1000
total = len(batch_hechos)
print(f"Total de hechos a insertar: {total}")
for start in range(0, total, batch_size):
    chunk = batch_hechos[start:start+batch_size]
    cursor.executemany(sql_insert_hechos, chunk)
    conexion.commit()
    print(f"  ✔ Filas insertadas {start+1}–{min(start+batch_size, total)}")

# 9.5) Cerrar cursor y conexión
cursor.close()
conexion.close()
print("✅ Todos los hechos han sido insertados correctamente en la tabla Hechos.")


Total de hechos a insertar: 50036
  ✔ Filas insertadas 1–1000
  ✔ Filas insertadas 1001–2000
  ✔ Filas insertadas 2001–3000
  ✔ Filas insertadas 3001–4000
  ✔ Filas insertadas 4001–5000
  ✔ Filas insertadas 5001–6000
  ✔ Filas insertadas 6001–7000
  ✔ Filas insertadas 7001–8000
  ✔ Filas insertadas 8001–9000
  ✔ Filas insertadas 9001–10000
  ✔ Filas insertadas 10001–11000
  ✔ Filas insertadas 11001–12000
  ✔ Filas insertadas 12001–13000
  ✔ Filas insertadas 13001–14000
  ✔ Filas insertadas 14001–15000
  ✔ Filas insertadas 15001–16000
  ✔ Filas insertadas 16001–17000
  ✔ Filas insertadas 17001–18000
  ✔ Filas insertadas 18001–19000
  ✔ Filas insertadas 19001–20000
  ✔ Filas insertadas 20001–21000
  ✔ Filas insertadas 21001–22000
  ✔ Filas insertadas 22001–23000
  ✔ Filas insertadas 23001–24000
  ✔ Filas insertadas 24001–25000
  ✔ Filas insertadas 25001–26000
  ✔ Filas insertadas 26001–27000
  ✔ Filas insertadas 27001–28000
  ✔ Filas insertadas 28001–29000
  ✔ Filas insertadas 29001–3000