# Limpieza in-place — World Bank (local)
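Si existe `worldbank_sample.csv` (formato ancho) y aún no hay `worldbank_tidy.csv`, lo convierte a formato largo y lo guarda como `worldbank_tidy.csv`.
Si ya existe `worldbank_tidy.csv`, lo limpia in-place (este archivo tiene prioridad cuando ambos existen).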

In [4]:
import pandas as pd, numpy as np, os, re
from pathlib import Path
# Input files in priority order: the first one present in the working
# directory becomes the file to process.
CANDIDATES = ['worldbank_tidy.csv','worldbank_sample.csv']
target = next(filter(os.path.isfile, CANDIDATES), None)
if target is None:
    # Nothing to work on: stop immediately with an explicit error.
    raise FileNotFoundError('No se encontró worldbank_tidy.csv ni worldbank_sample.csv en este directorio.')
def normalizar_decimales(df, cols):
    """Coerce the listed columns of *df* to numeric, tolerating formatted text.

    Each column is converted to text, stripped of every character other than
    digits, ``e``/``E``, ``.``, ``,`` and ``-``, has its separators unified to
    a single decimal point, and is then parsed with ``pd.to_numeric``.

    Separator handling:
      * only commas present  -> each comma becomes a decimal point
        (``"1,5"`` -> ``1.5``);
      * both ``,`` and ``.`` present -> the right-most one is taken as the
        decimal mark and the other is dropped as a thousands separator
        (``"1,234.5"`` and ``"1.234,5"`` both -> ``1234.5``).

    Args:
        df: DataFrame to clean. It is modified in place.
        cols: Column names to normalise; names missing from *df* are skipped.

    Returns:
        The same *df* object, with the processed columns numeric. Values
        that cannot be parsed become NaN.
    """
    def _unificar_separadores(texto):
        # Leave exactly one kind of separator ('.') in the string.
        pos_coma = texto.rfind(',')
        pos_punto = texto.rfind('.')
        if pos_coma != -1 and pos_punto != -1:
            if pos_coma > pos_punto:
                # "1.234,5": dots group thousands, the comma is the decimal mark.
                return texto.replace('.', '').replace(',', '.')
            # "1,234.5": commas group thousands, the dot is the decimal mark.
            return texto.replace(',', '')
        return texto.replace(',', '.')

    for c in cols:
        if c not in df.columns:
            continue
        limpio = df[c].astype(str).str.replace(r'[^0-9eE.,\-]', '', regex=True)
        # na_action='ignore' keeps missing entries away from the str helper.
        limpio = limpio.map(_unificar_separadores, na_action='ignore')
        df[c] = pd.to_numeric(limpio, errors='coerce')
    return df
def _limpiar_y_guardar(tabla, mensaje):
    """Coerce year/value to numbers, drop incomplete rows, save and report.

    Writes the result to ``worldbank_tidy.csv`` (without the index), prints
    *mensaje* followed by a ``describe(include='all')`` summary, and returns
    the cleaned frame.
    """
    tabla = normalizar_decimales(tabla, ['year','value'])
    tabla = tabla.dropna(subset=['country_name','series_name','year','value']).copy()
    tabla['year'] = tabla['year'].astype(int)
    tabla.to_csv('worldbank_tidy.csv', index=False)
    print(mensaje)
    # The summary used to be a bare expression inside the if/else, where its
    # result was discarded; print it so it is actually shown.
    print(tabla.describe(include='all'))
    return tabla


df = pd.read_csv(target)
# Strip header whitespace BEFORE detecting the year columns, so the names
# collected in year_like are exactly the ones present in df when melting.
df.columns = df.columns.str.strip()
# Wide exports label year columns like "2000 [YR2000]".
year_like = [c for c in df.columns if re.search(r'(19|20)\d{2} \[YR\d{4}\]', str(c))]
if year_like:
    # Wide layout: unpivot the year columns into (YearLabel, Value) rows.
    id_cols = [c for c in df.columns if ('Country' in c or 'Series' in c)]
    tidy = df.melt(id_vars=id_cols, value_vars=year_like, var_name='YearLabel', value_name='Value')
    # Keep only the four-digit year from labels such as "2000 [YR2000]".
    tidy['year'] = tidy['YearLabel'].str.extract(r'((?:19|20)\d{2})', expand=False).astype(float)
    tidy = tidy.rename(columns={
        'Country Name':'country_name','Country Code':'country_code',
        'Series Name':'series_name','Series Code':'series_code',
        'Value':'value'
    })
    tidy = tidy[['country_name','country_code','series_name','series_code','year','value']]
    tidy = _limpiar_y_guardar(tidy, 'Generado → worldbank_tidy.csv')
else:
    # Already long: normalise header names to snake_case and clean in place.
    df.columns = (
        df.columns
        .str.replace(' ', '_')
        .str.replace(r'[^0-9a-zA-Z_]', '', regex=True)
        .str.lower()
    )
    df = _limpiar_y_guardar(df, 'Limpio → worldbank_tidy.csv')


Limpio → worldbank_tidy.csv
