# 01 - Data Engineering: Ingesta y Limpieza

**Objetivo:** Descargar, limpiar, unificar a frecuencia mensual y unir los datos en un único dataset listo para el análisis.

**Fuentes:**
- FRED API: CPI, Federal Funds Rate, Oil Price
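- World Bank: Gold Price (1960-2025), leído del archivo `CMO-Historical-Data-Monthly.xlsx` (hoja `Monthly Prices`), que debe copiarse manualmente a `data/raw/`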

**Output:** `data/processed/dataset_modelo.csv`

## 1. Setup

In [None]:
# Librerías
import pandas as pd
import numpy as np
import requests
import os
from dotenv import load_dotenv

# Working directory: the relative paths used throughout the notebook
# (data/raw, data/processed) are resolved against this project folder.
os.chdir("/Users/javiermondragon/Documents/data_projects/inflation-predictor")

# Load the FRED API key from the environment; load_dotenv() adds the
# variables of a .env file to the environment when it finds one.
load_dotenv()
api_key = os.getenv("FRED_API_KEY")
if not api_key:
    # Fail early with an actionable message: without this guard the slice
    # below raises an opaque TypeError because os.getenv returned None.
    raise RuntimeError(
        "FRED_API_KEY no está definida; agrégala al entorno o al archivo .env"
    )
print(f"API Key cargada: {api_key[:5]}...")

# Create the output folders if they do not exist yet
os.makedirs("data/raw", exist_ok=True)
os.makedirs("data/processed", exist_ok=True)
print("Setup completo")

## 2. Descarga de Datos

### 2.1 Series de FRED

In [None]:
# FRED series to fetch: FRED series id -> short name used for the raw file names
series_fred = {
    "CPIAUCSL": "cpi",
    "FEDFUNDS": "fed_rate",
    "DCOILWTICO": "oil_price"
}

url = "https://api.stlouisfed.org/fred/series/observations"

# Check that every series can be fetched before downloading anything
print("Verificando series...\n")
for codigo in series_fred:
    params = {
        "series_id": codigo,
        "api_key": api_key,
        "file_type": "json"
    }
    try:
        # The timeout keeps the cell from hanging forever on a stalled connection.
        response = requests.get(url, params=params, timeout=30)
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Report only the exception type: the exception text may embed the
        # request URL, which carries the API key as a query parameter.
        print(f"✗ {codigo}: ERROR - {type(exc).__name__}")
        continue

    if "observations" in data:
        print(f"✓ {codigo}: {len(data['observations'])} registros")
    else:
        print(f"✗ {codigo}: ERROR - {data.get('error_message')}")

In [None]:
# Download every FRED series and store the raw observations as CSV
print("Descargando datos...\n")

for codigo, nombre in series_fred.items():
    params = {
        "series_id": codigo,
        "api_key": api_key,
        "file_type": "json"
    }

    # The timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(url, params=params, timeout=30)
    data = response.json()

    if "observations" not in data:
        # Surface the API's own explanation instead of a bare KeyError, and
        # stop before writing a raw file for this series.
        raise RuntimeError(
            f"{codigo}: FRED no devolvió observaciones - {data.get('error_message')}"
        )

    df = pd.DataFrame(data["observations"])
    df.to_csv(f"data/raw/{nombre}_raw.csv", index=False)

    print(f"✓ {nombre}_raw.csv - {len(df)} filas")

print("\nDatos de FRED guardados")

### 2.2 Oro del World Bank

In [None]:
# Load the gold series from the World Bank workbook.
# NOTE: CMO-Historical-Data-Monthly.xlsx must be copied into data/raw/ first.
hoja_precios = pd.read_excel(
    'data/raw/CMO-Historical-Data-Monthly.xlsx',
    sheet_name='Monthly Prices',
    header=4,
)

# Keep only the date and gold columns, then drop the first row (units row)
recorte = hoja_precios.rename(columns={'Unnamed: 0': 'date'})
recorte = recorte[['date', 'Gold']].copy().iloc[1:]

# Convert types: swap the 'M' in the date text for '-' so it parses with
# the %Y-%m format; gold values that are not numeric become NaN.
fechas = pd.to_datetime(recorte['date'].str.replace('M', '-'), format='%Y-%m')
precios = pd.to_numeric(recorte['Gold'], errors='coerce')
df_gold_wb = pd.DataFrame({'date': fechas, 'value': precios})

# Save
df_gold_wb.to_csv('data/raw/gold_price_raw.csv', index=False)

inicio = df_gold_wb['date'].min().strftime('%Y-%m')
fin = df_gold_wb['date'].max().strftime('%Y-%m')
print(f"✓ gold_price_raw.csv - {len(df_gold_wb)} filas")
print(f"  Rango: {inicio} a {fin}")

## 3. Perfilamiento de Datos

In [None]:
# Raw CSV files to profile, keyed by dataset name
rutas_crudas = {
    'cpi': 'data/raw/cpi_raw.csv',
    'fed_rate': 'data/raw/fed_rate_raw.csv',
    'oil_price': 'data/raw/oil_price_raw.csv',
    'gold_price': 'data/raw/gold_price_raw.csv',
}
datasets = {clave: pd.read_csv(ruta) for clave, ruta in rutas_crudas.items()}

# Print a short profile (row count and column names) for each dataset
print("PERFIL DE DATOS CRUDOS")
print("=" * 60)

for nombre, df in datasets.items():
    total_filas = len(df)
    columnas = list(df.columns)
    print(f"\n{nombre.upper()}")
    print(f"  Filas: {total_filas}")
    print(f"  Columnas: {columnas}")

## 4. Identificar Problemas

In [None]:
# Look for problematic values in the FRED series
print("IDENTIFICACIÓN DE PROBLEMAS")
print("=" * 60)

archivos_fred = ['cpi', 'fed_rate', 'oil_price']

for nombre in archivos_fred:
    df = datasets[nombre]
    primera_fecha = df['date'].min()
    ultima_fecha = df['date'].max()

    print(f"\n{nombre.upper()}")
    print(f"  Fechas: {primera_fecha} a {ultima_fecha}")

    # Rows whose value cannot be parsed as a number (coerced to NaN)
    es_invalido = pd.to_numeric(df['value'], errors='coerce').isna()
    no_numericos = df.loc[es_invalido, 'value'].unique()
    resumen = no_numericos if len(no_numericos) > 0 else 'Ninguno'
    print(f"  Valores no numéricos: {resumen}")
    print(f"  Cantidad: {int(es_invalido.sum())}")

# Gold (already processed when its raw CSV was written)
print("\nGOLD_PRICE")
df_gold = datasets['gold_price']
fechas_oro = df_gold['date']
nulos_oro = df_gold['value'].isna().sum()
print(f"  Fechas: {fechas_oro.min()} a {fechas_oro.max()}")
print(f"  Nulos: {nulos_oro}")

## 5. Limpieza

In [None]:
def limpiar_serie_fred(ruta: str, nombre_columna: str) -> pd.DataFrame:
    """Load a raw FRED observations CSV and return a tidy two-column frame.

    Steps:
    - read the CSV, treating "." in the ``value`` column as missing (NaN)
    - keep only the ``date`` and ``value`` columns
    - convert ``date`` to datetime and ``value`` to numeric
    - rename ``value`` to ``nombre_columna``

    Args:
        ruta: Path of the raw CSV file; it must contain ``date`` and
            ``value`` columns.
        nombre_columna: Name given to the ``value`` column in the result.

    Returns:
        DataFrame with columns ``date`` and ``nombre_columna``.

    Raises:
        KeyError: If ``date`` or ``value`` is missing from the file.
        ValueError: If ``value`` holds text other than "." that is not numeric.
    """
    # Declaring "." as an NA marker at parse time yields NaN directly,
    # instead of placing pd.NA inside an object column and relying on
    # pd.to_numeric to accept it afterwards.
    df = pd.read_csv(ruta, na_values={'value': ['.']})
    df = df[['date', 'value']].copy()
    df['date'] = pd.to_datetime(df['date'])
    # Strict conversion on purpose: unexpected non-numeric text should raise
    # rather than silently turn into a missing value.
    df['value'] = pd.to_numeric(df['value'])
    return df.rename(columns={'value': nombre_columna})

def limpiar_serie_gold(ruta):
    """Load the World Bank gold CSV and return it with typed columns.

    The ``date`` column is converted to datetime, ``value`` is converted to
    numeric, and ``value`` is renamed to ``gold_price``. Any other columns
    in the file are passed through unchanged.
    """
    oro = pd.read_csv(ruta)
    oro = oro.assign(
        date=pd.to_datetime(oro['date']),
        value=pd.to_numeric(oro['value']),
    )
    return oro.rename(columns={'value': 'gold_price'})

In [None]:
# Clean each series: the three FRED files share one cleaner, gold has its own
fuentes_fred = [
    ('data/raw/cpi_raw.csv', 'cpi'),
    ('data/raw/fed_rate_raw.csv', 'fed_rate'),
    ('data/raw/oil_price_raw.csv', 'oil_price'),
]
df_cpi, df_fed, df_oil = (
    limpiar_serie_fred(ruta, columna) for ruta, columna in fuentes_fred
)
df_gold = limpiar_serie_gold('data/raw/gold_price_raw.csv')

# Report row count and missing values per cleaned series
print("Series limpiadas:")
resumen_limpieza = (
    ('CPI', df_cpi, 'cpi'),
    ('FED', df_fed, 'fed_rate'),
    ('OIL', df_oil, 'oil_price'),
    ('GOLD', df_gold, 'gold_price'),
)
for sigla, tabla, columna in resumen_limpieza:
    print(f"  {sigla}: {len(tabla)} filas, {tabla[columna].isna().sum()} nulos")

## 6. Unificar Frecuencias

In [None]:
# Show the frequency label and the year span covered by each series
print("Frecuencias actuales:")
frecuencias = (
    ('CPI', 'Mensual', df_cpi),
    ('FED', 'Mensual', df_fed),
    ('OIL', 'Diaria', df_oil),
    ('GOLD', 'Mensual', df_gold),
)
for sigla, frecuencia, tabla in frecuencias:
    anio_inicial = tabla['date'].min().year
    anio_final = tabla['date'].max().year
    print(f"  {sigla}: {frecuencia} ({anio_inicial}-{anio_final})")

In [None]:
# Convert OIL from daily to monthly by averaging each month
# ('MS' labels every bucket with the first day of its month)
oil_por_fecha = df_oil.set_index('date')
promedio_mensual = oil_por_fecha.resample('MS').mean()
df_oil_mensual = promedio_mensual.reset_index()

total_diarios = len(df_oil)
total_mensuales = len(df_oil_mensual)
print(f"OIL convertido: {total_diarios} diarios → {total_mensuales} mensuales")

## 7. Unir Datasets

In [None]:
# Join all series on date; outer joins keep dates present in any series
df_final = df_cpi.copy()
for serie in (df_fed, df_oil_mensual, df_gold):
    df_final = df_final.merge(serie, on='date', how='outer')

# Sort chronologically and rebuild a clean 0..n-1 index
df_final = df_final.sort_values('date')
df_final = df_final.reset_index(drop=True)

fecha_minima = df_final['date'].min()
fecha_maxima = df_final['date'].max()
print(f"Dataset unido: {len(df_final)} filas")
print(f"Rango: {fecha_minima} a {fecha_maxima}")
print("\nNulos por columna:")
print(df_final.isna().sum())

In [None]:
# Keep only the rows where every column has a value
filas_completas = df_final.dropna()
df_modelo = filas_completas.reset_index(drop=True)

fechas_modelo = df_modelo['date']
print(f"Dataset final: {len(df_modelo)} filas con datos completos")
print(f"Desde: {fechas_modelo.min()}")
print(f"Hasta: {fechas_modelo.max()}")

## 8. Guardar Dataset Procesado

In [None]:
# Write the modelling dataset to disk (without the index column)
df_modelo.to_csv('data/processed/dataset_modelo.csv', index=False)

# Summary of what was saved
periodo_inicio = df_modelo['date'].min().strftime('%Y-%m')
periodo_fin = df_modelo['date'].max().strftime('%Y-%m')
nombres_columnas = list(df_modelo.columns)

print("✓ Guardado: data/processed/dataset_modelo.csv")
print("\nResumen:")
print(f"  Filas: {len(df_modelo)}")
print(f"  Columnas: {nombres_columnas}")
print(f"  Período: {periodo_inicio} a {periodo_fin}")

In [None]:
# Preview: first rows of the final dataset (shown as the cell's output
# because the expression is the last statement in the cell)
df_modelo.head()

In [None]:
# Preview: last rows of the final dataset (shown as the cell's output
# because the expression is the last statement in the cell)
df_modelo.tail()