# In [None]:
# === 1. Importar librerías ===
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
import matplotlib.pyplot as plt

# === 2. Load the original file (e.g. the 2024T3 CSV, or all quarterly files combined) ===
raw_path = r"C:\Users\tomif\OneDrive\Escritorio\TP_IAD\data\raw"
file = f"{raw_path}\\usu_individual_T324.txt"   # the most recent quarter was chosen

# Columns needed downstream. This list must exist BEFORE the read because it
# drives `usecols`; it was previously defined only after the read (NameError).
cols = ["P21","AGLOMERADO","CH06","CH04","NIVEL_ED","CAT_OCUP","PONDERA"]

# The header is matched case-insensitively so the column selection agrees with
# the upper-casing applied right after the read.
df = pd.read_csv(
    file,
    sep=";",
    encoding="latin-1",
    usecols=lambda name: name.upper() in cols,
)
df.columns = [c.upper() for c in df.columns]

# === 3. Filter agglomerations and clean ===
# Keep only the rows whose AGLOMERADO code is 13 or 32.
df = df[df["AGLOMERADO"].isin([13,32])].copy()

# === 4. Select the required variables ===
# Coerce every selected column to numeric; unparseable values become NaN.
modelo_df = df[cols].apply(pd.to_numeric, errors="coerce")

# Keep only rows where all predictors are present.
modelo_df = modelo_df.dropna(subset=["CH06","CH04","NIVEL_ED","CAT_OCUP"])

# === 5. Income variable ===
# (no deflation is applied here; the nominal P21 value is used as-is)
# Rows with missing or non-positive P21 are discarded.
modelo_df = modelo_df[modelo_df["P21"] > 0].copy()

# === 6. Split into training and test sets ===
# Subsample BEFORE extracting X and y so the subsample is what the model
# actually sees (previously the sample was drawn after X/y were built and had
# no effect). The size is capped at the number of available rows because
# DataFrame.sample raises ValueError when n exceeds the population.
modelo_df = modelo_df.sample(min(3000, len(modelo_df)), random_state=42)

X = modelo_df[["AGLOMERADO","CH06","CH04","NIVEL_ED","CAT_OCUP"]]
y = modelo_df["P21"]

# 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# === 7. Fit the model ===
# Ordinary least squares on the training partition (fit() returns the estimator).
modelo = LinearRegression().fit(X_train, y_train)

# === 8. Evaluate ===
# Score the held-out partition with R² and mean absolute error.
y_pred = modelo.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"R² = {r2:.3f}")
print(f"MAE = {mae:,.0f}")

# === 9. Coefficients ===
# One row per predictor, ordered from the largest coefficient to the smallest.
coef_df = pd.DataFrame({"Variable": X.columns, "Coeficiente": modelo.coef_})
coef_df = coef_df.sort_values("Coeficiente", ascending=False)
print(coef_df)

# === 10. Predicted vs. observed plot ===
plt.figure(figsize=(6, 6))
plt.scatter(y_test, y_pred, alpha=0.4, edgecolor="k")

# Dashed red identity line spanning the observed range: points on it are
# perfect predictions.
limits = [y_test.min(), y_test.max()]
plt.plot(limits, limits, 'r--')

plt.title("Modelo de regresión lineal múltiple — imputación de ingresos (EPH)")
plt.xlabel("Ingreso observado (P21)")
plt.ylabel("Ingreso predicho")
plt.grid(True, linestyle="--", linewidth=0.5)
plt.tight_layout()
plt.show()
