In [None]:
import gc
import requests
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from statsmodels.formula.api import ols
from sklearn.model_selection import train_test_split
# Paquetes nuestros
import models
import process_data

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option("display.max_columns", None)

# Folder that holds the train/test CSV extracts read by the next cell.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
data = r"D:\Maestría\Machine Learning\TP2\data"

# Query data

In [None]:
# Read the four raw CSV extracts (households / persons, for test and train)
# from the `data` folder, in the same order as the target names on the left.
test_hogares_df, test_personas_df, train_hogares_df, train_personas_df = (
    pd.read_csv(fr'{data}\{stem}.csv')
    for stem in ('test_hogares', 'test_personas', 'train_hogares', 'train_personas')
)

In [None]:
# Run the project's column-cleaning helpers and rebind the frames to their
# results. Both helpers take (test, train) and their result is unpacked in
# that same order.
# NOTE(review): the inputs are overwritten, so re-running this cell feeds the
# already-cleaned frames back into the helpers -- confirm they are idempotent.
hogares_limpios = process_data.limpia_columnas_hogares(
    test_hogares_df,
    train_hogares_df,
)
test_hogares_df, train_hogares_df = hogares_limpios

personas_limpias = process_data.limpia_columnas_personas(
    test_personas_df,
    train_personas_df,
)
test_personas_df, train_personas_df = personas_limpias

# Visualizations

In [None]:
import pandas as pd

# Per-region ("Dominio") summary table of the training households: share of
# poor and indigent households, and mean / standard deviation of total income.

# Regions are computed here so that the cell runs on a fresh kernel instead of
# relying on a variable defined by a later cell.
dominios = train_hogares_df['Dominio'].unique()

# Collect one record per region and build the DataFrame once at the end
# (DataFrame.append was removed in pandas 2.0 and is quadratic in a loop).
filas_resultado = []
for dominio in dominios:
    subset = train_hogares_df[train_hogares_df['Dominio'] == dominio]

    # Percentage of households flagged as poor / indigent in this region
    porcentaje_pobreza = (subset['Pobre'] == 1).mean() * 100
    porcentaje_indigencia = (subset['Indigente'] == 1).mean() * 100

    # Mean and standard deviation of the household income column
    ingreso_promedio = subset['Ingtotug'].mean()
    desvio_ingreso = subset['Ingtotug'].std()

    filas_resultado.append({
        'Dominio': dominio,
        '% Pobreza': porcentaje_pobreza,
        '% Indigencia': porcentaje_indigencia,
        # Stored as a formatted currency string (not a number), e.g. "$1,234.50"
        'Ingreso Promedio': f"${ingreso_promedio:,.2f}",
        'Desvío del Ingreso': desvio_ingreso
    })

# Explicit column list keeps the layout stable even when there are no regions
resultados_df = pd.DataFrame(
    filas_resultado,
    columns=['Dominio', '% Pobreza', '% Indigencia', 'Ingreso Promedio', 'Desvío del Ingreso'],
)

# Save the summary table to an Excel file
resultados_df.to_excel("resultados_pobreza.xlsx", index=False)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Grid of histograms, one panel per region ("Dominio"), showing the
# log-transformed household income with the Li and Lp lines overlaid.

# NaN regions are dropped: `== NaN` never matches, so such a region would give
# an empty subset and fail on `.iloc[0]` below.
dominios = train_hogares_df['Dominio'].dropna().unique()

# Grid layout: fixed number of columns, rows = ceil(n_regions / n_columns).
# (`len // 5 + len % 5` over-allocates rows, e.g. 24 regions -> 8 rows, not 5.)
num_columnas = 5
num_filas = max(1, int(np.ceil(len(dominios) / num_columnas)))

# squeeze=False always returns a 2-D array of axes, even with a single row,
# so no manual reshape is needed.
fig, axs = plt.subplots(
    num_filas, num_columnas,
    figsize=(20, 5 * num_filas),
    sharex=True,
    squeeze=False,
)

for i, dominio in enumerate(dominios):
    fila, columna = divmod(i, num_columnas)
    ax = axs[fila, columna]

    subset = train_hogares_df[train_hogares_df['Dominio'] == dominio]

    # log(1 + x) transforms kept in local variables: assigning new columns to
    # a filtered slice of the frame triggers SettingWithCopyWarning.
    ingreso_log = np.log1p(subset['Ingtotug']).rename('Ingtotug_log')
    # The first row's Li / Lp is used for the whole panel -- presumably both
    # are constant within a region; TODO confirm.
    li_log = np.log1p(subset['Li'].iloc[0])
    lp_log = np.log1p(subset['Lp'].iloc[0])

    # Histogram plus the two reference lines
    sns.histplot(ingreso_log, bins=30, kde=True, label='Ingreso', ax=ax)
    ax.axvline(li_log, color='orange', linestyle='dashed', linewidth=2, label='Li')
    ax.axvline(lp_log, color='red', linestyle='dashed', linewidth=2, label='Lp')

    ax.set_title(f'{dominio}')
    ax.set_xlabel('Ingreso (log)')
    ax.set_ylabel('Frecuencia')
    # With sharex=True only the bottom row shows tick labels; re-enable them so
    # panels sitting above a hidden (unused) axis remain readable.
    ax.tick_params(labelbottom=True)
    ax.legend()

# Hide the panels of the last row that have no region assigned
for ax_vacio in axs.ravel()[len(dominios):]:
    ax_vacio.set_visible(False)

plt.tight_layout()

plt.show()

# Run models

In [None]:
# Hyper-parameters handed to models.train_cla_models / models.train_reg_models.
# Key names (including the "verbos_" spelling) are kept exactly as written --
# presumably they are the names the `models` module looks up; verify there.
paras = {
    # presumably the number of cross-validation splits -- TODO confirm in `models`
    "splits": 5,
    "ridge": {
        "alpha": 0.5,
    },
    "lgb": {
        "max_depth": None,
        "lr": 0.01,
        "feature_fraction": 0.07,
        "verbos_": -1,
        "classification": True,
    },
    "xgb": {
        "eta": 0.1,
        "max_depth": 4,
        "subsample": 0.75,
        "colsample_by_tree": 0.07,
        "verbos_": 500,
    },
    "nn": {
        "nn_l1": 300,
        "nn_l2": 300,
        "epochs": 20,
        "batch": 64,
        "dp": 0.0,
        "classification": True,
    },
    # Per-model weights for the classification models (0.33 + 0.33 + 0.34)
    "w_cla_xgb": 0.33,
    "w_cla_lgb": 0.33,
    "w_cla_nn": 0.34,
    # Per-model weights for the regression models (0.33 + 0.33 + 0.34)
    "w_reg_ridge": 0.33,
    "w_reg_lgb": 0.33,
    "w_reg_nn": 0.34,
}


In [None]:
# Classification runs: build each dataset variant (A, B, C) with cla=True and
# train the classification models on it.
# X, y, X_test and preds are rebound by every step, so after this cell they
# hold the dataset C values only; the per-dataset results that survive are the
# cla{A,B,C}_loss / cla{A,B,C}_preds pairs.

# Dataset A (classification)
X, y, X_test = process_data.build_dataset_A(train_personas_df, train_hogares_df, test_personas_df, test_hogares_df, nan="dummies", cla=True)
claA_loss, claA_preds, preds = models.train_cla_models(X, y, paras, test_ = X_test)

# Dataset B (classification)
X, y, X_test = process_data.build_dataset_B(train_personas_df, train_hogares_df, test_personas_df, test_hogares_df, nan="dummies", cla=True)
claB_loss, claB_preds, preds = models.train_cla_models(X, y, paras, test_ = X_test)

# Dataset C (classification)
X, y, X_test = process_data.build_dataset_C(train_personas_df, train_hogares_df, test_personas_df, test_hogares_df, nan="dummies", cla=True)
claC_loss, claC_preds, preds = models.train_cla_models(X, y, paras, test_ = X_test)

In [None]:
# Regression runs: build each dataset variant (A, B, C) with cla=False and
# train the regression models on it.
# X, y, X_test and preds_regs are rebound by every step, so after this cell
# they hold the dataset C (regression) values only; the per-dataset results
# that survive are the reg{A,B,C}_loss / reg{A,B,C}_preds pairs.

# Dataset A (regression)
X, y, X_test = process_data.build_dataset_A(train_personas_df, train_hogares_df, test_personas_df, test_hogares_df, nan="dummies", cla=False)
regA_loss, regA_preds, preds_regs = models.train_reg_models(X, y, paras, test_ = X_test)

# Dataset B (regression)
X, y, X_test = process_data.build_dataset_B(train_personas_df, train_hogares_df, test_personas_df, test_hogares_df, nan="dummies", cla=False)
regB_loss, regB_preds, preds_regs = models.train_reg_models(X, y, paras, test_ = X_test)

# Dataset C (regression)
X, y, X_test = process_data.build_dataset_C(train_personas_df, train_hogares_df, test_personas_df, test_hogares_df, nan="dummies", cla=False)
regC_loss, regC_preds, preds_regs = models.train_reg_models(X, y, paras, test_ = X_test)


In [None]:
def export_submits(preds, name, lp=None, ids=None):
    """Write a submission CSV with one poverty flag per household.

    A household is flagged (``pobre`` = 1) when its prediction is less than or
    equal to its ``Lp`` value, otherwise 0. The file has two columns, the
    household id and ``pobre``, and is written to ``f"{name}.csv"``.

    Parameters
    ----------
    preds : array-like
        One prediction per household; flattened to 1-D before the comparison.
    name : str
        Output path without the ``.csv`` extension.
    lp : pandas.Series or array-like, optional
        Threshold per household. Defaults to the notebook-level ``X_test["Lp"]``.
    ids : pandas.Series, optional
        Household identifiers. Defaults to the notebook-level
        ``test_hogares_df.id``.

    Notes
    -----
    The flags are kept row by row. Reducing them with ``value_counts()`` leaves
    only two counts indexed by True/False, which cannot be matched to ids.
    When ``lp`` is a Series the flags carry its index and are aligned with
    ``ids`` on that index by ``pd.concat``.
    """
    if lp is None:
        lp = X_test["Lp"]
    if ids is None:
        ids = test_hogares_df.id

    # Flatten so that (n, 1)-shaped model outputs compare element-wise with lp
    preds_1d = np.asarray(preds).ravel()

    # Name the column explicitly: the comparison result would otherwise carry
    # the name of `lp` (or none at all), never the integer label 0.
    pobre = pd.Series(preds_1d <= lp).rename("pobre").astype(int)

    pd.concat([ids, pobre], axis=1).to_csv(f"{name}.csv", index=False)
    
# Prediction containers of every run, keyed by the run label used in the
# output file names. Each value is iterated as {model key: predictions}.
runs = {'claA_preds':claA_preds, 'claB_preds':claB_preds, 'claC_preds':claC_preds, 'regA_preds':regA_preds, 'regB_preds':regB_preds, 'regC_preds':regC_preds}

# One file per (run, model). Using only the run label as the file name makes
# every model of a run overwrite the previous one, leaving just the last model.
# NOTE(review): export_submits thresholds predictions against Lp, which looks
# meant for the regression (income) runs -- confirm it is also the intended
# rule for the classification runs.
for name, run in runs.items():
    for model, model_preds in run.items():
        export_submits(model_preds, name=f"{name}_{model}")
