In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy import stats

# Load the two survey waves. The 2004 wave is read from Stata with
# convert_categoricals=False, so coded columns keep their raw values instead
# of being turned into pandas categoricals.
df_2004 = pd.read_stata('usu_individual_T104.dta', convert_categoricals=False)
df_2024 = pd.read_excel('usu_individual_T124.xlsx')

# Upper-case the column labels of both frames so they can be matched by name.
df_2004.columns = df_2004.columns.str.upper()
df_2024.columns = df_2024.columns.str.upper()

# Keep only the columns present in both waves, then stack the waves.
columnas_comunes = df_2004.columns.intersection(df_2024.columns)
df_2004_filtrado = df_2004.loc[:, columnas_comunes]
df_2024_filtrado = df_2024.loc[:, columnas_comunes]
df_combinado = pd.concat(
    [df_2004_filtrado, df_2024_filtrado],
    ignore_index=True,
)

# Work on an independent copy of the rows whose REGION code is 41
# (presumably the NEA region, going by the frame's name; confirm against the codebook).
es_region_41 = df_combinado['REGION'].eq(41)
df_eph_nea = df_combinado.loc[es_region_41].copy()

# EDAD2 variable: the square of CH06 (presumably age; confirm against the codebook).
if 'CH06' not in df_eph_nea.columns:
    print("No se encontró la columna 'CH06'.")
else:
    df_eph_nea['EDAD2'] = np.square(df_eph_nea['CH06'])
    print("Variable 'edad2' añadida exitosamente.")

# Variable EDUC
# Years credited before entering each CH12 level; the CH14 value is added on
# top of this base when the level is in progress or was left unfinished.
_EDUC_BASE_POR_NIVEL = {2: 0, 3: 0, 4: 6, 5: 9, 6: 12, 7: 12, 8: 17, 9: 0}

# Years credited when the CH12 level was completed (CH13 == 1).
_EDUC_NIVEL_COMPLETO = {1: 0, 2: 6, 3: 9, 4: 12, 5: 12, 6: 15, 7: 17, 8: 18, 9: 6}


def _educ_nivel_en_curso(ch12, ch14):
    """Years of education for a level that is in progress or was not finished."""
    # 98/99 in CH14 are mapped to a fixed 6 years before the level is inspected.
    if ch14 in (98, 99):
        return 6
    base = _EDUC_BASE_POR_NIVEL.get(ch12)
    if base is None:
        # Level 1 and any unrecognized level code count as 0 years.
        return 0
    return base + ch14


def calcular_educ(row):
    """Compute years of education for one survey row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'CH10', 'CH12' and 'CH14'; 'CH13' is read only when
        CH10 == 2. (Presumably the EPH codes for attendance, highest level,
        level completed and last year passed; confirm against the codebook.)

    Returns
    -------
    number
        * CH10 == 1: base years of the CH12 level plus CH14.
        * CH10 == 2 and CH13 == 1: fixed years for the completed CH12 level.
        * CH10 == 2 and CH13 == 2: same rule as CH10 == 1.
        * CH10 == 2 and any other CH13 (including missing): NaN.
        * Any other CH10 (including 3 and missing): 0.
        In the first three cases a CH14 of 98 or 99 yields 6, and an
        unrecognized CH12 yields 0.

    Notes
    -----
    The original code had no branch for CH10 == 2 with CH13 outside {1, 2} and
    implicitly returned None. NaN is returned explicitly instead, so the
    function always yields a numeric value. Through ``DataFrame.apply`` the
    stored column is unchanged, because None was already stored as NaN.
    """
    ch10 = row['CH10']
    if ch10 != 1 and ch10 != 2:
        return 0

    if ch10 == 2:
        ch13 = row['CH13']
        if ch13 == 1:
            if row['CH14'] in (98, 99):
                return 6
            return _EDUC_NIVEL_COMPLETO.get(row['CH12'], 0)
        if ch13 != 2:
            # Completion status unknown: years of education are undefined.
            return float('nan')

    # CH10 == 1, or CH10 == 2 with CH13 == 2: both use the same rule.
    return _educ_nivel_en_curso(row['CH12'], row['CH14'])

# Coerce the columns read by calcular_educ to numeric; values that cannot be
# parsed become NaN (errors='coerce'). Columns absent from the frame are skipped.
for col in ('CH10', 'CH12', 'CH13', 'CH14'):
    if col not in df_eph_nea.columns:
        continue
    df_eph_nea[col] = pd.to_numeric(df_eph_nea[col], errors='coerce')

# Row-wise evaluation: one call to calcular_educ per row.
df_eph_nea['EDUC'] = df_eph_nea.apply(calcular_educ, axis=1)
print("Variable 'educ' añadida exitosamente.")

# SALARIO_SEMANAL variable
# pp is the 2024/2004 ratio of the two SMVM constants, rounded to 2 decimals;
# it is used to rescale 2004 P21 values before comparing them with 2024.
SMVM2004 = 350
SMVM2024 = 202800
pp = round(SMVM2024 / SMVM2004, 2)

columnas_requeridas = {'ANO4', 'ESTADO', 'P21'}
if columnas_requeridas.issubset(df_eph_nea.columns):
    # Only rows with ESTADO == 1 and a strictly positive P21 get a value;
    # every other row is left as NaN in SALARIO_SEMANAL.
    con_ingreso = (df_eph_nea['ESTADO'] == 1) & (df_eph_nea['P21'] > 0)
    condiciones_2004 = (df_eph_nea['ANO4'] == 2004) & con_ingreso
    condiciones_2024 = (df_eph_nea['ANO4'] == 2024) & con_ingreso

    # P21 is divided by 21.65 and multiplied by 5 (presumably working days per
    # month and per week; TODO confirm). 2004 values are first scaled by pp.
    p21_2004 = df_eph_nea.loc[condiciones_2004, 'P21']
    p21_2024 = df_eph_nea.loc[condiciones_2024, 'P21']
    df_eph_nea.loc[condiciones_2004, 'SALARIO_SEMANAL'] = p21_2004 * pp / 21.65 * 5
    df_eph_nea.loc[condiciones_2024, 'SALARIO_SEMANAL'] = p21_2024 / 21.65 * 5
    print("Variable 'salario_semanal' añadida exitosamente.")
else:
    print("Faltan columnas necesarias para calcular 'salario_semanal'.")

# HORASTRAB variable
# 99, 999 and 9999 are replaced by NaN in both hour columns before summing
# (presumably "no answer" codes; confirm against the codebook).
variables_a_limpiar = ['PP3E_TOT', 'PP3F_TOT']
for var in variables_a_limpiar:
    if var not in df_eph_nea.columns:
        continue
    df_eph_nea[var] = df_eph_nea[var].replace([99, 999, 9999], np.nan)

# The total is NaN whenever either component is NaN.
if set(variables_a_limpiar) <= set(df_eph_nea.columns):
    df_eph_nea['HORASTRAB'] = df_eph_nea['PP3E_TOT'].add(df_eph_nea['PP3F_TOT'])
    print("Variable 'horastrab' añadida exitosamente.")
else:
    print("Faltan columnas necesarias para calcular 'horastrab'.")

# Persist the enriched frame.
df_eph_nea.to_excel('EPH_NEA_2004_2024.xlsx', index=False)
print("Archivo final guardado con todas las variables añadidas.")

# A row counts as "did not respond" when ESTADO is missing or equal to 0;
# every other row counts as "responded".
sin_respuesta = df_eph_nea['ESTADO'].isna() | (df_eph_nea['ESTADO'] == 0)
respondieron = df_eph_nea[~sin_respuesta]
norespondieron = df_eph_nea[sin_respuesta]

respondieron.to_excel('respondieron.xlsx', index=False)
norespondieron.to_excel('norespondieron.xlsx', index=False)

Variable 'edad2' añadida exitosamente.
Variable 'educ' añadida exitosamente.
Variable 'salario_semanal' añadida exitosamente.
Variable 'horastrab' añadida exitosamente.
Archivo final guardado con todas las variables añadidas.


In [43]:
variables = ['CH04', 'EDAD2', 'EDUC', 'CAT_OCUP', 'P21', 'SALARIO_SEMANAL', 'HORASTRAB']

# For each survey year: split the respondents 70/30 into train/test and export
# a balance table (N, mean, sd, Welch t-test) comparing both partitions.
for anio in [2004, 2024]:
    df_anio = respondieron[respondieron['ANO4'] == anio]
    df_anio = df_anio.dropna(subset=variables)

    # NOTE(review): SALARIO_SEMANAL is only assigned on rows with ESTADO == 1,
    # so after the dropna above every remaining row appears to have
    # ESTADO == 1 and this indicator is all zeros. Confirm the intended
    # sample before using y as a target.
    y = (df_anio['ESTADO'] == 2).astype(int).values
    X_df = df_anio[variables].copy()
    X_df.insert(0, 'constante', 1)  # Prepend the column of ones
    X = X_df.values

    # Split the data in two; the fixed random_state makes the split reproducible.
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=444)

    columnas = ['constante'] + variables
    test_mean_diff = pd.DataFrame(index=columnas)

    # Compute the statistics column by column.
    for i, var in enumerate(columnas):
        train_col = x_train[:, i]
        test_col = x_test[:, i]

        # NOTE(review): ndarray.std() defaults to ddof=0 (population sd), while
        # the Welch test below uses the sample variance. Kept as in the
        # original; switch to std(ddof=1) if sample sd is wanted.
        test_mean_diff.loc[var, 'N train'] = x_train.shape[0]
        test_mean_diff.loc[var, 'Mean train'] = train_col.mean()
        test_mean_diff.loc[var, 'sd train'] = train_col.std()

        test_mean_diff.loc[var, 'N test'] = x_test.shape[0]
        test_mean_diff.loc[var, 'Mean test'] = test_col.mean()
        test_mean_diff.loc[var, 'sd test'] = test_col.std()

        # The t statistic is undefined when neither sample has any variation
        # (always true for 'constante'). Calling ttest_ind there only yields
        # NaN plus a scipy RuntimeWarning, so record NaN directly instead.
        sin_variacion = (
            train_col.max() == train_col.min()
            and test_col.max() == test_col.min()
        )
        if sin_variacion:
            estadistico, p_valor = np.nan, np.nan
        else:
            t_test = stats.ttest_ind(train_col, test_col, equal_var=False)
            estadistico, p_valor = t_test.statistic, t_test.pvalue
        test_mean_diff.loc[var, 't-test'] = estadistico
        test_mean_diff.loc[var, 'p-value'] = p_valor

    columnas_a_redondear = ['Mean train', 'Mean test', 'sd train', 'sd test', 't-test', 'p-value']
    test_mean_diff[columnas_a_redondear] = test_mean_diff[columnas_a_redondear].round(2)

    test_mean_diff.to_excel(f'Tabla de diferencia de medias_{anio}.xlsx')
    print(f"Exportado para el año {anio} desde respondieron")

Exportado para el año 2004 desde respondieron
Exportado para el año 2024 desde respondieron


  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
