In [54]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split

In [55]:
def preprocess_data(df, fit_preprocessor=None):
    """Impute missing values and encode the categorical/ordinal features of ``df``.

    Steps, in order:

    1. ``employment_industry`` / ``employment_occupation`` are set to
       ``'Not Applicable'`` on rows whose ``employment_status`` is
       ``'Not in Labor Force'`` or ``'Unemployed'``.
    2. Remaining gaps in the categorical and ordinal columns become the
       explicit category ``'Missing'``.
    3. Every numeric column gets a ``<col>_is_missing`` 0/1 indicator.
    4. Numeric gaps are filled with an out-of-range value: the column minimum
       minus one.
    5. Categorical columns are one-hot encoded, ordinal columns are mapped to
       their rank in ``ordinal_order``, and everything else is passed through.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw feature frame. It is copied internally and never modified.
    fit_preprocessor : ColumnTransformer, optional
        A transformer previously returned by this function. When given, it is
        reused instead of fitting a new one, and the numeric fill values
        learned at fit time (stored on it as ``numeric_fill_values_``) are
        reused as well, so held-out data is imputed exactly like the training
        data. A transformer without that attribute falls back to fill values
        computed from ``df`` itself.

    Returns
    -------
    processed_df : pandas.DataFrame
        Encoded features, indexed like ``df``.
    preprocessor_fitted : ColumnTransformer
        The fitted transformer (newly fitted, or ``fit_preprocessor``).
    """
    # Column roles
    cat_cols = ['race', 'sex', 'marital_status', 'rent_or_own', 'hhs_geo_region',
                'census_msa', 'employment_industry', 'employment_occupation']
    ord_cols = ['age_group', 'education', 'income_poverty', 'employment_status']
    encoded_cols = cat_cols + ord_cols

    # Explicit rank order for each ordinal column
    ordinal_order = {
        # age_group has no missing values.
        'age_group': ['18 - 34 Years', '35 - 44 Years', '45 - 54 Years', '55 - 64 Years', '65+ Years'],
        # 'Missing' is ranked as a mid-level education.
        'education': ['< 12 Years', '12 Years','Missing', 'Some College', 'College Graduate'],
        # 'Missing' is ranked between below-poverty and above-poverty.
        'income_poverty': ['Below Poverty', 'Missing', '<= $75,000, Above Poverty', '> $75,000'],
        # 'Missing' is ranked between unemployed and employed.
        'employment_status': ['Not in Labor Force', 'Unemployed', 'Missing', 'Employed'],
    }

    # Work on a private copy so the caller's frame is left untouched.
    df = df.copy()

    # 1. Industry/occupation do not apply to respondents outside the workforce
    job_cols = ['employment_industry', 'employment_occupation']
    outside_workforce = df['employment_status'].isin(['Not in Labor Force', 'Unemployed'])
    df[job_cols] = df[job_cols].mask(outside_workforce, 'Not Applicable')

    # 2. Explicit 'Missing' category for categorical and ordinal columns
    for col in encoded_cols:
        df[col] = df[col].fillna('Missing')

    # Reuse the fill values learned at fit time when they are available;
    # recomputing them from a held-out frame would impute it differently
    # from the training data.
    fill_values = getattr(fit_preprocessor, 'numeric_fill_values_', None)
    if fill_values is None:
        # Encoded columns are excluded by name: an all-NaN categorical column
        # is read as float and would otherwise be mistaken for a numeric one.
        num_cols = [col for col in df.select_dtypes(include='number').columns
                    if col not in encoded_cols]
        fill_values = {}
        for col in num_cols:
            lowest = df[col].min(skipna=True)
            # A column with no observed value has no minimum; fall back to -1
            # so the output never contains NaN.
            fill_values[col] = -1 if pd.isna(lowest) else lowest - 1
    else:
        num_cols = list(fill_values)

    # 3. Missing-value indicators for numeric columns (before imputing)
    for col in num_cols:
        df[f"{col}_is_missing"] = df[col].isnull().astype(int)

    # 4. Extreme-value imputation for numeric columns
    for col in num_cols:
        df[col] = df[col].fillna(fill_values[col])

    # 5. Encode categorical and ordinal columns
    if fit_preprocessor is None:
        preprocessor_fitted = ColumnTransformer(
            transformers=[
                ('categorical_encoder', OneHotEncoder(handle_unknown='ignore'), cat_cols),
                # Category lists must follow the order of ord_cols, not dict order.
                ('ordinal_encoder', OrdinalEncoder(categories=[ordinal_order[col] for col in ord_cols]), ord_cols)
            ],
            remainder='passthrough',  # keep numeric columns and indicators
            sparse_threshold=0  # always return a dense array
        )
        processed_data = preprocessor_fitted.fit_transform(df)
        # Remember the fill values so later calls impute consistently.
        preprocessor_fitted.numeric_fill_values_ = fill_values
    else:
        preprocessor_fitted = fit_preprocessor
        # Align to the fit-time column order so the reconstructed names below
        # line up with the transformer's output columns.
        fitted_columns = getattr(preprocessor_fitted, 'feature_names_in_', None)
        if fitted_columns is not None:
            df = df[list(fitted_columns)]
        processed_data = preprocessor_fitted.transform(df)

    # Rebuild column names: one-hot outputs, ordinal columns, then passthrough
    onehot_names = preprocessor_fitted.named_transformers_['categorical_encoder'].get_feature_names_out(cat_cols)
    encoded_set = set(encoded_cols)
    passthrough_names = [col for col in df.columns if col not in encoded_set]
    output_columns = list(onehot_names) + ord_cols + passthrough_names

    processed_df = pd.DataFrame(processed_data, columns=output_columns, index=df.index)

    return processed_df, preprocessor_fitted

In [56]:
# Read the training features and their labels, both indexed by respondent_id
data = pd.read_csv(
    'Data/training_set_features.csv',
    index_col="respondent_id",
)
labels = pd.read_csv(
    'Data/training_set_labels.csv',
    index_col="respondent_id",
)

In [57]:
# Hold out 20% of the rows before any preprocessing, stratifying on the labels
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [58]:
# Fit the preprocessor on the training split only
X_train_preprocessed, preprocessor = preprocess_data(
    X_train.copy(),
)

# Apply that same fitted preprocessor to the held-out split
X_test_preprocessed, _ = preprocess_data(
    X_test.copy(),
    fit_preprocessor=preprocessor,
)

# Persist the preprocessed features and the matching labels
X_train_preprocessed.to_csv(
    'Data/preprocessed_X_train.csv', index=True
)
X_test_preprocessed.to_csv(
    'Data/preprocessed_X_test.csv', index=True
)
y_train.to_csv(
    'Data/preprocessed_y_train.csv', index=True
)
y_test.to_csv(
    'Data/preprocessed_y_test.csv', index=True
)

In [59]:
# Run the external test set through the preprocessor fitted on X_train
test_set_features = pd.read_csv(
    "Data/test_set_features.csv",
    index_col="respondent_id",
)
test_set_preprocessed, _ = preprocess_data(
    test_set_features.copy(),
    fit_preprocessor=preprocessor,
)

# Persist the preprocessed external test set
test_set_preprocessed.to_csv(
    "Data/preprocessed_test_set_features.csv",
    index=True,
)

In [60]:
# 1. Dataset dimensions
print("Dimensiones del dataset:")
print(f"Filas: {X_train_preprocessed.shape[0]}, Columnas: {X_train_preprocessed.shape[1]}")

# 2. General dataset information.
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
print("\nInformación general del dataset:")
X_train_preprocessed.info()

# 3. Share of missing values per column, as a percentage
print("\nValores faltantes por columna (porcentaje):")
missing_values = X_train_preprocessed.isna().mean() * 100
print(missing_values)

Dimensiones del dataset:
Filas: 21365, Columnas: 123

Información general del dataset:
<class 'pandas.core.frame.DataFrame'>
Index: 21365 entries, 12230 to 467
Columns: 123 entries, race_Black to household_children_is_missing
dtypes: float64(123)
memory usage: 20.2 MB
None

Valores faltantes por columna (porcentaje):
race_Black                                0.0
race_Hispanic                             0.0
race_Other or Multiple                    0.0
race_White                                0.0
sex_Female                                0.0
                                         ... 
opinion_seas_vacc_effective_is_missing    0.0
opinion_seas_risk_is_missing              0.0
opinion_seas_sick_from_vacc_is_missing    0.0
household_adults_is_missing               0.0
household_children_is_missing             0.0
Length: 123, dtype: float64
