In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

## Data loading

In [None]:
# Locations of the training inputs, relative to the notebook's working directory.
features_path = '../data/training_set_features.csv'
labels_path = '../data/training_set_labels.csv'

# Read the feature table first, then the label table.
features, labels = map(pd.read_csv, (features_path, labels_path))

In [None]:
# Print the feature table's columns with their dtypes and non-null counts.
features.info()

In [None]:
# Print the label table's columns with their dtypes and non-null counts.
labels.info()

In [None]:
# Preview the first rows of the feature table.
features.head()

In [None]:
# Preview the first rows of the label table.
labels.head()

In [None]:
# Pull the two binary targets out of the label table.
labels_h1n1 = labels['h1n1_vaccine']
labels_seasonal = labels['seasonal_vaccine']

# Features and labels are paired purely by row position further down, so make
# sure the IDs line up before the ID column is discarded. The check is skipped
# when the column is already gone (e.g. the cell is being re-run).
if 'respondent_id' in features.columns and 'respondent_id' in labels.columns:
    assert features['respondent_id'].tolist() == labels['respondent_id'].tolist(), \
        "features and labels are not aligned on respondent_id"

# Exclude the ID from the analysis; errors='ignore' keeps the cell re-runnable.
features = features.drop(columns=['respondent_id'], errors='ignore')

In [None]:
# Identificar columnas categóricas y numéricas
categorical_cols = features.select_dtypes(include=['object']).columns
numeric_cols = features.select_dtypes(include=['float64']).columns

In [None]:
# Imputar los valores faltantes
features_numeric = features[numeric_cols].fillna(features[numeric_cols].mean())

In [None]:
# Imputar los valores faltantes
features_categorical = features[categorical_cols].fillna('most_frequent')

# OneHotEncoder
features_categorical = pd.get_dummies(features, columns=categorical_cols)

In [None]:
features = pd.concat([features_numeric, features_categorical], axis=1)

## H1N1 Vaccine

In [None]:
# Dividir datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(features, labels_h1n1, test_size=0.2, random_state=42)

In [None]:
model_h1n1 = RandomForestClassifier(random_state=42)

# Entrenar el modelo
model_h1n1.fit(X_train, y_train)

In [None]:
# Predecir y calcular AUC-ROC
y_pred_prob = model_h1n1.predict_proba(X_test)[:, 1]
auc_roc = roc_auc_score(y_test, y_pred_prob)

auc_roc

## Seasonal Vaccine

In [None]:
# Dividir datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(features, labels_seasonal, test_size=0.2, random_state=42)

In [None]:
model_seasonal = RandomForestClassifier(random_state=42)

# Entrenar el modelo
model_seasonal.fit(X_train, y_train)

In [None]:
# Predecir y calcular AUC-ROC
y_pred_prob = model_seasonal.predict_proba(X_test)[:, 1]
auc_roc = roc_auc_score(y_test, y_pred_prob)

auc_roc

## Submission

In [None]:
test = pd.read_csv('../data/test_set_features.csv')
test.head()


In [None]:
submission_df = pd.DataFrame(test['respondent_id'])
submission_df.head()

In [None]:
test_features = test.drop(columns=['respondent_id'])

In [None]:
test_categorical_cols = test_features.select_dtypes(include=['object']).columns
test_numeric_cols = test_features.select_dtypes(include=['float64']).columns

In [None]:
# Imputar los valores faltantes
test_features_numeric = test_features[test_numeric_cols].fillna(test_features[test_numeric_cols].mean())

In [None]:
# Imputar los valores faltantes
test_features_categorical = test_features[test_categorical_cols].fillna('most_frequent')

# OneHotEncoder
test_features_categorical = pd.get_dummies(test_features, columns=test_categorical_cols)

In [None]:
test_features = pd.concat([test_features_numeric, test_features_categorical], axis=1)

In [None]:
# Positive-class probability (second predict_proba column) from each fitted
# model: H1N1 first, then seasonal.
h1n1_vaccine, seasonal_vaccine = (
    fitted_model.predict_proba(test_features)[:, 1]
    for fitted_model in (model_h1n1, model_seasonal)
)

In [None]:
# Attach each target's predicted probabilities as a column of the submission table.
for column_name, probabilities in (
    ('h1n1_vaccine', h1n1_vaccine),
    ('seasonal_vaccine', seasonal_vaccine),
):
    submission_df[column_name] = probabilities
submission_df.head()

In [None]:
# Write the submission table to submission.csv (relative to the working directory), without the index column.
submission_df.to_csv('submission.csv', index=False)