In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt

## Data loading

In [2]:
# Locations of the training inputs (relative paths)
features_path = '../data/training_set_features.csv'
labels_path = '../data/training_set_labels.csv'

# Read both CSV files into DataFrames, features first and labels second
features, labels = (pd.read_csv(csv_path) for csv_path in (features_path, labels_path))

In [3]:
# Summarize the feature table: column names, non-null counts and dtypes
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  int64  
 1   h1n1_concern                 26615 non-null  float64
 2   h1n1_knowledge               26591 non-null  float64
 3   behavioral_antiviral_meds    26636 non-null  float64
 4   behavioral_avoidance         26499 non-null  float64
 5   behavioral_face_mask         26688 non-null  float64
 6   behavioral_wash_hands        26665 non-null  float64
 7   behavioral_large_gatherings  26620 non-null  float64
 8   behavioral_outside_home      26625 non-null  float64
 9   behavioral_touch_face        26579 non-null  float64
 10  doctor_recc_h1n1             24547 non-null  float64
 11  doctor_recc_seasonal         24547 non-null  float64
 12  chronic_med_condition        25736 non-null  float64
 13  child_under_6_mo

In [4]:
# Summarize the label table: column names, non-null counts and dtypes
labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   respondent_id     26707 non-null  int64
 1   h1n1_vaccine      26707 non-null  int64
 2   seasonal_vaccine  26707 non-null  int64
dtypes: int64(3)
memory usage: 626.1 KB


In [5]:
# Preview the first rows of the feature table
features.head()

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


In [6]:
# Preview the first rows of the label table
labels.head()

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,0,0,0
1,1,0,1
2,2,0,0
3,3,0,1
4,4,0,0


In [7]:
# Pull the two targets out as separate Series, one per vaccine
labels_h1n1 = labels['h1n1_vaccine']
labels_seasonal = labels['seasonal_vaccine']

# The targets are paired with the feature rows purely by position further down
# (train_test_split), so verify that both tables list the respondents in the same
# order before the ID column is discarded.
# The membership guard also makes this cell safe to re-run: on a second execution
# the column is already gone and nothing happens, instead of `drop` raising KeyError.
if 'respondent_id' in features.columns:
    if not features['respondent_id'].equals(labels['respondent_id']):
        raise ValueError('features and labels are not aligned on respondent_id')
    features = features.drop(columns=['respondent_id'])  # the ID is excluded from the analysis

In [8]:
# Partition the feature columns by dtype: floating point vs. text (object)
numeric_cols = features.select_dtypes(include='float64').columns
categorical_cols = features.select_dtypes(include='object').columns

In [9]:
# Fill the gaps in every numeric column with that column's own mean
features_numeric = features.loc[:, numeric_cols]
features_numeric = features_numeric.fillna(features_numeric.mean())

In [10]:
# One-hot encode the categorical columns. The whole `features` frame is passed in,
# so the result keeps every numeric column as-is (NaNs included) next to the new
# indicator columns.
# NOTE(review): missing categories are not imputed here; get_dummies leaves such rows
# with all-zero indicators for that column (dummy_na defaults to False). If mode
# imputation is intended, it has to be added both here and in the matching test-set
# cell so that train and test stay consistent.
features_categorical = pd.get_dummies(data=features, columns=categorical_cols)

In [11]:
# Assemble the training matrix from the imputed numeric block and the encoded block.
# NOTE(review): `features_categorical` was built from the full `features` frame, so it
# already contains the numeric columns; after this concat every numeric column name
# appears twice (imputed copy first, un-imputed copy second). The test set is assembled
# the same way further down, so removing one copy means changing both places together.
features = pd.concat([features_numeric, features_categorical], axis=1)

## H1N1 Vaccine

In [12]:
# Hold out 20% of the rows to evaluate the H1N1 model (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels_h1n1,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Random-forest hyperparameter candidates: 3 values per parameter, 81 combinations
param_grid = dict(
    n_estimators=[500, 800, 1000],
    max_depth=[10, 20, 30],
    min_samples_split=[10, 25, 50],
    min_samples_leaf=[5, 10, 20],
)

In [14]:
# Tune a random forest for the H1N1 target with an exhaustive grid search,
# scored by 5-fold cross-validated ROC AUC on the training split.
grid_search = GridSearchCV(
    # Seed the forest so that the search, the selected parameters and the refit
    # model are the same on every run; an unseeded forest changes between executions.
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,  # run the candidate fits in parallel on all available cores
)

grid_search.fit(X_train, y_train)
# Keep the best parameter combination, refit by GridSearchCV on the whole training split
model_h1n1 = grid_search.best_estimator_

In [15]:
# Score the held-out split: take the second predict_proba column (class 1), then ROC AUC
y_pred_prob = model_h1n1.predict_proba(X_test)[:, 1]
auc_roc = roc_auc_score(y_true=y_test, y_score=y_pred_prob)

auc_roc

0.8700919832925733

In [16]:
# Show the hyperparameter combination selected by the grid search
grid_search.best_params_

{'max_depth': 30,
 'min_samples_leaf': 5,
 'min_samples_split': 25,
 'n_estimators': 1000}

## Seasonal Vaccine

In [17]:
# Hold out 20% of the rows to evaluate the seasonal model (same seed as the H1N1 split)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels_seasonal,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Hyperparameter candidates for the seasonal model: 3 values per parameter, 81 combinations
param_grid = dict(
    n_estimators=[500, 800, 1000],
    max_depth=[10, 20, 30],
    min_samples_split=[10, 25, 50],
    min_samples_leaf=[5, 10, 20],
)

In [19]:
# Tune a random forest for the seasonal-vaccine target with an exhaustive grid search,
# scored by 5-fold cross-validated ROC AUC on the training split.
grid_search = GridSearchCV(
    # Seed the forest so that the search, the selected parameters and the refit
    # model are the same on every run; an unseeded forest changes between executions.
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,  # run the candidate fits in parallel on all available cores
)

grid_search.fit(X_train, y_train)
# Keep the best parameter combination, refit by GridSearchCV on the whole training split
model_seasonal = grid_search.best_estimator_

In [20]:
# Score the held-out split: take the second predict_proba column (class 1), then ROC AUC
y_pred_prob = model_seasonal.predict_proba(X_test)[:, 1]
auc_roc = roc_auc_score(y_true=y_test, y_score=y_pred_prob)

auc_roc

0.8617794274525776

In [21]:
# Show the hyperparameter combination selected by the grid search
grid_search.best_params_

{'max_depth': 30,
 'min_samples_leaf': 5,
 'min_samples_split': 10,
 'n_estimators': 1000}

## Submission

In [22]:
# Load the test-set features that the submission is predicted from, and preview them
test = pd.read_csv('../data/test_set_features.csv')
test.head()


Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,26707,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"> $75,000",Not Married,Rent,Employed,mlyzmhmf,"MSA, Not Principle City",1.0,0.0,atmlpfrs,hfxkjkmi
1,26708,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,Non-MSA,3.0,0.0,atmlpfrs,xqwwgdyp
2,26709,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,"> $75,000",Married,Own,Employed,lrircsnp,Non-MSA,1.0,0.0,nduyfdeo,pvmttkik
3,26710,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Married,Own,Not in Labor Force,lrircsnp,"MSA, Not Principle City",1.0,0.0,,
4,26711,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,lzgpxyit,Non-MSA,0.0,1.0,fcxhlnwr,mxkfnird


In [23]:
# Start the submission table as a one-column frame holding the test respondent IDs
submission_df = test[['respondent_id']].copy()
submission_df.head()

Unnamed: 0,respondent_id
0,26707
1,26708
2,26709
3,26710
4,26711


In [24]:
# Remove the ID column from the test set, mirroring the training features
test_features = test.drop('respondent_id', axis=1)

In [25]:
# Partition the test columns by dtype: floating point vs. text (object)
test_numeric_cols = test_features.select_dtypes(include='float64').columns
test_categorical_cols = test_features.select_dtypes(include='object').columns

In [26]:
# Fill the gaps in the numeric test columns with the TRAINING means instead of means
# computed on the test set, so unseen rows are filled with the same values the models
# saw for missing entries while being fitted.
# `features_numeric` is the mean-imputed training block; mean imputation leaves each
# column's mean unchanged, so its means equal the raw training means. `fillna` matches
# the means to the test columns by column name.
test_features_numeric = test_features[test_numeric_cols].fillna(features_numeric.mean())

In [27]:
# One-hot encode the categorical test columns. The whole `test_features` frame is
# passed in, so the result keeps every numeric column as-is (NaNs included) next to
# the new indicator columns.
# NOTE(review): missing categories are not imputed here; get_dummies leaves such rows
# with all-zero indicators for that column (dummy_na defaults to False). The indicator
# columns are derived from the categories present in the test set alone, so they match
# the training columns only if both sets contain the same categories; verify.
test_features_categorical = pd.get_dummies(data=test_features, columns=test_categorical_cols)

In [28]:
# Assemble the test matrix from the imputed numeric block and the encoded block.
# NOTE(review): `test_features_categorical` was built from the full `test_features`
# frame, so it already contains the numeric columns; after this concat every numeric
# column name appears twice (imputed copy first, un-imputed copy second). The training
# matrix is assembled the same way, so any clean-up must be applied to both together.
test_features = pd.concat([test_features_numeric, test_features_categorical], axis=1)

In [29]:
# Predict both targets on the test matrix, H1N1 first and seasonal second, keeping
# the second predict_proba column (presumably class 1, i.e. vaccinated — the labels
# are 0/1 integers)
h1n1_vaccine, seasonal_vaccine = (
    fitted_model.predict_proba(test_features)[:, 1]
    for fitted_model in (model_h1n1, model_seasonal)
)

In [30]:
# Attach the two probability columns next to the respondent IDs and preview the result
submission_df = submission_df.assign(
    h1n1_vaccine=h1n1_vaccine,
    seasonal_vaccine=seasonal_vaccine,
)
submission_df.head()

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707,0.19226,0.312318
1,26708,0.044868,0.042786
2,26709,0.245705,0.769937
3,26710,0.62585,0.897557
4,26711,0.332006,0.5494


In [31]:
# Write the submission to the working directory without the DataFrame index column
submission_df.to_csv('submission.csv', index=False)