In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Load data

In [None]:
# Read the training labels and preview the first rows.
label = pd.read_csv(filepath_or_buffer='../data/training_set_labels.csv')
label.head(n=5)

In [None]:
# Read the training features and attach both target columns taken from `label`.
# The attachment is index-aligned, so it presumably relies on both CSV files
# listing respondents in the same row order -- TODO confirm.
train = pd.read_csv(filepath_or_buffer='../data/training_set_features.csv').assign(
    h1n1_target=label['h1n1_vaccine'],
    seasonal_target=label['seasonal_vaccine'],
)
train.head(n=5)

In [None]:
# Read the test features (no targets are attached here) and preview them.
test = pd.read_csv(filepath_or_buffer='../data/test_set_features.csv')
test.head(n=5)

In [None]:
# Start the submission frame from the test respondent ids; the prediction
# columns are added to it further down.
submission_df = test['respondent_id'].to_frame()
submission_df.head(n=5)

## Data preprocessing

In [None]:
# Remove the identifier so it is not used as a feature.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# idempotent: running it a second time no longer raises KeyError.
train = train.drop(columns=['respondent_id'], errors='ignore')
test = test.drop(columns=['respondent_id'], errors='ignore')

In [None]:
# One-hot encode the categorical columns of both frames.
train = pd.get_dummies(train)
test = pd.get_dummies(test)

# Turn True/False into 1/0 (numeric columns are unchanged by the multiplication).
train = train * 1
test = test * 1

# The two frames are encoded independently, so a category that appears in only
# one of them would leave the column sets out of sync. Align the test frame to
# the training feature columns: dummy columns missing from test become all-zero
# and columns the training data never produced are dropped.
# NOTE(review): fill_value=0 is only meaningful for dummy columns; this assumes
# both files share the same raw (pre-encoding) columns -- confirm.
feature_columns = [
    col for col in train.columns
    if col not in ('h1n1_target', 'seasonal_target')
]
test = test.reindex(columns=feature_columns, fill_value=0)

In [None]:
# Rank the features with a Random Forest and keep the most important ones.
from sklearn.ensemble import RandomForestClassifier

# Feature matrix used for the ranking: every column except the two targets.
rf_features = train.drop(columns=['h1n1_target', 'seasonal_target'])

# Missing values are only imputed in a later cell, so mean-impute a temporary
# copy for this fit. Random Forest implementations without native NaN support
# would otherwise raise here.
rf_features_filled = rf_features.fillna(rf_features.mean())

# A fixed seed keeps the selected feature set (and therefore the input width
# of the network built later) stable across runs.
rf = RandomForestClassifier(random_state=42)

# Importance is measured against the H1N1 target only.
rf.fit(rf_features_filled, train['h1n1_target'])

# Per-feature importance, in the same order as rf_features.columns.
importances = rf.feature_importances_

# Table of features and importances, most important first.
feature_importances = (
    pd.DataFrame({'feature': rf_features.columns, 'importance': importances})
    .sort_values('importance', ascending=False)
    .reset_index(drop=True)
)

# Keep only the features whose importance exceeds the threshold.
threshold = 0.02
selected_features = feature_importances[feature_importances['importance'] > threshold]

In [None]:
# Restrict both frames to the selected features; the training frame also keeps
# its two target columns.
selected_columns = selected_features['feature'].tolist()
train = train[selected_columns + ['h1n1_target', 'seasonal_target']]
test = test[selected_columns]

In [None]:
# Count the missing values in each column of the training frame.
train.isna().sum()

In [None]:
# Count the missing values in each column of the test frame.
test.isna().sum()

In [None]:
# Fill missing values with column means.
# Both frames are filled with the TRAINING means, so the test set receives
# exactly the same preprocessing the model is trained on rather than statistics
# computed from the test data itself. fillna with a Series matches on column
# label, so the extra target entries in train_means are simply ignored for test.
train_means = train.mean()
train = train.fillna(train_means)
test = test.fillna(train_means)

In [None]:
# Separate the training frame into features and the two targets; the test
# frame is used as-is.
target_names = ['h1n1_target', 'seasonal_target']
X_train = train.drop(columns=target_names)
y_train = train[target_names]
X_test = test

In [None]:
# Print the shape of each dataset, one line per dataset.
shape_report = [
    ("El tamaño de X_train es: ", X_train),
    ("El tamaño de y_train es: ", y_train),
    ("El tamaño de X_test es: ", X_test),
]
for message, dataset in shape_report:
    print(message, dataset.shape)

## Neural Network

In [None]:
import tensorflow as tf

In [None]:
# Seed TensorFlow so weight initialisation is repeatable from run to run
# (full determinism additionally depends on the ops/hardware in use).
tf.random.set_seed(42)

# Fully connected classifier: four ReLU hidden layers followed by a 2-unit
# softmax, so each prediction row holds one probability per class (0 and 1).
# The input width is declared with an explicit Input object, which is the
# form Keras recommends over passing input_shape= to the first layer.
model = tf.keras.models.Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(2, activation='softmax')
])

# sparse_categorical_crossentropy takes integer class labels and treats output
# unit k as the probability of class k.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

### H1N1 Vaccine

In [None]:
# Convert the features and the H1N1 target column to NumPy arrays for Keras.
h1n1_labels = y_train['h1n1_target']
X_train = np.array(X_train)
y_train_h1n1 = np.array(h1n1_labels)

In [None]:
# Show the shape of the feature array.
X_train.shape

In [None]:
# Show the shape of the H1N1 label array.
y_train_h1n1.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the H1N1 training data for validation (fixed seed so the
# split is the same on every run).
X_train_split, X_val_split, y_train_h1n1_split, y_val_h1n1_split = train_test_split(
    X_train,
    y_train_h1n1,
    test_size=0.2,
    random_state=42,
)

# Train for 15 epochs, reporting validation metrics after each one.
model.fit(
    x=X_train_split,
    y=y_train_h1n1_split,
    validation_data=(X_val_split, y_val_h1n1_split),
    epochs=15,
)

In [None]:
# Run the network on the test features; `predictions` holds the raw model
# outputs (one row per test row).
predictions = model.predict(X_test)

In [None]:
# Show the shape of the prediction array.
predictions.shape

In [None]:
# The network ends in a 2-unit softmax trained with integer labels, so output
# column k is the probability of class k: column 0 is P(target == 0) and
# column 1 is P(target == 1). The vaccinated probability is therefore the
# SECOND column (assuming label 1 means "received the vaccine").
h1n1_df = pd.DataFrame(predictions, columns=['not_h1n1_vaccine', 'h1n1_vaccine'])
submission_df['h1n1_vaccine'] = h1n1_df['h1n1_vaccine']
submission_df.head()

### Seasonal Vaccine

In [None]:
# Convert the features and the seasonal target column to NumPy arrays for Keras.
seasonal_labels = y_train['seasonal_target']
X_train = np.array(X_train)
y_train_seasonal = np.array(seasonal_labels)

In [None]:
# Show the shape of the feature array.
X_train.shape

In [None]:
# Show the shape of the seasonal label array.
y_train_seasonal.shape

In [None]:
from sklearn.model_selection import train_test_split

# `model` has already been trained on the H1N1 target at this point. Continuing
# to fit the same object would start the seasonal model from H1N1-specific
# weights, so rebuild it first: clone_model copies the architecture but creates
# new, freshly initialised layers, and the clone is compiled with the same
# settings as the original.
model = tf.keras.models.clone_model(model)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Hold out 20% of the seasonal training data for validation (fixed seed so the
# split is the same on every run).
X_train_split, X_val_split, y_train_seasonal_split, y_val_seasonal_split = train_test_split(
    X_train, y_train_seasonal, test_size=0.2, random_state=42
)

# Train for 15 epochs, reporting validation metrics after each one.
model.fit(X_train_split, y_train_seasonal_split,
          validation_data=(X_val_split, y_val_seasonal_split),
          epochs=15)

In [None]:
# Run the network on the test features; `predictions` is overwritten with the
# raw model outputs for the seasonal target.
predictions = model.predict(X_test)

In [None]:
# Show the shape of the prediction array.
predictions.shape

In [None]:
# The network ends in a 2-unit softmax trained with integer labels, so output
# column k is the probability of class k: column 0 is P(target == 0) and
# column 1 is P(target == 1). The vaccinated probability is therefore the
# SECOND column (assuming label 1 means "received the vaccine").
seasonal_df = pd.DataFrame(predictions, columns=['not_seasonal_vaccine', 'seasonal_vaccine'])
submission_df['seasonal_vaccine'] = seasonal_df['seasonal_vaccine']
submission_df.head()

### Save to CSV

In [None]:
# Write the submission frame to disk without the index column.
submission_df.to_csv(path_or_buf='submission.csv', index=False)