In [118]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Load data

In [119]:
label = pd.read_csv('../data/training_set_labels.csv')
label.head()

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,0,0,0
1,1,0,1
2,2,0,0
3,3,0,1
4,4,0,0


In [120]:
train = pd.read_csv('../data/training_set_features.csv')
train['h1n1_target'] = label['h1n1_vaccine']
train['seasonal_target'] = label['seasonal_vaccine']
train.head()

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation,h1n1_target,seasonal_target
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,,0,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe,0,1
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo,0,0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,,0,1
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb,0,0


In [121]:
test = pd.read_csv('../data/test_set_features.csv')
test.head()

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,26707,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"> $75,000",Not Married,Rent,Employed,mlyzmhmf,"MSA, Not Principle City",1.0,0.0,atmlpfrs,hfxkjkmi
1,26708,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,Non-MSA,3.0,0.0,atmlpfrs,xqwwgdyp
2,26709,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,"> $75,000",Married,Own,Employed,lrircsnp,Non-MSA,1.0,0.0,nduyfdeo,pvmttkik
3,26710,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Married,Own,Not in Labor Force,lrircsnp,"MSA, Not Principle City",1.0,0.0,,
4,26711,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,lzgpxyit,Non-MSA,0.0,1.0,fcxhlnwr,mxkfnird


In [122]:
submission_df = pd.DataFrame(test['respondent_id'])
submission_df.head()

Unnamed: 0,respondent_id
0,26707
1,26708
2,26709
3,26710
4,26711


## Data preprocessing

In [123]:
# Eliminar respondent_id
train = train.drop(['respondent_id'], axis=1)
test = test.drop(['respondent_id'], axis=1)

In [124]:
# One hot encoding in categorical columns in 0 and 1
train = pd.get_dummies(train)
test = pd.get_dummies(test)

# Transformar True y False en 1 y 0
train = train * 1
test = test * 1

In [125]:
train.isna().sum()

h1n1_concern                       92
h1n1_knowledge                    116
behavioral_antiviral_meds          71
behavioral_avoidance              208
behavioral_face_mask               19
                                 ... 
employment_occupation_vlluhbov      0
employment_occupation_xgwztkwe      0
employment_occupation_xqwwgdyp      0
employment_occupation_xtkaffoo      0
employment_occupation_xzmlyyjv      0
Length: 107, dtype: int64

In [126]:
test.isna().sum()

h1n1_concern                       85
h1n1_knowledge                    122
behavioral_antiviral_meds          79
behavioral_avoidance              213
behavioral_face_mask               19
                                 ... 
employment_occupation_vlluhbov      0
employment_occupation_xgwztkwe      0
employment_occupation_xqwwgdyp      0
employment_occupation_xtkaffoo      0
employment_occupation_xzmlyyjv      0
Length: 105, dtype: int64

In [127]:
# # Eliminar columnas con valores nulos
# train = train.dropna()
# test = test.dropna()

# Imputar valores nulos con la media para las columnas numéricas
train = train.fillna(train.mean())
test = test.fillna(test.mean())

In [128]:
X_train = train.drop(['h1n1_target', 'seasonal_target'], axis=1)
y_train = train[['h1n1_target', 'seasonal_target']]
X_test = test

In [129]:
print("El tamaño de X_train es: ", X_train.shape)   
print("El tamaño de y_train es: ", y_train.shape)
print("El tamaño de X_test es: ", X_test.shape)

El tamaño de X_train es:  (26707, 105)
El tamaño de y_train es:  (26707, 2)
El tamaño de X_test es:  (26708, 105)


## Neural Network

In [130]:
import tensorflow as tf

In [131]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=[105]),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(2, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


### H1N1 Vaccine

In [132]:
X_train = np.array(X_train)
y_train_h1n1 = np.array(y_train['h1n1_target'])

In [133]:
X_train.shape

(26707, 105)

In [134]:
y_train_h1n1.shape

(26707,)

In [None]:
from sklearn.model_selection import train_test_split

# Dividir X_train y y_train_seasonal en datos de entrenamiento y validación
X_train_split, X_val_split, y_train_h1n1_split, y_val_h1n1_split = train_test_split(X_train, y_train_h1n1, test_size=0.2, random_state=42)

model.fit(X_train_split, y_train_h1n1_split,
          validation_data=(X_val_split, y_val_h1n1_split),
          epochs=10)

Epoch 1/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8119 - loss: 0.4458 - val_accuracy: 0.8306 - val_loss: 0.3948
Epoch 2/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8300 - loss: 0.3920 - val_accuracy: 0.8287 - val_loss: 0.3954
Epoch 3/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8356 - loss: 0.3816 - val_accuracy: 0.8358 - val_loss: 0.3880
Epoch 4/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8405 - loss: 0.3701 - val_accuracy: 0.8345 - val_loss: 0.3862
Epoch 5/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8432 - loss: 0.3720 - val_accuracy: 0.8366 - val_loss: 0.3837
Epoch 6/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8396 - loss: 0.3689 - val_accuracy: 0.8399 - val_loss: 0.3896
Epoch 7/10
[1m668/668[0m 

<keras.src.callbacks.history.History at 0x7fc7b1eb3380>

In [136]:
# Predict
predictions = model.predict(X_test)

[1m835/835[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 525us/step


In [137]:
predictions.shape

(26708, 2)

In [138]:
h1n1_df = pd.DataFrame(predictions, columns=['h1n1_vaccine', 'not_h1n1_vaccine'])
submission_df['h1n1_vaccine'] = h1n1_df['h1n1_vaccine']
submission_df.head()

Unnamed: 0,respondent_id,h1n1_vaccine
0,26707,0.815577
1,26708,0.977051
2,26709,0.415741
3,26710,0.239104
4,26711,0.815838


### Seasonal Vaccine

In [139]:
X_train = np.array(X_train)
y_train_seasonal = np.array(y_train['seasonal_target'])

In [140]:
X_train.shape

(26707, 105)

In [141]:
y_train_seasonal.shape

(26707,)

In [None]:
from sklearn.model_selection import train_test_split

# Dividir X_train y y_train_seasonal en datos de entrenamiento y validación
X_train_split, X_val_split, y_train_seasonal_split, y_val_seasonal_split = train_test_split(X_train, y_train_seasonal, test_size=0.2, random_state=42)

model.fit(X_train_split, y_train_seasonal_split,
          validation_data=(X_val_split, y_val_seasonal_split),
          epochs=10)

Epoch 1/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7432 - loss: 0.5384 - val_accuracy: 0.7746 - val_loss: 0.4805
Epoch 2/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7791 - loss: 0.4733 - val_accuracy: 0.7800 - val_loss: 0.4758
Epoch 3/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7804 - loss: 0.4641 - val_accuracy: 0.7802 - val_loss: 0.4732
Epoch 4/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7896 - loss: 0.4503 - val_accuracy: 0.7782 - val_loss: 0.4816
Epoch 5/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7941 - loss: 0.4438 - val_accuracy: 0.7806 - val_loss: 0.4856
Epoch 6/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8030 - loss: 0.4284 - val_accuracy: 0.7748 - val_loss: 0.4937
Epoch 7/10
[1m668/668[0m 

<keras.src.callbacks.history.History at 0x7fc7b1d85100>

In [None]:
predictions = model.predict(X_test)

[1m835/835[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 463us/step


In [144]:
predictions.shape

(26708, 2)

In [145]:
seasonal_df = pd.DataFrame(predictions, columns=['seasonal_vaccine', 'not_seasonal_vaccine'])
submission_df['seasonal_vaccine'] = seasonal_df['seasonal_vaccine']
submission_df.head()

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707,0.815577,0.98555
1,26708,0.977051,0.994756
2,26709,0.415741,0.298606
3,26710,0.239104,0.232079
4,26711,0.815838,0.192207


### Save in csv

In [146]:
submission_df.to_csv('submission.csv', index=False)