In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Load data

In [None]:
# Training labels; the `h1n1_vaccine` and `seasonal_vaccine` columns of this
# frame are read later when the targets are attached to the feature table.
labels_csv = '../data/training_set_labels.csv'
label = pd.read_csv(labels_csv)
label.head()

In [None]:
# Load the training features and attach both label columns as targets.
# The label Series are aligned to the feature rows by index label, not by an
# id column, so both files are expected to share the same row order.
train = pd.read_csv('../data/training_set_features.csv').assign(
    h1n1_target=label['h1n1_vaccine'],
    seasonal_target=label['seasonal_vaccine'],
)
train.head()

In [None]:
# Feature rows without labels; these are the rows scored for the submission.
test_csv = '../data/test_set_features.csv'
test = pd.read_csv(test_csv)
test.head()

In [None]:
# Keep the respondent ids in their own single-column frame; the prediction
# columns are added to it at the end of the notebook.
submission_df = test['respondent_id'].to_frame()
submission_df.head()

## Data preprocessing

In [None]:
# The ids were already copied into `submission_df`; remove the column so it is
# not fed to the models as a feature.
# Reassigning (instead of dropping in place) together with errors='ignore'
# keeps this cell safe to re-run once the column is already gone.
train = train.drop(columns=['respondent_id'], errors='ignore')
test = test.drop(columns=['respondent_id'], errors='ignore')

In [None]:
# One-hot encode the categorical columns of both tables.
train = pd.get_dummies(train)
test = pd.get_dummies(test)

# Multiplying by 1 turns the True/False indicator columns into 1/0.
train = train * 1
test = test * 1

# `train` and `test` are encoded independently, so a category present in only
# one of them would yield different dummy columns. Align `test` to the training
# feature columns (everything except the two targets): indicator columns that
# are missing in `test` are added as all zeros, and indicator columns that the
# models never see during training are dropped.
target_columns = ['h1n1_target', 'seasonal_target']
feature_columns = [col for col in train.columns if col not in target_columns]
test = test.reindex(columns=feature_columns, fill_value=0)

In [None]:
# Rank the candidate features for the H1N1 target with a Random Forest and
# keep the ones whose importance exceeds a fixed threshold.
# NOTE(review): this fit runs before the imputation cell, so any missing values
# still present in `train` are passed to the forest - confirm that the
# installed scikit-learn version accepts NaN input.
from sklearn.ensemble import RandomForestClassifier

# Every column except the two targets is a candidate predictor
h1n1_candidates = train.drop(columns=['h1n1_target', 'seasonal_target'])

# Build and train the ranking model
rf_h1n1 = RandomForestClassifier(n_estimators=1000, random_state=42)
rf_h1n1.fit(h1n1_candidates, train['h1n1_target'])

# Per-feature importance reported by the fitted forest
importances = rf_h1n1.feature_importances_

# Table of (feature, importance), most important first
feature_importances = (
    pd.DataFrame({'feature': h1n1_candidates.columns, 'importance': importances})
    .sort_values('importance', ascending=False)
    .reset_index(drop=True)
)

# Keep only the features strictly above the importance threshold
threshold = 0.015
selected_features = feature_importances.loc[feature_importances['importance'] > threshold]

In [None]:
# Restrict the training and test tables to the currently selected features;
# the training table also keeps the H1N1 target column.
h1n1_columns = selected_features['feature'].tolist()
train_h1n1 = train[h1n1_columns + ['h1n1_target']]
test_h1n1 = test[h1n1_columns]

In [None]:
# Rank the candidate features for the seasonal target with a Random Forest and
# keep the ones whose importance exceeds a fixed threshold.
# NOTE(review): this fit runs before the imputation cell, so any missing values
# still present in `train` are passed to the forest - confirm that the
# installed scikit-learn version accepts NaN input.
from sklearn.ensemble import RandomForestClassifier

# Every column except the two targets is a candidate predictor
seasonal_candidates = train.drop(columns=['h1n1_target', 'seasonal_target'])

# Build and train the ranking model
rf_seasonal = RandomForestClassifier(n_estimators=1000, random_state=42)
rf_seasonal.fit(seasonal_candidates, train['seasonal_target'])

# Per-feature importance reported by the fitted forest
importances = rf_seasonal.feature_importances_

# Table of (feature, importance), most important first.
# This overwrites the table built for the H1N1 target.
feature_importances = (
    pd.DataFrame({'feature': seasonal_candidates.columns, 'importance': importances})
    .sort_values('importance', ascending=False)
    .reset_index(drop=True)
)

# Keep only the features strictly above the importance threshold
threshold = 0.015
selected_features = feature_importances.loc[feature_importances['importance'] > threshold]

In [None]:
# Restrict the training and test tables to the currently selected features;
# the training table also keeps the seasonal target column.
seasonal_columns = selected_features['feature'].tolist()
train_seasonal = train[seasonal_columns + ['seasonal_target']]
test_seasonal = test[seasonal_columns]

In [None]:
# Impute missing values with each column's mode (most frequent value).
#
# `DataFrame.mode()` returns a DataFrame (one row per tied mode), and
# `fillna(<DataFrame>)` aligns on the row index as well as on the columns, so
# passing it directly only fills the rows whose index labels appear in the
# mode frame. Taking its first row gives a Series keyed by column name, which
# fills every row of the matching column.
h1n1_fill_values = train_h1n1.mode().iloc[0]
seasonal_fill_values = train_seasonal.mode().iloc[0]

train_h1n1 = train_h1n1.fillna(h1n1_fill_values)
train_seasonal = train_seasonal.fillna(seasonal_fill_values)

# The test tables reuse the training modes so train and test are imputed with
# the same values; keys that are not columns of the test table (the target)
# are simply ignored by fillna.
test_h1n1 = test_h1n1.fillna(h1n1_fill_values)
test_seasonal = test_seasonal.fillna(seasonal_fill_values)

In [None]:
# Separate predictors from the target for each vaccine.
# The targets are selected with a list, so they stay single-column DataFrames.
X_train_h1n1 = train_h1n1.drop(columns=['h1n1_target'])
y_train_h1n1 = train_h1n1[['h1n1_target']]

X_train_seasonal = train_seasonal.drop(columns=['seasonal_target'])
y_train_seasonal = train_seasonal[['seasonal_target']]

# The test tables were built from the selected feature columns only, so they
# are used as the prediction inputs unchanged.
X_test_h1n1, X_test_seasonal = test_h1n1, test_seasonal

In [None]:
# Report the shape of every modelling table, in the same order as before.
shape_report = {
    "X_train_h1n1": X_train_h1n1,
    "y_train_h1n1": y_train_h1n1,
    "X_test_h1n1": X_test_h1n1,
    "X_train_seasonal": X_train_seasonal,
    "y_train_seasonal": y_train_seasonal,
    "X_test_seasonal": X_test_seasonal,
}
for table_name, table in shape_report.items():
    print(f"El tamaño de {table_name} es: ", table.shape)

## Random Forest

### H1N1 Vaccine

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Hold out 20% of the labelled rows to evaluate the H1N1 model.
X_train_h1n1_split, X_test_h1n1_split, y_train_h1n1_split, y_test_h1n1_split = train_test_split(
    X_train_h1n1, y_train_h1n1, test_size=0.2, random_state=42
)

rf_h1n1 = RandomForestClassifier(n_estimators=1000, random_state=42)
# The target is a single-column DataFrame; flatten it to 1-D because
# scikit-learn expects y of shape (n_samples,) for a single-output classifier
# and otherwise emits a DataConversionWarning on every fit.
rf_h1n1.fit(X_train_h1n1_split, np.ravel(y_train_h1n1_split))

In [None]:
# Class probabilities on the held-out split; keep the second column, i.e. the
# probability of the class listed second in `rf_h1n1.classes_`.
h1n1_holdout_proba = rf_h1n1.predict_proba(X_test_h1n1_split)
y_test_pred_h1n1 = h1n1_holdout_proba[:, 1]

In [None]:
# ROC curve of the H1N1 model on the held-out split
from sklearn.metrics import roc_curve, roc_auc_score

fpr, tpr, thresholds = roc_curve(y_test_h1n1_split, y_test_pred_h1n1)
# Compute the score once so the title stays readable
auc_h1n1 = roc_auc_score(y_test_h1n1_split, y_test_pred_h1n1)

fig, ax = plt.subplots()
ax.plot(fpr, tpr)
# Dashed reference diagonal, drawn in axes coordinates so it always runs from
# the bottom-left to the top-right corner of the plot area
ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c=".3")
# Label the axes so the figure can be read on its own
ax.set_xlabel("False positive rate")
ax.set_ylabel("True positive rate")
ax.set_title(f"Roc curve AUC: {auc_h1n1}")
plt.show()

### Seasonal Vaccine

In [None]:
# Hold out 20% of the labelled rows to evaluate the seasonal model.
X_train_seasonal_split, X_test_seasonal_split, y_train_seasonal_split, y_test_seasonal_split = train_test_split(
    X_train_seasonal, y_train_seasonal, test_size=0.2, random_state=42
)

rf_seasonal = RandomForestClassifier(n_estimators=1000, random_state=42)
# The target is a single-column DataFrame; flatten it to 1-D because
# scikit-learn expects y of shape (n_samples,) for a single-output classifier
# and otherwise emits a DataConversionWarning on every fit.
rf_seasonal.fit(X_train_seasonal_split, np.ravel(y_train_seasonal_split))

In [None]:
# Class probabilities on the held-out split; keep the second column, i.e. the
# probability of the class listed second in `rf_seasonal.classes_`.
seasonal_holdout_proba = rf_seasonal.predict_proba(X_test_seasonal_split)
y_test_pred_seasonal = seasonal_holdout_proba[:, 1]

In [None]:
# ROC curve of the seasonal model on the held-out split
from sklearn.metrics import roc_curve, roc_auc_score

fpr, tpr, thresholds = roc_curve(y_test_seasonal_split, y_test_pred_seasonal)
# Compute the score once so the title stays readable
auc_seasonal = roc_auc_score(y_test_seasonal_split, y_test_pred_seasonal)

fig, ax = plt.subplots()
ax.plot(fpr, tpr)
# Dashed reference diagonal, drawn in axes coordinates so it always runs from
# the bottom-left to the top-right corner of the plot area
ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c=".3")
# Label the axes so the figure can be read on its own
ax.set_xlabel("False positive rate")
ax.set_ylabel("True positive rate")
ax.set_title(f"Roc curve AUC: {auc_seasonal}")
plt.show()

### Save predictions to CSV

In [None]:
# Score the submission rows with the two forests fitted in the sections above
# (the ones trained on the training portion of each split) and keep the
# second probability column of each.
h1n1_submission_proba = rf_h1n1.predict_proba(test_h1n1)
seasonal_submission_proba = rf_seasonal.predict_proba(test_seasonal)
y_h1n1_submission_pred = h1n1_submission_proba[:, 1]
y_seasonal_submission_pred = seasonal_submission_proba[:, 1]

# Attach both probability columns next to the respondent ids
submission_df = submission_df.assign(
    h1n1_vaccine=y_h1n1_submission_pred,
    seasonal_vaccine=y_seasonal_submission_pred,
)

In [None]:
# Write the submission table to disk without the DataFrame index column.
submission_csv = 'submission.csv'
submission_df.to_csv(submission_csv, index=False)