In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Load data

In [None]:
# Read the training-set feature table and preview its first rows.
train = pd.read_csv(filepath_or_buffer='../data/training_set_features.csv')
train.head(n=5)

In [None]:
# Read the training-set labels (the 'h1n1_vaccine' and 'seasonal_vaccine'
# columns are used below) and preview the first rows.
label = pd.read_csv(filepath_or_buffer='../data/training_set_labels.csv')
label.head(n=5)

In [None]:
# Read the feature table used for the final submission and preview it.
test = pd.read_csv(filepath_or_buffer='../data/test_set_features.csv')
test.head(n=5)

In [None]:
# Start the submission table from the test-set respondent ids; the predicted
# probabilities are added as extra columns at the end of the notebook.
submission_df = test['respondent_id'].to_frame()
submission_df.head(n=5)

In [None]:
# Attach both label columns to the feature frame. Series are aligned on the
# row index, so this relies on `train` and `label` sharing the same index.
train = train.assign(
    h1n1_target=label['h1n1_vaccine'],
    seasonal_target=label['seasonal_vaccine'],
)
train.head(n=5)

## Data preprocessing

In [None]:
# Remove the identifier column so it is not used as a model feature.
# Non-inplace with errors='ignore' keeps the cell idempotent: re-running it
# after the column is already gone no longer raises KeyError.
train = train.drop(columns=['respondent_id'], errors='ignore')
test = test.drop(columns=['respondent_id'], errors='ignore')

In [None]:
# One-hot encode the categorical columns. dtype=int makes the dummy columns
# 0/1 integers directly instead of True/False.
train = pd.get_dummies(train, dtype=int)
test = pd.get_dummies(test, dtype=int)

# The two frames are encoded independently, so a category that appears in only
# one of them would produce mismatched columns. Align the test frame to the
# training feature columns (everything except the two targets): dummies that
# are absent in test are added as all-zero columns, dummies unknown to train
# are dropped, and the column order follows train.
feature_columns = train.columns.drop(['h1n1_target', 'seasonal_target'], errors='ignore')
test = test.reindex(columns=feature_columns, fill_value=0)

In [None]:
# Rank the features with a Random Forest to choose the inputs of the H1N1 model
from sklearn.ensemble import RandomForestClassifier

# Feature matrix without the two label columns
X_importance = train.drop(columns=['h1n1_target', 'seasonal_target'])

# This cell runs before the notebook imputes missing values, so fill a local
# copy with each column's most frequent value (same strategy as the later
# imputation cell) instead of relying on the estimator accepting NaN.
X_importance = X_importance.fillna(X_importance.mode().iloc[0])

# Create the Random Forest classifier
rf_h1n1 = RandomForestClassifier(n_estimators=1000, random_state=42)

# Fit against the H1N1 target: the selected features are only consumed by the
# H1N1 model further down, so they must be ranked against that label.
rf_h1n1.fit(X_importance, train['h1n1_target'])

# Per-feature importances, in the column order of X_importance
importances = rf_h1n1.feature_importances_

# Table of features and their importance, most important first
feature_importances = pd.DataFrame({'feature': X_importance.columns, 'importance': importances})
feature_importances = feature_importances.sort_values('importance', ascending=False).reset_index(drop=True)

# Keep the names of the features whose importance exceeds the threshold.
# A plain list of column names is what `train[selected_features]` expects.
threshold = 0.015
selected_features = feature_importances.loc[
    feature_importances['importance'] > threshold, 'feature'
].tolist()

In [None]:
# NOTE: this whole cell is commented out and does not run.
# # Remove outliers
# Q1 = train.quantile(0.15)
# Q3 = train.quantile(0.85)
# IQR = Q3 - Q1

# train = train[~((train < (Q1 - 1.5 * IQR)) | (train > (Q3 + 1.5 * IQR))).any(axis=1)]

# # Impute missing values
# train = train.fillna(train.mean())

# # Drop from y the rows that were removed from X
# label = label[label.index.isin(train.index)]

In [None]:
# Count of missing values per column
train.isnull().sum()

In [None]:
# Impute missing values with each column's most frequent training value.
# `mode()` returns one row per tied mode; row 0 holds the first mode of every column.
column_modes = train.mode().iloc[0]
train = train.fillna(column_modes)

# Apply the same training-derived values to the test set, which is otherwise
# never imputed before it is passed to `predict_proba`. `fillna` matches the
# Series by column name, so entries for columns absent from `test` are ignored.
test = test.fillna(column_modes)

In [None]:
# Show the correlation matrix of every column in `train`
corr = train.corr()
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
plt.show()

In [None]:
# Feature importance for the H1N1 label
from sklearn.ensemble import RandomForestClassifier

# `train` still carries both label columns; exclude them so the forest cannot
# read the answer from its inputs (which would dominate the importances).
X_importance_h1n1 = train.drop(columns=['h1n1_target', 'seasonal_target'], errors='ignore')

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_importance_h1n1, label['h1n1_vaccine'])

importances = rf.feature_importances_
# Column positions sorted from most to least important
indices = np.argsort(importances)[::-1]

n_features = X_importance_h1n1.shape[1]
plt.figure(figsize=(20, 10))
plt.title('Feature Importances')
plt.bar(range(n_features), importances[indices], align='center')
plt.xticks(range(n_features), X_importance_h1n1.columns[indices], rotation=90)
plt.show()

In [None]:
# NOTE: disabled manual feature list; this cell does not run. If re-enabled it
# would overwrite the `selected_features` computed earlier, and the
# `train = train[selected_features]` line would also remove the
# 'h1n1_target' / 'seasonal_target' columns that later cells read from `train`.
# selected_features = [
#  'h1n1_concern',
#  'behavioral_avoidance',
#  'behavioral_wash_hands',
#  'behavioral_large_gatherings',
#  'doctor_recc_seasonal',
#  'child_under_6_months',
#  'health_worker',
#  'health_insurance',
#  'opinion_seas_sick_from_vacc',
#  'household_adults',
#  'household_children'
# ]

# train = train[selected_features]
# test = test[selected_features]

In [None]:
# Label vectors, taken straight from the labels file
y_h1n1_vaccine, y_seasonal_vaccine = label['h1n1_vaccine'], label['seasonal_vaccine']

## Random Forest

### H1N1 vaccine

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# `selected_features` may hold the importance table (a DataFrame with a
# 'feature' column) rather than plain column names. Indexing `train` with a
# DataFrame is not a column selection, so reduce it to a list of names first.
if isinstance(selected_features, pd.DataFrame):
    h1n1_feature_names = selected_features['feature'].tolist()
else:
    h1n1_feature_names = list(selected_features)

# Hold out 20% of the rows to evaluate the H1N1 model
X_train_h1n1, X_test_h1n1, y_train_h1n1, y_test_h1n1 = train_test_split(
    train[h1n1_feature_names], train['h1n1_target'], test_size=0.2, random_state=42
)

rf_h1n1 = RandomForestClassifier(n_estimators=1000, random_state=42)
rf_h1n1.fit(X_train_h1n1, y_train_h1n1)

In [None]:
# Probability of the second class (column 1 of predict_proba) on the held-out split
h1n1_class_proba = rf_h1n1.predict_proba(X_test_h1n1)
y_test_pred_h1n1 = h1n1_class_proba[:, 1]

In [None]:
# ROC curve of the H1N1 model on the held-out split
from sklearn.metrics import roc_curve, roc_auc_score

fpr, tpr, thresholds = roc_curve(y_test_h1n1, y_test_pred_h1n1)
auc_h1n1 = roc_auc_score(y_test_h1n1, y_test_pred_h1n1)

fig, ax = plt.subplots()
ax.plot(fpr, tpr)
# Reference diagonal, drawn in axes coordinates so it spans corner to corner
ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c=".3")
ax.set_title(f"Roc curve AUC: {auc_h1n1}")
plt.show()

### Seasonal vaccine

In [None]:
# `train` still contains both label columns. Leaving them in X would hand the
# model the answer ('seasonal_target') and make it expect columns that the
# test set does not have, so train on the feature columns only.
X_seasonal = train.drop(columns=['h1n1_target', 'seasonal_target'], errors='ignore')

# Hold out 20% of the rows to evaluate the seasonal model
X_train_seasonal, X_test_seasonal, y_train_seasonal, y_test_seasonal = train_test_split(
    X_seasonal, y_seasonal_vaccine, test_size=0.2, random_state=42
)

rf_seasonal = RandomForestClassifier(n_estimators=1000, random_state=42)
rf_seasonal.fit(X_train_seasonal, y_train_seasonal)

In [None]:
# Probability of the second class (column 1 of predict_proba) on the held-out split
seasonal_class_proba = rf_seasonal.predict_proba(X_test_seasonal)
y_test_pred_seasonal = seasonal_class_proba[:, 1]

In [None]:
# ROC curve of the seasonal model on the held-out split
from sklearn.metrics import roc_curve, roc_auc_score

fpr, tpr, thresholds = roc_curve(y_test_seasonal, y_test_pred_seasonal)
auc_seasonal = roc_auc_score(y_test_seasonal, y_test_pred_seasonal)

fig, ax = plt.subplots()
ax.plot(fpr, tpr)
# Reference diagonal, drawn in axes coordinates so it spans corner to corner
ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c=".3")
ax.set_title(f"Roc curve AUC: {auc_seasonal}")
plt.show()

## Convert submission dataset to csv

In [None]:
# Preview the submission table before the prediction columns are added
submission_df.head(n=5)

In [None]:
def _submission_features(model):
    """Return `test` restricted to, and ordered like, the columns `model` was fitted on.

    The two forests are not necessarily trained on the same columns (the H1N1
    model uses a selected subset), so each one needs its own view of `test`.

    Raises:
        ValueError: if the model was fitted on a label column, which cannot
            exist in the test set; filling it in would silently produce
            meaningless predictions.
    """
    expected = list(model.feature_names_in_)
    leaked = sorted({'h1n1_target', 'seasonal_target'}.intersection(expected))
    if leaked:
        raise ValueError(
            f"model was fitted on label columns {leaked}; refit it on feature columns only"
        )
    # Columns the model expects but `test` lacks are added as zeros (the value
    # of a one-hot dummy whose category does not occur in the test set).
    return test.reindex(columns=expected, fill_value=0)


# Probability of the second class (column 1 of predict_proba) for every test row
y_h1n1_submission_pred = rf_h1n1.predict_proba(_submission_features(rf_h1n1))[:, 1]
y_seasonal_submission_pred = rf_seasonal.predict_proba(_submission_features(rf_seasonal))[:, 1]

submission_df['h1n1_vaccine'] = y_h1n1_submission_pred
submission_df['seasonal_vaccine'] = y_seasonal_submission_pred

In [None]:
# Write the submission file without the row index
submission_df.to_csv(path_or_buf='submission.csv', index=False)