In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [None]:
# Read the four CSV inputs from the working directory; the unpacking order
# on the left mirrors the order of the file names on the right.
train_features, train_labels, test_features, submission_format = (
    pd.read_csv(csv_name)
    for csv_name in (
        'training_set_features.csv',
        'training_set_labels.csv',
        'test_set_features.csv',
        'submission_format.csv',
    )
)

In [None]:
# The train/validation split later pairs feature rows with label rows purely
# by position, so verify that both files list the same respondents in the same
# order before the id column (the only join key) is discarded.
if 'respondent_id' in train_features.columns and 'respondent_id' in train_labels.columns:
    if train_features['respondent_id'].tolist() != train_labels['respondent_id'].tolist():
        raise ValueError(
            'training_set_features.csv and training_set_labels.csv are not '
            'row-aligned on respondent_id'
        )

# Keep the test ids for the submission file. The membership checks and
# errors='ignore' make this cell safe to re-run after the id column has
# already been removed (the original raised KeyError on a second run).
if 'respondent_id' in test_features.columns:
    test_features_ids = test_features['respondent_id']
elif 'test_features_ids' not in globals():
    # Fresh kernel and no id column at all: fail here rather than at submission time.
    raise KeyError('respondent_id')

train_features = train_features.drop(columns=['respondent_id'], errors='ignore')
test_features = test_features.drop(columns=['respondent_id'], errors='ignore')

In [None]:
# Columns handled as categorical (sent to the one-hot encoder downstream).
categorical_features = [
    'age_group',
    'education',
    'race',
    'income_poverty',
    'marital_status',
    'rent_or_own',
    'employment_status',
    'hhs_geo_region',
    'census_msa',
    'employment_industry',
    'employment_occupation',
]

# Every other column of train_features is treated as numerical, in the
# order in which it appears in the frame.
numerical_features = [
    column_name
    for column_name in train_features.columns
    if column_name not in categorical_features
]


In [None]:
# Numeric branch: fill gaps with the column median, then standardise.
# StandardScaler disregards NaN while fitting but leaves it in place on
# transform, so without the imputer any missing numeric value would be handed
# to the classifier unchanged.
# NOTE(review): assumes the feature files may contain missing values; the
# imputer is a no-op when they do not.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical branch is unchanged: one-hot encode, and encode categories that
# were not seen during fit as all-zero rows instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])


In [None]:
# Full estimator: shared preprocessing followed by a multi-output wrapper that
# fits one random forest (100 trees, fixed seed) per target column.
model = Pipeline([
    ('preprocessor', preprocessor),
    (
        'classifier',
        MultiOutputClassifier(
            estimator=RandomForestClassifier(n_estimators=100, random_state=42)
        ),
    ),
])

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs. Only the two target columns are taken from the
# labels frame.
X_train, X_val, y_train, y_val = train_test_split(
    train_features,
    train_labels[['xyz_vaccine', 'seasonal_vaccine']],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit the preprocessing step and the per-target classifiers on the training
# split only; the validation rows stay unseen.
model.fit(X=X_train, y=y_train)

In [None]:
# Class probabilities for the held-out rows. The result is indexed per target
# first (position 0 and 1), then as a 2-D array whose column 1 is read below
# as the positive-class probability.
y_val_pred_prob = model.predict_proba(X=X_val)


In [None]:
# Score each target separately: pair every label column with the matching
# entry of the prediction list and use column 1 as the score.
roc_auc_xyz, roc_auc_seasonal = (
    roc_auc_score(y_val[target_name], target_probs[:, 1])
    for target_name, target_probs in zip(
        ('xyz_vaccine', 'seasonal_vaccine'), y_val_pred_prob
    )
)

# Headline metric: unweighted mean of the two per-target AUCs.
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

In [None]:
# Report the averaged validation metric.
validation_report = f'Mean ROC AUC on validation set: {mean_roc_auc}'
print(validation_report)


In [None]:
# Class probabilities for the test rows, using the model as fitted on the
# training split; same per-target layout as the validation predictions.
test_pred_prob = model.predict_proba(X=test_features)

In [None]:
# Start from the saved test ids, then attach column 1 of each target's
# probability array as that target's prediction.
submission = pd.DataFrame({'respondent_id': test_features_ids})
submission['xyz_vaccine'] = test_pred_prob[0][:, 1]
submission['seasonal_vaccine'] = test_pred_prob[1][:, 1]

# Write without the pandas index so the file holds only the three columns.
submission.to_csv('submission.csv', index=False)