In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

Load Data

In [None]:
# Read the three CSV files from the current working directory, in the order
# training features, training labels, test features.
train_features, train_labels, test_features = (
    pd.read_csv(csv_name)
    for csv_name in (
        'training_set_features.csv',
        'training_set_labels.csv',
        'test_set_features.csv',
    )
)

Merge Data

In [None]:
# Join the training features with the training labels on the shared
# 'respondent_id' key. The default join is inner, so only ids present in both
# tables are kept.
data = pd.merge(
    train_features,
    train_labels,
    on='respondent_id',
)

Identify categorical and numerical features

In [None]:
# Columns handled by the categorical (one-hot) branch, listed explicitly.
categorical_features = [
    'age_group',
    'education',
    'race',
    'sex',
    'income_poverty',
    'marital_status',
    'rent_or_own',
    'employment_status',
    'hhs_geo_region',
    'census_msa',
    'employment_industry',
    'employment_occupation',
]

# Every remaining column of the training features, apart from the
# 'respondent_id' key, is treated as numerical. The original column order is
# preserved.
excluded_from_numerical = set(categorical_features) | {'respondent_id'}
numerical_features = [
    name
    for name in train_features.columns
    if name not in excluded_from_numerical
]

Preprocessing the Data

In [None]:
# Numerical branch: fill missing entries with the column median, then
# standardize. StandardScaler disregards NaNs while fitting but leaves them in
# its output, and RandomForestClassifier only accepts NaN inputs from
# scikit-learn 1.4 onward, so without the imputer a single blank numerical
# cell makes the downstream fit fail on earlier versions.
# NOTE(review): the CSV contents are not visible from this notebook; this
# assumes the numerical columns may contain blanks -- confirm against the data.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: one-hot encode. handle_unknown='ignore' makes a category
# that was not seen during fit encode as all zeros instead of raising.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

In [None]:
# Send each group of columns through its own transformer and concatenate the
# results. Columns that appear in neither list are dropped (ColumnTransformer's
# default remainder='drop').
numeric_branch = ('num', numerical_transformer, numerical_features)
categorical_branch = ('cat', categorical_transformer, categorical_features)
preprocessor = ColumnTransformer(
    transformers=[numeric_branch, categorical_branch],
)

In [None]:
# Split the merged table into the two label columns (y) and everything else
# except the 'respondent_id' key (X).
target_columns = ['xyz_vaccine', 'seasonal_vaccine']
y = data[target_columns]
X = data.drop(columns=['respondent_id', *target_columns])

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Base estimator: a random forest with default hyperparameters and a fixed
# seed so repeated runs build the same trees.
model = RandomForestClassifier(
    random_state=42,
)

In [None]:
# Chain preprocessing and classification. MultiOutputClassifier fits one copy
# of the base model per target column, which lets a single pipeline predict
# both labels.
multi_target_classifier = MultiOutputClassifier(model)
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', multi_target_classifier),
    ],
)

In [None]:
# Fit the preprocessing steps and the per-target classifiers on the training
# split only, so the validation rows stay unseen until evaluation.
pipeline.fit(X_train, y_train)

Model Evaluation


In [None]:
# Class-probability predictions for the validation split. The result is
# indexed per target below: element 0 is used for 'xyz_vaccine' and element 1
# for 'seasonal_vaccine'.
y_valid_pred_proba = pipeline.predict_proba(
    X_valid,
)

In [None]:
# Take column 1 of each target's probability array as the score to rank by.
# NOTE(review): column 1 is the second entry of each fitted estimator's
# classes_; presumably that is label 1 -- confirm the labels are 0/1.
xyz_scores = y_valid_pred_proba[0][:, 1]
seasonal_scores = y_valid_pred_proba[1][:, 1]

# ROC AUC per target, then the unweighted mean of the two.
roc_auc_xyz = roc_auc_score(y_valid['xyz_vaccine'], xyz_scores)
roc_auc_seasonal = roc_auc_score(y_valid['seasonal_vaccine'], seasonal_scores)
roc_auc_mean = (roc_auc_xyz + roc_auc_seasonal) / 2

print(
    f'ROC AUC Score for xyz_vaccine: {roc_auc_xyz}',
    f'ROC AUC Score for seasonal_vaccine: {roc_auc_seasonal}',
    f'Mean ROC AUC Score: {roc_auc_mean}',
    sep='\n',
)