In [47]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score


## Load features and label data


In [48]:
# Read the training features and their labels from the working directory.
features = pd.read_csv('training_set_features.csv')
labels = pd.read_csv('training_set_labels.csv')

# Join the two tables on the respondent key (inner join, the merge default).
data = features.merge(labels, on='respondent_id')

# The two label columns form the multi-output target; everything else,
# apart from the respondent key, is used as a model input.
target_cols = ['xyz_vaccine', 'seasonal_vaccine']
y = data[target_cols]
X = data.drop(columns=['respondent_id'] + target_cols)






## Preprocessing

In [49]:



# Split the feature columns by dtype so each group gets a suitable transformer.
# The numeric group is selected explicitly (numbers and booleans) rather than
# as "everything that is not object": a non-numeric column stored under any
# other dtype (e.g. category or a dedicated string dtype) would otherwise be
# handed to StandardScaler, which cannot scale text values.
numeric_dtypes = ['number', 'bool']
numerical_cols = X.select_dtypes(include=numeric_dtypes).columns
categorical_cols = X.select_dtypes(exclude=numeric_dtypes).columns

# The two selections are complementary, so every feature column is routed to
# exactly one transformer.
preprocessor = ColumnTransformer(
    transformers=[
        # Standardise numeric features.
        ('num', StandardScaler(), numerical_cols),
        # One-hot encode the rest; handle_unknown='ignore' encodes categories
        # not seen during fit as all zeros instead of raising at predict time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

## Fit the model and predict on a validation split


In [50]:


# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# A seeded random forest wrapped so that both target columns are fit together.
forest = RandomForestClassifier(random_state=42)
multi_target_forest = MultiOutputClassifier(forest)

# Preprocessing and classifier are chained so the same transformations are
# applied at fit time and at prediction time.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', multi_target_forest),
])

model.fit(X_train, y_train)

# One probability array per target column, indexed in the order of y_train's
# columns when the scores are computed.
y_val_pred = model.predict_proba(X_val)

## Calculate ROC AUC score

In [51]:




# Score each target separately on the validation split, using the predicted
# probability of the positive class (column 1 of each per-target array).
val_targets = ['xyz_vaccine', 'seasonal_vaccine']
val_scores = [
    roc_auc_score(y_val[target], y_val_pred[position][:, 1])
    for position, target in enumerate(val_targets)
]
roc_auc_xyz, roc_auc_seasonal = val_scores

# The reported metric is the unweighted mean of the two per-target AUCs.
mean_roc_auc = np.mean(val_scores)

print(f"Mean ROC AUC: {mean_roc_auc}")




Mean ROC AUC: 0.8580785423031334


## Load the test features and predict vaccine probabilities

In [52]:
# Read the test features; the respondent key is kept in `test` for the
# submission but removed from the model inputs.
test = pd.read_csv('test_set_features.csv')
X_test = test.drop(columns=['respondent_id'])

# Run the fitted pipeline on the test features. The result holds one
# probability array per target; column 1 is the positive-class probability.
y_test_pred = model.predict_proba(X_test)
xyz_proba = y_test_pred[0]
seasonal_proba = y_test_pred[1]
xyz_vaccine_pred = xyz_proba[:, 1]
seasonal_vaccine_pred = seasonal_proba[:, 1]



## Prepare the submission file


In [53]:
# Build the submission table: the respondent key followed by the predicted
# positive-class probability for each target.
submission = test[['respondent_id']].assign(
    xyz_vaccine=xyz_vaccine_pred,
    seasonal_vaccine=seasonal_vaccine_pred,
)

# Write the table without the DataFrame index so only the three columns appear.
submission.to_csv('mysubmission.csv', index=False)