In [107]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Step 1: Load Data
train_features_df = pd.read_csv('training_set_features.csv')
train_labels_df = pd.read_csv('training_set_labels.csv')
test_df = pd.read_csv('test_set_features.csv')

# Step 2: Merge Training Data
train_df = pd.merge(train_features_df, train_labels_df, on='respondent_id')

# Step 3: Preprocessing
# Define categorical and numerical features
categorical_features = train_features_df.select_dtypes(include=['object']).columns.tolist()
numerical_features = train_features_df.select_dtypes(include=['number']).columns.tolist()

# Preprocessing pipeline
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Separate features and target variables
X = train_df.drop(['xyz_vaccine', 'seasonal_vaccine'], axis=1)
y = train_df[['xyz_vaccine', 'seasonal_vaccine']]



# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Define the model
classifier = RandomForestClassifier(random_state=42)

# Step 5: Create a multi-output classifier
multi_target_classifier = MultiOutputClassifier(classifier)

# Step 6: Create a pipeline combining preprocessor and classifier
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', multi_target_classifier)])

# Step 7: Train the model
pipeline.fit(X_train, y_train)

# Step 8: Predict probabilities for validation set
y_pred_proba = pipeline.predict_proba(X_val)

# Step 9: Calculate ROC AUC scores for evaluation
roc_auc_xyz = roc_auc_score(y_val['xyz_vaccine'], y_pred_proba[0][:, 1])
roc_auc_seasonal = roc_auc_score(y_val['seasonal_vaccine'], y_pred_proba[1][:, 1])
overall_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2.0

print(f"ROC AUC for xyz_vaccine: {roc_auc_xyz}")
print(f"ROC AUC for seasonal_vaccine: {roc_auc_seasonal}")
print(f"Overall ROC AUC: {overall_roc_auc}")

# Step 10: Predict probabilities for test set and create submission file
test_predictions = pipeline.predict_proba(test_df)
submission = pd.DataFrame({
    'respondent_id': test_df['respondent_id'],
    'xyz_vaccine': test_predictions[0][:, 1],
    'seasonal_vaccine': test_predictions[1][:, 1]
})
submission.to_csv('submission.csv', index=False)


ROC AUC for xyz_vaccine: 0.8290215902310297
ROC AUC for seasonal_vaccine: 0.852176333056302
Overall ROC AUC: 0.8405989616436659
