# In [None]:
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Read the raw feature and label tables from disk.
train_features = pd.read_csv('training_set_features.csv')
train_labels = pd.read_csv('training_set_labels.csv')

# Attach the labels to the features, matching rows on the respondent id.
train_data = train_features.merge(train_labels, on='respondent_id')

# Pull out the two target columns, then keep everything except the id and
# the targets as the feature matrix.
y_h1n1 = train_data['xyz_vaccine']
y_seasonal = train_data['seasonal_vaccine']
X = train_data.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])

# Hold out 20% of the rows for validation; features and both targets are
# split together so their rows stay aligned.
(
    X_train, X_val,
    y_train_h1n1, y_val_h1n1,
    y_train_seasonal, y_val_seasonal,
) = train_test_split(X, y_h1n1, y_seasonal, test_size=0.2, random_state=42)

# Define preprocessing steps.
# Select every numeric dtype rather than only float64: ColumnTransformer
# drops any column that is not assigned to a transformer (remainder='drop'
# is the default), so an integer-typed feature would otherwise be silently
# discarded from the model input.
numeric_features = X.select_dtypes(include='number').columns
# Treat every remaining (non-numeric) column as categorical so that no
# feature is left out, whatever dtype pandas inferred for its text values.
categorical_features = X.select_dtypes(exclude='number').columns

# Numeric columns: fill missing values with the column mean, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform() from raising
# on categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Candidate classifiers to compare on the validation split.
classifiers = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42)
}

# (train labels, validation labels) for each target, keyed by label column.
validation_targets = {
    'xyz_vaccine': (y_train_h1n1, y_val_h1n1),
    'seasonal_vaccine': (y_train_seasonal, y_val_seasonal),
}

# Evaluate every classifier on every target using ROC AUC, so that the model
# choice below is backed by a validation score for both targets.
validation_scores = {target_name: {} for target_name in validation_targets}
for clf_name, clf in classifiers.items():
    for target_name, (y_fit, y_eval) in validation_targets.items():
        # Clone the preprocessor and classifier: Pipeline.fit fits its steps
        # in place, so sharing one instance across pipelines would let each
        # fit overwrite the state left by the previous one.
        pipeline = Pipeline(steps=[
            ('preprocessor', clone(preprocessor)),
            ('classifier', clone(clf))
        ])

        # Fit on the training split only.
        pipeline.fit(X_train, y_fit)

        # Probability of the second class in classifier.classes_ (the
        # positive class for 0/1 labels) on the held-out rows.
        y_prob = pipeline.predict_proba(X_val)[:, 1]

        # Calculate and record the ROC AUC score.
        roc_auc = roc_auc_score(y_eval, y_prob)
        validation_scores[target_name][clf_name] = roc_auc

        print(f"{clf_name} ROC AUC Score ({target_name}): {roc_auc}")

# Choose the best performing classifier for each target from the validation
# scores (ties resolve to the classifier listed first in `classifiers`).
best_name_h1n1 = max(
    validation_scores['xyz_vaccine'], key=validation_scores['xyz_vaccine'].get)
best_name_seasonal = max(
    validation_scores['seasonal_vaccine'], key=validation_scores['seasonal_vaccine'].get)
print(f"Best classifier for xyz_vaccine: {best_name_h1n1}")
print(f"Best classifier for seasonal_vaccine: {best_name_seasonal}")

# Fresh, unfitted copies: both targets may select the same entry of
# `classifiers`, and the two final models must not share fitted state.
best_classifier_h1n1 = clone(classifiers[best_name_h1n1])
best_classifier_seasonal = clone(classifiers[best_name_seasonal])

# Fit best classifiers on full training data, each with its own preprocessor.
pipeline_h1n1 = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', best_classifier_h1n1)
])

pipeline_seasonal = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', best_classifier_seasonal)
])

pipeline_h1n1.fit(X, y_h1n1)
pipeline_seasonal.fit(X, y_seasonal)

# Make predictions on the test set; the id column is not a model feature, so
# drop it once and reuse the resulting frame for both models.
test_features = pd.read_csv('test_set_features.csv')
X_test = test_features.drop('respondent_id', axis=1)
predictions_h1n1 = pipeline_h1n1.predict_proba(X_test)[:, 1]
predictions_seasonal = pipeline_seasonal.predict_proba(X_test)[:, 1]

# Prepare submission DataFrame.
# NOTE(review): the training labels name the first target 'xyz_vaccine' while
# this output column is 'h1n1_vaccine' -- confirm against the required
# submission format.
submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'h1n1_vaccine': predictions_h1n1,
    'seasonal_vaccine': predictions_seasonal
})

# Save submission without the DataFrame index column.
submission.to_csv('submission.csv', index=False)
