In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# Read the three input CSVs from the working directory, in order:
# training features, training labels, and the unlabeled test features.
features_train, labels_train, features_test = (
    pd.read_csv(csv_name)
    for csv_name in (
        "training_set_features.csv",
        "training_set_labels.csv",
        "test_set_features.csv",
    )
)


In [7]:
# Submission frame: starts with only the test respondent ids; the probability
# columns are added once test-set predictions are available.
prediction_df = pd.DataFrame({'respondent_id': features_test['respondent_id']})

# Attach the labels to the training features by respondent id.
# validate="one_to_one" makes pandas raise MergeError if respondent_id is
# duplicated in either file, instead of silently multiplying training rows.
merged_train_data = features_train.merge(
    labels_train,
    on="respondent_id",
    validate="one_to_one",
)

# Separate features (X) and target variables (y).
# The id column is dropped from X so it is not fed to the model as a feature.
X = merged_train_data.drop(columns=["respondent_id", "xyz_vaccine", "seasonal_vaccine"])
y = merged_train_data[["xyz_vaccine", "seasonal_vaccine"]]


In [8]:
# Split the feature columns by dtype so each group gets its own preprocessing.
cat_features = X.select_dtypes(include=["object", "category"]).columns
# "number" matches every numeric dtype (int32, float32, unsigned ints, ...),
# not only int64/float64. A numeric column missing from this list would be in
# neither group, and the ColumnTransformer built from these lists keeps the
# default remainder="drop", so it would silently never reach the model.
num_features = X.select_dtypes(include="number").columns

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" lets transform() accept
# categories that were not present when the encoder was fitted.
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Numerical columns: median-impute, standardize, then project with PCA.
# A float n_components asks PCA to keep as many components as are needed to
# explain that fraction (here 95%) of the variance.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=0.95)),
])


In [9]:
# Combined preprocessing: route each column group through its own pipeline.
preprocessor = ColumnTransformer([
    ("numerical", num_pipeline, num_features),
    ("categorical", cat_pipeline, cat_features),
])

# XGBoost base classifier with L1/L2 regularization.
# `use_label_encoder` is intentionally not passed: the argument is deprecated
# in recent XGBoost releases and only triggers a warning there, so dropping it
# leaves the fitted model unchanged.
xgb_model = XGBClassifier(
    eval_metric="logloss",
    random_state=42,
    reg_lambda=1.0,  # L2 regularization
    reg_alpha=0.1,   # L1 regularization
)

# Wrap the single-output classifier so one copy is fitted per target column.
multi_target_model = MultiOutputClassifier(xgb_model)


In [5]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# End-to-end estimator: preprocessing followed by the multi-output classifier.
full_pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("model", multi_target_model),
    ]
)

# Fit on the training portion only. As the cell's last expression, this also
# displays the fitted pipeline.
full_pipeline.fit(X_train, y_train)


Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('numerical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler()),
                                                                  ('pca',
                                                                   PCA(n_components=0.95))]),
                                                  Index(['xyz_concern', 'xyz_knowledge', 'behavioral_antiviral_meds',
       'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_lar...
                                                               grow_policy=None,
                                                               importance_type=None,
  

In [10]:
# Validation-set probabilities: predict_proba on the multi-output model yields
# one array per target, in the same order as the columns of y.
y_prob_val = full_pipeline.predict_proba(X_val)

# ROC AUC per target, scored on the probability of the positive class
# (column 1), then averaged into a single figure.
roc_auc_xyz, roc_auc_seasonal = (
    roc_auc_score(y_val[target], target_probs[:, 1])
    for target, target_probs in zip(("xyz_vaccine", "seasonal_vaccine"), y_prob_val)
)
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

print(f"Mean ROC AUC: {mean_roc_auc:.4f}")

# Score the test set; the id column is removed because it was not part of X.
test_probs = full_pipeline.predict_proba(features_test.drop(columns="respondent_id"))

# Fill in the positive-class probability for each target and write the file.
prediction_df["xyz_vaccine"], prediction_df["seasonal_vaccine"] = (
    target_probs[:, 1] for target_probs in test_probs
)
prediction_df.to_csv("prediction.csv", index=False)


Mean ROC AUC: 0.8191
