In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [2]:
# Load the four competition CSV files.
# Adjust file paths to match the actual location on your system
train_features, train_labels, test_features, submission_format = (
    pd.read_csv(csv_path)
    for csv_path in (
        'training_set_features.csv',
        'training_set_labels.csv',
        'test_set_features.csv',
        'submission_format.csv',
    )
)

In [3]:
# Join the feature rows to the label rows on the shared respondent id
train_data = pd.merge(
    left=train_features,
    right=train_labels,
    on='respondent_id',
)

In [4]:
# Define feature columns and target variables
# The id column is removed by name rather than by position, so the feature list
# stays correct even if 'respondent_id' is not the first column; a missing id
# column raises KeyError instead of silently discarding a real feature.
feature_cols = train_features.columns.drop('respondent_id')
target_vars = ['xyz_vaccine', 'seasonal_vaccine']

In [5]:
# Preprocessing pipeline
# Partition the feature columns by dtype. select_dtypes is used instead of an
# `== 'object'` comparison so that text columns stored with a non-object dtype
# (e.g. pandas string or categorical dtypes) are not routed to the numeric
# branch, where median imputation cannot handle them. Numeric and bool columns
# are treated as numeric; every remaining column is treated as categorical.
numeric_features = train_features[feature_cols].select_dtypes(
    include=['number', 'bool']
).columns
categorical_features = train_features[feature_cols].select_dtypes(
    exclude=['number', 'bool']
).columns

# Numeric branch: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical branch: fill missing values with the literal 'missing', then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Apply each branch to its own group of columns.
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

In [6]:
# Full model: the preprocessing step followed by a random forest with a fixed seed
model = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)

In [7]:
# Hold out 20% of the labelled rows for validation (fixed seed for repeatability)
X, y = train_data[feature_cols], train_data[target_vars]
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Fit the preprocessing steps and the classifier together on the training portion;
# the fitted pipeline is the cell's displayed value.
model.fit(X=X_train, y=y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['xyz_concern', 'xyz_knowledge', 'behavioral_antiviral_meds',
       'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_hom...
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('onehot',
                  

In [10]:
# Score the validation split: y_pred_val holds one probability array per target
y_pred_val = model.predict_proba(X_val)

# ROC AUC for each target, computed from column 1 of that target's probability array
roc_auc_scores = []
for target, target_proba in zip(target_vars, y_pred_val):
    roc_auc = roc_auc_score(y_val[target], target_proba[:, 1])
    roc_auc_scores.append(roc_auc)
    print(f'Validation ROC AUC Score for {target}: {roc_auc}')

# Unweighted mean of the per-target scores
average_roc_auc = sum(roc_auc_scores) / len(roc_auc_scores)
print(f'Average Validation ROC AUC Score: {average_roc_auc}')


Validation ROC AUC Score for xyz_vaccine: 0.8280715024077857
Validation ROC AUC Score for seasonal_vaccine: 0.8527313130509138
Average Validation ROC AUC Score: 0.8404014077293498


In [None]:
# Predict on the test set, keeping only column 1 (the positive class) for each target
test_preds = [
    target_proba[:, 1]
    for target_proba in model.predict_proba(test_features[feature_cols])
]

In [None]:
# Build the submission table: respondent ids plus one probability column per target
submission = test_features[['respondent_id']].assign(
    xyz_vaccine=test_preds[0],
    seasonal_vaccine=test_preds[1],
)

In [None]:
# Write the predictions to their own file. 'submission_format.csv' is the
# template read at the top of this notebook; writing to that path would
# overwrite the provided input file.
submission.to_csv('submission.csv', index=False)