In [73]:
import pandas as pd

# Load the three competition CSVs from the working directory:
# training labels, training features, and the unlabeled test features.
train_set_labels, train_set_features, test_set_features = map(
    pd.read_csv,
    (
        "training_set_labels.csv",
        "training_set_features.csv",
        "test_set_features.csv",
    ),
)

In [126]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_validate

In [74]:
# Attach the labels to their feature rows; only respondents present in
# both tables are kept (inner join on respondent_id).
train_data = pd.merge(
    train_set_features,
    train_set_labels,
    how='inner',
    on='respondent_id',
)

In [75]:
# Keep only complete cases: any row with at least one missing value is removed.
# NOTE(review): this can discard a large share of the rows if some columns
# are sparsely populated — confirm the remaining sample size is acceptable.
train_data = train_data.dropna(axis=0, how='any')

In [76]:
# The previous cell already removed every row containing a missing value, so
# repeating dropna() here was a no-op. Verify the invariant the rest of the
# notebook relies on instead of silently re-running the same transform.
assert not train_data.empty, "dropna() removed every training row"
assert not train_data.isna().any().any(), "train_data still contains missing values"

In [77]:
# Columns that are standardised as numeric values.
numerical_features = ['xyz_concern', 'xyz_knowledge']

# A column is treated as binary when its name contains any of these markers.
binary_markers = (
    'behavioral',
    'doctor_recc',
    'chronic_med_condition',
    'child_under_6_months',
    'health_worker',
    'health_insurance',
)
binary_features = [
    col for col in train_data.columns
    if any(marker in col for marker in binary_markers)
]

# Everything that is not an id, a label, numerical or binary is one-hot encoded.
non_categorical = {'respondent_id', 'xyz_vaccine', 'seasonal_vaccine'}
non_categorical.update(numerical_features)
non_categorical.update(binary_features)
categorical_features = [
    col for col in train_data.columns
    if col not in non_categorical
]

In [79]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Build the feature-preprocessing step.
#
# The individual transformers are handed to the ColumnTransformer unfitted:
# ColumnTransformer clones and fits its own copies during fit_transform, so
# fitting each one separately beforehand was redundant work whose result was
# never used. The fitted copies are available afterwards through
# `preprocessor.named_transformers_` ('num', 'cat', 'bin').

# Standardise the numerical features
numerical_transformer = StandardScaler()

# One-hot encode the categorical features; categories not seen while fitting
# are ignored at transform time instead of raising
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Impute missing values in binary features with the most frequent value
binary_imputer = SimpleImputer(strategy='most_frequent')

# Combine the preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
        ('bin', binary_imputer, binary_features)
    ])

# Apply the preprocessing to the training data.
# NOTE(review): the preprocessor is fitted on every training row here, while
# the hold-out split is made in a later cell from the already-transformed
# matrix, so the hold-out rows contribute to the scaling statistics and the
# one-hot vocabulary. Consider splitting before fitting.
label_columns = ['xyz_vaccine', 'seasonal_vaccine']
X_train = preprocessor.fit_transform(train_data.drop(columns=['respondent_id'] + label_columns))
y_train = train_data[label_columns]

In [132]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for evaluation (fixed seed for repeatability).
# Here X_test / y_test are a validation split of the training data, not the
# competition test set.
# NOTE: X_train / y_train are rebound to the reduced split, so re-running this
# cell without re-running the preprocessing cell splits the already-split data.
holdout_split = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = holdout_split

In [133]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

# One random forest (seeded) is fitted per label column via the multi-output wrapper.
base_forest = RandomForestClassifier(random_state=42)
clf = MultiOutputClassifier(estimator=base_forest)
clf.fit(X_train, y_train)

In [134]:
from sklearn.metrics import roc_auc_score

# predict_proba yields one probability array per label; column 1 is taken as
# the positive-class probability for each of the two labels.
y_pred = clf.predict_proba(X_test)
y_pred_xyz, y_pred_seasonal = (label_proba[:, 1] for label_proba in y_pred)

# Score each label separately on the hold-out split, then average the two.
roc_auc_xyz = roc_auc_score(y_test['xyz_vaccine'], y_pred_xyz)
roc_auc_seasonal = roc_auc_score(y_test['seasonal_vaccine'], y_pred_seasonal)
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

print(
    f'ROC AUC for xyz_vaccine: {roc_auc_xyz}',
    f'ROC AUC for seasonal_vaccine: {roc_auc_seasonal}',
    f'Mean ROC AUC: {mean_roc_auc}',
    sep='\n',
)

ROC AUC for xyz_vaccine: 0.8611362351611078
ROC AUC for seasonal_vaccine: 0.8492276894648315
Mean ROC AUC: 0.8551819623129697


In [135]:
# Preprocess the test set.
# Missing values are filled with the per-column mode of the *training*
# features rather than the test set's own modes, so that imputation values
# are learned from training data only and do not depend on which test rows
# happen to be scored together.
feature_columns = test_set_features.columns.drop('respondent_id')
train_fill_values = train_set_features[feature_columns].mode().iloc[0]
test_set_features = test_set_features.fillna(train_fill_values)
X_test_final = preprocessor.transform(test_set_features.drop(columns=['respondent_id']))

# Make predictions on the test set: one probability array per label,
# column 1 taken as the positive-class probability
y_test_pred = clf.predict_proba(X_test_final)
y_test_pred_xyz = y_test_pred[0][:, 1]
y_test_pred_seasonal = y_test_pred[1][:, 1]

# Prepare the submission file (one row per test respondent)
submission = pd.DataFrame({
    'respondent_id': test_set_features['respondent_id'],
    'xyz_vaccine': y_test_pred_xyz,
    'seasonal_vaccine': y_test_pred_seasonal
})

# Save to CSV without the DataFrame index column
submission.to_csv('submission_train_test_randomforest.csv', index=False)