In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score

# Target columns, in the order they are reported and written to the submission.
TARGET_COLUMNS = ['xyz_vaccine', 'seasonal_vaccine']


def positive_class_frame(per_output_proba, label_columns, targets, index):
    """Build a DataFrame of second-class probabilities, one column per target.

    Parameters
    ----------
    per_output_proba : list of ndarray
        Result of ``MultiOutputClassifier.predict_proba``: one
        ``(n_samples, n_classes)`` array per output, in the column order of
        the label frame the model was fitted on.
    label_columns : iterable of str
        Column names of the label frame passed to ``fit``.
    targets : iterable of str
        Target names to extract; also the column order of the result.
    index : pandas.Index
        Row index to attach to the result.

    Returns
    -------
    pandas.DataFrame
        One column per entry of ``targets`` holding column 1 of that target's
        probability array (the probability of ``classes_[1]``).

    Raises
    ------
    ValueError
        If a target name is not present in ``label_columns``.
    """
    label_columns = list(label_columns)
    return pd.DataFrame(
        {
            # Look each target up by name so the mapping does not depend on
            # the column order of the labels file.
            target: per_output_proba[label_columns.index(target)][:, 1]
            for target in targets
        },
        index=index,
    )


# Load the datasets
data_features = pd.read_csv('training_set_features.csv')
data_labels = pd.read_csv('training_set_labels.csv')
test_data_features = pd.read_csv('test_set_features.csv')

# Features and labels are paired by row position below, so make sure both
# files list the same respondents in the same order before dropping the id.
feature_ids = data_features['respondent_id'].to_numpy()
label_ids = data_labels['respondent_id'].to_numpy()
if len(feature_ids) != len(label_ids) or (feature_ids != label_ids).any():
    raise ValueError(
        'respondent_id values in the feature and label files are not aligned row by row'
    )

# Separate features and labels
X = data_features.drop(columns=['respondent_id'])
y = data_labels.drop(columns=['respondent_id'])

# Fail early (before the expensive fit) if an expected target is missing.
missing_targets = [target for target in TARGET_COLUMNS if target not in y.columns]
if missing_targets:
    raise KeyError(f'Label file is missing target column(s): {missing_targets}')

# Split data into training and validation sets (80/20, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify numerical and categorical columns.
# NOTE: columns of any other dtype end up in neither group and are dropped,
# because ColumnTransformer defaults to remainder='drop'.
num_cols = X.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X.select_dtypes(include=['object']).columns

# Define preprocessing steps
# Numeric: fill gaps with the median, then standardize.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical: fill gaps with the mode, then one-hot encode; categories not
# seen during fit are encoded as all zeros instead of raising.
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols)
    ])

# Define the model: one independent random forest per label column
base_model = RandomForestClassifier(n_estimators=100, random_state=42)
model = MultiOutputClassifier(base_model)

# Create and evaluate the pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', model)])

# Fit the model
pipeline.fit(X_train, y_train)

# Make predictions on the validation set
y_pred_proba = pipeline.predict_proba(X_val)

# Convert predictions to proper format
y_pred_proba_df = positive_class_frame(
    y_pred_proba, y_train.columns, TARGET_COLUMNS, X_val.index
)

# Calculate ROC AUC scores
roc_auc_xyz = roc_auc_score(y_val['xyz_vaccine'], y_pred_proba_df['xyz_vaccine'])
roc_auc_seasonal = roc_auc_score(y_val['seasonal_vaccine'], y_pred_proba_df['seasonal_vaccine'])
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

print(f'ROC AUC for xyz_vaccine: {roc_auc_xyz}')
print(f'ROC AUC for seasonal_vaccine: {roc_auc_seasonal}')
print(f'Mean ROC AUC: {mean_roc_auc}')

# Make predictions on the test set
test_features_processed = test_data_features.drop(columns=['respondent_id'])
test_pred_proba = pipeline.predict_proba(test_features_processed)

# Convert predictions to proper format: respondent_id first, then one
# probability column per target.
test_pred_proba_df = positive_class_frame(
    test_pred_proba, y_train.columns, TARGET_COLUMNS, test_features_processed.index
)
test_pred_proba_df.insert(0, 'respondent_id', test_data_features['respondent_id'])

# Save the predictions to a CSV file
output_file_path = 'predictions.csv'
test_pred_proba_df.to_csv(output_file_path, index=False)


ROC AUC for xyz_vaccine: 0.8294325525888947
ROC AUC for seasonal_vaccine: 0.8518072872366175
Mean ROC AUC: 0.8406199199127561
