<a href="https://colab.research.google.com/github/Vedant1710/DataHack_Project-Vaccine_VG1710/blob/main/DataHack_Project_Vaccine_VG1710.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


# Directory that holds the four input CSVs. '/content' is where the files live
# in the Colab session this notebook was run in; change this one constant to
# run the notebook against a different location.
DATA_DIR = Path('/content')

# Labels and features for the training rows are stored in separate files.
df_labels = pd.read_csv(DATA_DIR / 'training_set_labels.csv')
df_features = pd.read_csv(DATA_DIR / 'training_set_features.csv')
# Feature rows that will be scored for the submission file.
df_test_features = pd.read_csv(DATA_DIR / 'test_set_features.csv')
# NOTE(review): presumably the expected layout of the submission file; it is
# loaded here but not referenced by the rest of the visible code.
df_submission_format = pd.read_csv(DATA_DIR / 'submission_format.csv')


# Attach the labels to their feature rows via the shared `respondent_id` key.
# `validate='one_to_one'` makes pandas raise a MergeError if an id is duplicated
# in either frame, instead of silently multiplying training rows.
# `how='inner'` is the pandas default, spelled out here for clarity.
df_train = df_features.merge(
    df_labels,
    on='respondent_id',
    how='inner',
    validate='one_to_one',
)


# Model inputs: every column except the join key and the two target columns.
X_features = df_train.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
# Model targets: one column per vaccine, kept together as a two-column frame.
y_targets = df_train[['xyz_vaccine', 'seasonal_vaccine']]


# Partition the feature columns by dtype: object-typed columns go down the
# categorical branch, every other column down the numerical branch.
numerical_cols = X_features.select_dtypes(exclude='object').columns
categorical_cols = X_features.select_dtypes(include='object').columns


# Numerical branch: fill missing values with the column mean, then standardise.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])


# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode. Categories unseen during fit are ignored at transform time
# rather than raising.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])


# Route each column group through its own branch and concatenate the results.
column_branches = [
    ('num', numerical_pipeline, numerical_cols),
    ('cat', categorical_pipeline, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_branches)


# MultiOutputClassifier fits one independent copy of the base estimator per
# target column. n_jobs=-1 lets each forest build its trees on all available
# cores; random_state stays fixed so runs remain repeatable.
multi_output_model = MultiOutputClassifier(
    RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
)


# End-to-end estimator: preprocessing followed by the multi-target model, so
# the same transformations are applied at fit and at predict time.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', multi_output_model),
])


# Hold out 20% of the labelled rows for validation; the seed fixes the split.
X_train, X_val, y_train, y_val = train_test_split(
    X_features,
    y_targets,
    test_size=0.2,
    random_state=42,
)


pipeline.fit(X_train, y_train)


# predict_proba returns one probability array per target, indexed here in the
# same order as the columns of y_targets. Column 1 is taken as the
# positive-class probability (assumes the labels are 0/1 — TODO confirm).
y_pred_proba = pipeline.predict_proba(X_val)
xyz_positive_proba = y_pred_proba[0][:, 1]
seasonal_positive_proba = y_pred_proba[1][:, 1]


# Score each target separately, then report the unweighted mean of the two.
roc_auc_xyz = roc_auc_score(y_val['xyz_vaccine'], xyz_positive_proba)
roc_auc_seasonal = roc_auc_score(y_val['seasonal_vaccine'], seasonal_positive_proba)
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

print(
    f'ROC AUC for xyz_vaccine: {roc_auc_xyz}',
    f'ROC AUC for seasonal_vaccine: {roc_auc_seasonal}',
    f'Mean ROC AUC: {mean_roc_auc}',
    sep='\n',
)


# Keep a copy of the test frame that still carries `respondent_id`; the model
# input below has that column removed.
df_test_with_ids = df_test_features.copy()
X_test_features = df_test_features.drop(columns=['respondent_id'])

# Fit a fresh clone of the pipeline on ALL labelled rows for the submission.
# `pipeline` itself was fitted on the 80% training split only, so predicting
# with it would waste the held-out 20% of the labels. Cloning (rather than
# refitting `pipeline` in place) leaves the validation model untouched, so the
# validation scores can still be reproduced from it.
final_pipeline = clone(pipeline).fit(X_features, y_targets)

# One probability array per target; column 1 is taken as the positive-class
# probability (assumes the labels are 0/1 — TODO confirm).
test_proba = final_pipeline.predict_proba(X_test_features)
df_submission = pd.DataFrame({
    'respondent_id': df_test_with_ids['respondent_id'],
    'xyz_vaccine': test_proba[0][:, 1],
    'seasonal_vaccine': test_proba[1][:, 1]
})

# index=False keeps the pandas row index out of the CSV.
df_submission.to_csv('Vedant Gupta_submission.csv', index=False)

ROC AUC for xyz_vaccine: 0.864173999277244
ROC AUC for seasonal_vaccine: 0.8570519011081396
Mean ROC AUC: 0.8606129501926918
