In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [3]:
# Load the datasets.
# The data folder defaults to the original local directory, but it can be
# overridden with the VACCINE_DATA_DIR environment variable so the notebook
# also runs on machines where the CSV files live somewhere else.
DATA_DIR = Path(os.environ.get('VACCINE_DATA_DIR', 'C:\\Users\\sharm\\Downloads\\dataset and all'))

train_features = pd.read_csv(DATA_DIR / 'training_set_features.csv')
train_labels = pd.read_csv(DATA_DIR / 'training_set_labels.csv')
test_features = pd.read_csv(DATA_DIR / 'test_set_features.csv')
submission_format = pd.read_csv(DATA_DIR / 'submission_format.csv')

In [4]:
# Peek at the first rows of the training features, training labels and test features
print(
    train_features.head(),
    train_labels.head(),
    test_features.head(),
    sep='\n',
)

   respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0              0          1.0            0.0                        0.0   
1              1          3.0            2.0                        0.0   
2              2          1.0            1.0                        0.0   
3              3          1.0            1.0                        0.0   
4              4          2.0            1.0                        0.0   

   behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
0                   0.0                   0.0                    0.0   
1                   1.0                   0.0                    1.0   
2                   1.0                   0.0                    0.0   
3                   1.0                   0.0                    1.0   
4                   1.0                   0.0                    1.0   

   behavioral_large_gatherings  behavioral_outside_home  \
0                          0.0                      1.0  

In [5]:
# Merge training features and labels on the respondent identifier.
# validate='one_to_one' makes pandas raise a MergeError if respondent_id is
# duplicated on either side, instead of silently multiplying rows.
data = train_features.merge(train_labels, on='respondent_id', validate='one_to_one')


In [6]:
# Separate the two vaccine targets from the predictor columns
# (the respondent identifier is dropped from the predictors as well)
y = data.loc[:, ['xyz_vaccine', 'seasonal_vaccine']]
X = data.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])


In [7]:
# Identify numerical and categorical columns.
# Split on "numeric vs. everything else" so that every feature lands in exactly
# one group. Matching only int64/float64 and object would leave columns of any
# other dtype (e.g. int32, category, string) in neither list, and such columns
# would then be left out of the ColumnTransformer built from these lists.
numerical_cols = X.select_dtypes(include='number').columns
categorical_cols = X.select_dtypes(exclude='number').columns

In [8]:
# Numeric branch of the preprocessing: fill missing values with the column
# median, then standardise each column.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])


In [9]:
# Categorical branch of the preprocessing: replace missing values with the
# literal label 'missing', then one-hot encode. Categories not seen during
# fitting are ignored at transform time rather than raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(fill_value='missing', strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [10]:
# Route each column group through its own preprocessing branch
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])


In [11]:
# Random forest wrapped so that it can be fitted against several target
# columns at once; the seed is fixed for repeatable results.
model = MultiOutputClassifier(
    estimator=RandomForestClassifier(random_state=42, n_estimators=100)
)

In [12]:
# Chain preprocessing and the classifier into a single estimator
# (fitting and evaluation happen in later cells)
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', model),
])


In [13]:
# Hold out 20% of the rows for validation; the seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)


In [14]:
# Fit the whole pipeline (preprocessing + classifier) on the training split
clf.fit(X=X_train, y=y_train)



In [15]:
# Predicted class probabilities for the validation rows; the result is indexed
# per target below (scoring happens in a later cell)
y_pred = clf.predict_proba(X=X_val)


In [16]:
# For each of the two targets keep probability column 1
# (presumably the positive class, assuming 0/1 labels - TODO confirm)
y_pred_xyz, y_pred_seasonal = (proba[:, 1] for proba in y_pred[:2])


In [17]:
# Score each target separately with ROC AUC on the validation split
roc_auc_xyz = roc_auc_score(
    y_true=y_val['xyz_vaccine'],
    y_score=y_pred_xyz,
)
roc_auc_seasonal = roc_auc_score(
    y_true=y_val['seasonal_vaccine'],
    y_score=y_pred_seasonal,
)

In [18]:
# Report the per-target scores and their unweighted mean, one per line
print(
    f'ROC AUC for xyz vaccine: {roc_auc_xyz}',
    f'ROC AUC for seasonal vaccine: {roc_auc_seasonal}',
    f'Mean ROC AUC: {(roc_auc_xyz + roc_auc_seasonal) / 2}',
    sep='\n',
)

ROC AUC for xyz vaccine: 0.8284626099891587
ROC AUC for seasonal vaccine: 0.8535626187491365
Mean ROC AUC: 0.8410126143691476


In [19]:
# Class probabilities for the test set; the identifier column is not a feature
test_preds = clf.predict_proba(test_features.drop(columns=['respondent_id']))

In [20]:
# For each of the two targets keep probability column 1
# (presumably the positive class, assuming 0/1 labels - TODO confirm)
test_preds_xyz, test_preds_seasonal = (proba[:, 1] for proba in test_preds[:2])

In [21]:
# Build the submission table: respondent ids followed by one probability
# column per vaccine target
submission = test_features[['respondent_id']].assign(
    xyz_vaccine=test_preds_xyz,
    seasonal_vaccine=test_preds_seasonal,
)

In [22]:
# Write the submission file without the DataFrame index.
# The output folder defaults to the original local directory and can be
# overridden with the VACCINE_DATA_DIR environment variable, so the notebook
# is not tied to one user's machine.
output_dir = Path(os.environ.get('VACCINE_DATA_DIR', 'C:\\Users\\sharm\\Downloads\\dataset and all'))
submission.to_csv(output_dir / 'submission.csv', index=False)