In [1]:
import pandas as pd


training_set_features = pd.read_csv('training_set_features.csv')
test_set_features = pd.read_csv('test_set_features.csv')
training_set_labels = pd.read_csv('training_set_labels.csv')
submission_format = pd.read_csv('submission_format.csv')


print(training_set_features.head())



   respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0              0          1.0            0.0                        0.0   
1              1          3.0            2.0                        0.0   
2              2          1.0            1.0                        0.0   
3              3          1.0            1.0                        0.0   
4              4          2.0            1.0                        0.0   

   behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
0                   0.0                   0.0                    0.0   
1                   1.0                   0.0                    1.0   
2                   1.0                   0.0                    0.0   
3                   1.0                   0.0                    1.0   
4                   1.0                   0.0                    1.0   

   behavioral_large_gatherings  behavioral_outside_home  \
0                          0.0                      1.0  

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

print(training_set_features.isnull().sum())
print(test_set_features.isnull().sum())


train_features = training_set_features.drop(columns=['respondent_id'])
test_features = test_set_features.drop(columns=['respondent_id'])


X_train = train_features
y_train = training_set_labels[['xyz_vaccine', 'seasonal_vaccine']]


numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()


numerical_pipeline = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),('scaler', StandardScaler())])

categorical_pipeline = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),('onehot', OneHotEncoder(handle_unknown='ignore'))])


preprocessor = ColumnTransformer(transformers=[('num', numerical_pipeline, numerical_features),('cat', categorical_pipeline, categorical_features)])


X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(test_features)


respondent_id                      0
xyz_concern                       92
xyz_knowledge                    116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_xyz                 2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_xyz_vacc_effective       391
opinion_xyz_risk                 388
opinion_xyz_sick_from_vacc       395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4423
m

In [14]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

base_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
multi_target_classifier = MultiOutputClassifier(base_classifier, n_jobs=-1)
multi_target_classifier.fit(X_train_processed, y_train)

In [15]:
from sklearn.metrics import roc_auc_score
y_pred_prob_train = multi_target_classifier.predict_proba(X_train_processed)
xyz_vaccine_pred_prob_train = y_pred_prob_train[0][:, 1]
seasonal_vaccine_pred_prob_train = y_pred_prob_train[1][:, 1]
xyz_vaccine_auc = roc_auc_score(y_train['xyz_vaccine'], xyz_vaccine_pred_prob_train)
seasonal_vaccine_auc = roc_auc_score(y_train['seasonal_vaccine'], seasonal_vaccine_pred_prob_train)

print(f'ROC AUC score for xyz_vaccine: {xyz_vaccine_auc}')
print(f'ROC AUC score for seasonal_vaccine: {seasonal_vaccine_auc}')
print(f'Mean ROC AUC score: {(xyz_vaccine_auc + seasonal_vaccine_auc) / 2}')


ROC AUC score for xyz_vaccine: 1.0
ROC AUC score for seasonal_vaccine: 1.0
Mean ROC AUC score: 1.0


In [None]:
y_pred_prob_test = multi_target_classifier.predict_proba(X_test_processed)
xyz_vaccine_pred_prob_test = y_pred_prob_test[0][:, 1]
seasonal_vaccine_pred_prob_test = y_pred_prob_test[1][:, 1]


submission = pd.DataFrame({'respondent_id': test_set_features['respondent_id'],'xyz_vaccine': xyz_vaccine_pred_prob_test,'seasonal_vaccine': seasonal_vaccine_pred_prob_test})
submission.to_csv('submission.csv', index=False)


In [None]:
submit=pd.read_csv("submission.csv")
submit.head()