### Explore

In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.multioutput import MultiOutputClassifier

# Load the datasets
train_features = pd.read_csv('C:\\Users\\USER\\OneDrive\\Desktop\\training_set_features.csv')
train_labels = pd.read_csv('C:\\Users\\USER\\OneDrive\\Desktop\\training_set_labels.csv')
test_features = pd.read_csv('C:\\Users\\USER\\OneDrive\\Desktop\\test_set_features.csv')
submission_format = pd.read_csv('C:\\Users\\USER\\OneDrive\\Desktop\\submission_format.csv')

# Display basic information
print(train_features.info())
print(train_labels.info())
print(test_features.info())

# Merge the training features and labels for convenience
train_data = pd.merge(train_features, train_labels, on='respondent_id')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  int64  
 1   xyz_concern                  26615 non-null  float64
 2   xyz_knowledge                26591 non-null  float64
 3   behavioral_antiviral_meds    26636 non-null  float64
 4   behavioral_avoidance         26499 non-null  float64
 5   behavioral_face_mask         26688 non-null  float64
 6   behavioral_wash_hands        26665 non-null  float64
 7   behavioral_large_gatherings  26620 non-null  float64
 8   behavioral_outside_home      26625 non-null  float64
 9   behavioral_touch_face        26579 non-null  float64
 10  doctor_recc_xyz              24547 non-null  float64
 11  doctor_recc_seasonal         24547 non-null  float64
 12  chronic_med_condition        25736 non-null  float64
 13  child_under_6_mo

In [31]:
print(train_data.isnull().sum())

respondent_id                      0
xyz_concern                       92
xyz_knowledge                    116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_xyz                 2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_xyz_vacc_effective       391
opinion_xyz_risk                 388
opinion_xyz_sick_from_vacc       395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4423
m

In [7]:
print(train_data[train_data.duplicated()])

Empty DataFrame
Columns: [respondent_id, xyz_concern, xyz_knowledge, behavioral_antiviral_meds, behavioral_avoidance, behavioral_face_mask, behavioral_wash_hands, behavioral_large_gatherings, behavioral_outside_home, behavioral_touch_face, doctor_recc_xyz, doctor_recc_seasonal, chronic_med_condition, child_under_6_months, health_worker, health_insurance, opinion_xyz_vacc_effective, opinion_xyz_risk, opinion_xyz_sick_from_vacc, opinion_seas_vacc_effective, opinion_seas_risk, opinion_seas_sick_from_vacc, age_group, education, race, sex, income_poverty, marital_status, rent_or_own, employment_status, hhs_geo_region, census_msa, household_adults, household_children, employment_industry, employment_occupation, xyz_vaccine, seasonal_vaccine]
Index: []

[0 rows x 38 columns]


### Clean

In [32]:
# Separate features and target variables
X = train_data.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
y = train_data[['xyz_vaccine', 'seasonal_vaccine']]

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# Preprocess the training data
X_processed = preprocessor.fit_transform(X)



### Train

In [33]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Define the models
log_reg = MultiOutputClassifier(LogisticRegression(max_iter=1000))
rf_clf = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42))
gb_clf = MultiOutputClassifier(GradientBoostingClassifier(n_estimators=100, random_state=42))

# Train the models
log_reg.fit(X_train, y_train)
rf_clf.fit(X_train, y_train)
gb_clf.fit(X_train, y_train)


### Evaluation

In [34]:
# Predict probabilities
log_reg_pred = log_reg.predict_proba(X_val)
rf_clf_pred = rf_clf.predict_proba(X_val)
gb_clf_pred = gb_clf.predict_proba(X_val)

# Calculate ROC AUC score for each target
log_reg_auc_xyz = roc_auc_score(y_val['xyz_vaccine'], log_reg_pred[0][:, 1])
log_reg_auc_seasonal = roc_auc_score(y_val['seasonal_vaccine'], log_reg_pred[1][:, 1])
rf_clf_auc_xyz = roc_auc_score(y_val['xyz_vaccine'], rf_clf_pred[0][:, 1])
rf_clf_auc_seasonal = roc_auc_score(y_val['seasonal_vaccine'], rf_clf_pred[1][:, 1])
gb_clf_auc_xyz = roc_auc_score(y_val['xyz_vaccine'], gb_clf_pred[0][:, 1])
gb_clf_auc_seasonal = roc_auc_score(y_val['seasonal_vaccine'], gb_clf_pred[1][:, 1])

# Print AUC scores
print(f"Logistic Regression - XYZ AUC: {log_reg_auc_xyz}, Seasonal AUC: {log_reg_auc_seasonal}")
print(f"Random Forest - XYZ AUC: {rf_clf_auc_xyz}, Seasonal AUC: {rf_clf_auc_seasonal}")
print(f"Gradient Boosting - XYZ AUC: {gb_clf_auc_xyz}, Seasonal AUC: {gb_clf_auc_seasonal}")


Logistic Regression - XYZ AUC: 0.8313867248233029, Seasonal AUC: 0.8560581587986521
Random Forest - XYZ AUC: 0.8294726823487887, Seasonal AUC: 0.8518703002226553
Gradient Boosting - XYZ AUC: 0.838986692047164, Seasonal AUC: 0.8608546254424846


### Submission

In [36]:
# Choose the best model based on validation performance (e.g., Gradient Boosting)
best_model = gb_clf

# Preprocess the test data
X_test_processed = preprocessor.transform(test_features.drop(columns=['respondent_id']))

# Predict probabilities on the test data
test_predictions = best_model.predict_proba(X_test_processed)

# Prepare the submission file
submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': test_predictions[0][:, 1],
    'seasonal_vaccine': test_predictions[1][:, 1]
})

print(submission)


       respondent_id  xyz_vaccine  seasonal_vaccine
0              26707     0.078160          0.237642
1              26708     0.038951          0.044384
2              26709     0.448829          0.769918
3              26710     0.578219          0.839197
4              26711     0.236440          0.536293
...              ...          ...               ...
26703          53410     0.361203          0.559046
26704          53411     0.110133          0.304597
26705          53412     0.116404          0.185369
26706          53413     0.055132          0.367869
26707          53414     0.466873          0.642551

[26708 rows x 3 columns]
