In [6]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer
# IterativeImputer is experimental: this enabling import must run before it is imported.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load the datasets. Both targets come from the same labels file, so it is
# read once and split into one frame per target below.
X = pd.read_csv("training_set_features.csv")
labels = pd.read_csv("training_set_labels.csv")
final = pd.read_csv("test_set_features.csv")

# Report per-column missing-value counts before any column is dropped.
# The labels report is printed twice (once per target frame) to keep the
# original output format.
labels_missing = labels.isna().sum()
print("Missing values in training_set_features.csv:\n", X.isna().sum())
print("\nMissing values in training_set_labels.csv (y):\n", labels_missing)
print("\nMissing values in training_set_labels.csv (z):\n", labels_missing)
print("\nMissing values in test_set_features.csv:\n", final.isna().sum())

# Drop the identifier and `health_insurance` from the training features.
# NOTE(review): `health_insurance` is presumably dropped because of its very
# high missing count in the report above -- confirm.
X = X.drop(['respondent_id', 'health_insurance'], axis=1)

# y: labels without the id and `seasonal_vaccine` (read as y['xyz_vaccine']).
# z: labels without the id and `xyz_vaccine` (read as z['seasonal_vaccine']).
y = labels.drop(['respondent_id', 'seasonal_vaccine'], axis=1)
z = labels.drop(['respondent_id', 'xyz_vaccine'], axis=1)

# Keep the test ids aside for the output file; apply the same column drop as
# for the training features so both frames expose the same feature columns.
respondent_id = final.pop('respondent_id')
final = final.drop(['health_insurance'], axis=1)

# Select every numeric dtype, not only float64: ColumnTransformer drops
# unlisted columns by default, so an integer-typed feature would otherwise be
# silently discarded.
numerical_features = X.select_dtypes(include=['number']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Numeric columns: model-based imputation followed by standardisation.
numerical_transformer = Pipeline(steps=[
    ('imputer', IterativeImputer(max_iter=20, tol=1e-2, random_state=42)),
    ('scaler', StandardScaler())
])
# Categorical columns: fill with the most frequent value, then one-hot encode.
# handle_unknown='ignore' lets transform() accept categories not seen in fit().
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Shared preprocessor applied to the feature frames.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# 5-fold stratified split with a fixed seed so the folds are reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def evaluate_model(X_data, y_data):
    """Cross-validate a LightGBM classifier and return it refit on all rows.

    Uses the module-level ``skf`` splitter and ``preprocessor``. For each
    fold the preprocessor is fit on the training part only and then applied
    to the held-out part, so the fold scores are not influenced by held-out
    rows. After cross-validation the preprocessor and the model are refit on
    the complete ``X_data`` / ``y_data``.

    Previously the returned model (and the shared preprocessor) were left in
    whatever state the last fold produced: trained on only part of the data,
    and with a preprocessor state that depended on the target's fold layout.

    Args:
        X_data: Feature DataFrame (indexed positionally via ``.iloc``).
        y_data: Target Series aligned with ``X_data``; also used for
            stratification.

    Returns:
        A ``(model, mean_roc_auc)`` tuple: the classifier fit on all of
        ``X_data`` as transformed by the refit ``preprocessor``, and the mean
        of the per-fold ROC AUC scores.

    Side effects:
        Refits the module-level ``preprocessor``; on return it is fit on the
        whole of ``X_data``.
    """
    model_lgbm = LGBMClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    roc_auc_scores = []

    for train_index, test_index in skf.split(X_data, y_data):
        X_train_skf, X_test_skf = X_data.iloc[train_index], X_data.iloc[test_index]
        y_train_skf, y_test_skf = y_data.iloc[train_index], y_data.iloc[test_index]

        # Fit the preprocessing on the training fold only, then reuse it
        # unchanged on the held-out fold.
        X_train_skf = preprocessor.fit_transform(X_train_skf)
        X_test_skf = preprocessor.transform(X_test_skf)

        model_lgbm.fit(X_train_skf, y_train_skf)
        # Column 1 of predict_proba is the probability of the positive class.
        y_pred_proba = model_lgbm.predict_proba(X_test_skf)[:, 1]
        roc_auc_scores.append(roc_auc_score(y_test_skf, y_pred_proba))

    # Final fit on every row, so the returned model uses all available data
    # and matches the state the shared preprocessor is left in.
    model_lgbm.fit(preprocessor.fit_transform(X_data), y_data)

    return model_lgbm, np.mean(roc_auc_scores)

# evaluate_model() refits the shared `preprocessor` on every call. The test
# set is therefore transformed and scored for each target right after that
# target's own evaluate_model() call, while the preprocessor is still in the
# state its model was trained with. Transforming once after both calls would
# feed the xyz model features from a preprocessor refit for the other target.

print("Evaluation for xyz_vaccine:")
model_y, roc_auc_y = evaluate_model(X, y['xyz_vaccine'])
print("Mean ROC AUC Score for xyz_vaccine =", roc_auc_y)
final_processed_xyz = preprocessor.transform(final)
# Column 1 of predict_proba is the probability of the positive class.
xyz_vaccine_probs = model_y.predict_proba(final_processed_xyz)[:, 1]

print("\nEvaluation for seasonal_vaccine:")
model_z, roc_auc_z = evaluate_model(X, z['seasonal_vaccine'])
print("Mean ROC AUC Score for seasonal_vaccine =", roc_auc_z)
final_processed = preprocessor.transform(final)
seasonal_vaccine_probs = model_z.predict_proba(final_processed)[:, 1]

# Overall score: unweighted mean of the two per-target cross-validated AUCs.
final_mean_roc_auc = (roc_auc_y + roc_auc_z) / 2
print("\nOverall ROC AUC Score =", final_mean_roc_auc)

# NOTE(review): rounding to one decimal leaves at most 11 distinct values and
# creates ties; if the output is scored with a rank-based metric such as
# ROC AUC this can lower the score -- consider writing unrounded values.
xyz_vaccine = np.round(xyz_vaccine_probs, 1)
seasonal_vaccine = np.round(seasonal_vaccine_probs, 1)

# One row per test respondent with the predicted probability for each target.
predictions_df = pd.DataFrame({
    'respondent_id': respondent_id,
    'xyz_vaccine': xyz_vaccine,
    'seasonal_vaccine': seasonal_vaccine
})

predictions_df.to_csv("results.csv", index=False)


Missing values in training_set_features.csv:
 respondent_id                      0
xyz_concern                       92
xyz_knowledge                    116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_xyz                 2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_xyz_vacc_effective       391
opinion_xyz_risk                 388
opinion_xyz_sick_from_vacc       395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                          