In [26]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

In [27]:
# Read the three competition CSVs from the working directory: training
# features, test features, and training labels (read in that order).
train_data_x, test_data_x, train_data_y = (
    pd.read_csv(csv_name)
    for csv_name in (
        'training_set_features.csv',
        'test_set_features.csv',
        'training_set_labels.csv',
    )
)

In [28]:
# Keep the test-set respondent ids aside before the column is removed from
# the feature frame; they are needed again when building the submission.
respondent_ids = test_data_x.loc[:, 'respondent_id']

In [29]:
# respondent_id is a row identifier, not a predictor: strip it from both
# feature frames (in place) so the model never sees it.
for frame in (train_data_x, test_data_x):
    frame.drop(columns=['respondent_id'], inplace=True)

In [30]:
# Impute missing values in object (categorical) columns with the most
# frequent value observed in the TRAINING set. The same training mode is used
# for the test set so both frames are imputed consistently.
for col in train_data_x.select_dtypes(include='object').columns:
    max_occurrence_train = train_data_x[col].mode()[0]
    # Assign the filled column back instead of calling
    # `df[col].fillna(..., inplace=True)`: that form mutates a temporary
    # Series (chained assignment) and, with pandas Copy-on-Write, leaves the
    # DataFrame untouched, so the NaNs would silently survive.
    train_data_x[col] = train_data_x[col].fillna(max_occurrence_train)
    test_data_x[col] = test_data_x[col].fillna(max_occurrence_train)

In [31]:
# Impute missing values in numeric columns using statistics learned from the
# TRAINING set only:
#   * columns whose observed values are exactly {0, 1} get the training mode,
#     which keeps them binary;
#   * every other numeric column gets the training mean.
# The same fill value is applied to the test set.
for col in train_data_x.select_dtypes(include=['float', 'int']).columns:
    unique_values_train = train_data_x[col].dropna().unique()
    if set(unique_values_train) == {1, 0}:
        fill_value_train = train_data_x[col].mode()[0]
    else:
        fill_value_train = train_data_x[col].mean()
    # Assign the filled column back instead of calling
    # `df[col].fillna(..., inplace=True)`: that form mutates a temporary
    # Series (chained assignment) and, with pandas Copy-on-Write, leaves the
    # DataFrame untouched, so the NaNs would silently survive.
    train_data_x[col] = train_data_x[col].fillna(fill_value_train)
    test_data_x[col] = test_data_x[col].fillna(fill_value_train)

In [32]:
# Rescale the numeric features with a min-max scaler. The scaler learns its
# per-column range from the training frame only and is then applied to both
# frames, so the test set is transformed with the training range.
numeric_columns = train_data_x.select_dtypes(include=['float', 'int']).columns
scaler = MinMaxScaler()
scaler.fit(train_data_x[numeric_columns])
train_data_x[numeric_columns] = scaler.transform(train_data_x[numeric_columns])
test_data_x[numeric_columns] = scaler.transform(test_data_x[numeric_columns])

In [33]:
# Label-encode every object-type column into integer codes.
# The encoder is re-fitted per column, so after the loop `label_encoder`
# holds the mapping of the last encoded column only.
label_encoder = LabelEncoder()
for col in train_data_x.select_dtypes(include='object').columns:
    # Fit on the union of train and test values so a category that occurs only
    # in the test set does not make `transform` raise ValueError. When the
    # test set contains no unseen categories the learned classes (and thus
    # the codes) are the same as when fitting on the training column alone.
    combined_values = pd.concat(
        [train_data_x[col], test_data_x[col]], ignore_index=True
    )
    label_encoder.fit(combined_values)
    train_data_x[col] = label_encoder.transform(train_data_x[col])
    test_data_x[col] = label_encoder.transform(test_data_x[col])

In [34]:
# Build and train the classifier: one random forest (100 trees, fixed seed)
# per target, wrapped in MultiOutputClassifier so both vaccine labels are
# learned from the same feature frame.
target_columns = ['xyz_vaccine', 'seasonal_vaccine']
base_forest = RandomForestClassifier(n_estimators=100, random_state=42)
model = MultiOutputClassifier(base_forest)
model.fit(train_data_x, train_data_y[target_columns])

In [35]:
# Predict class probabilities on the test set.
# For this multi-output model the result is a list with one array per target
# (same order as the label columns used in fit); each array has one row per
# test sample and one column per class, as the printed output below shows.
y_pred_prob = model.predict_proba(test_data_x)
print(y_pred_prob)

[array([[0.88, 0.12],
       [0.92, 0.08],
       [0.52, 0.48],
       ...,
       [0.86, 0.14],
       [0.95, 0.05],
       [0.56, 0.44]]), array([[0.66, 0.34],
       [0.92, 0.08],
       [0.2 , 0.8 ],
       ...,
       [0.68, 0.32],
       [0.64, 0.36],
       [0.42, 0.58]])]


In [36]:
# y_pred_prob is a list of per-target probability arrays. Take column 1 of
# each (presumably the positive class, label 1 -- verify against
# model.classes_) and place them side by side, giving one row per test
# sample and one column per target.
y_pred_prob_array = np.column_stack([prob[:, 1] for prob in y_pred_prob])

In [37]:
# Assemble the submission table: respondent ids plus one probability column
# per vaccine, each rounded to one decimal place.
# NOTE(review): rounding to one decimal collapses the scores into at most 11
# distinct values; if the submission is scored with a ranking metric this
# introduces ties -- confirm the rounding is actually wanted.
vaccine_columns = {
    target: np.round(y_pred_prob_array[:, position], 1)
    for position, target in enumerate(['xyz_vaccine', 'seasonal_vaccine'])
}
submission_df = pd.DataFrame({'respondent_id': respondent_ids, **vaccine_columns})

# Preview the first rows, then write the file without the index column.
print(submission_df.head())
submission_df.to_csv('submission.csv', index=False)

   respondent_id  xyz_vaccine  seasonal_vaccine
0          26707          0.1               0.3
1          26708          0.1               0.1
2          26709          0.5               0.8
3          26710          0.5               0.9
4          26711          0.2               0.4
