In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Read the three competition CSVs from the working directory, in this order:
# training features, test features, training labels.
train_data_x, test_data_x, train_data_y = (
    pd.read_csv(csv_name)
    for csv_name in (
        'training_set_features.csv',
        'test_set_features.csv',
        'training_set_labels.csv',
    )
)

In [32]:
# Impute missing values in every object-typed (categorical) column.
#
# The fill value is the most frequent category observed in the TRAINING data
# and is applied to both frames, so the test set is preprocessed with
# statistics learned from train only (same convention as the scaler and the
# label encoder used later in this notebook).
#
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form operates on
# a temporary Series and does not update the frame under pandas Copy-on-Write.
for col in train_data_x.select_dtypes(include='object').columns:
    most_frequent_category = train_data_x[col].mode()[0]
    train_data_x[col] = train_data_x[col].fillna(most_frequent_category)
    test_data_x[col] = test_data_x[col].fillna(most_frequent_category)

In [33]:
# Impute missing values in every float/int column.
#
# - Binary indicator columns (observed training values are exactly {0, 1})
#   are filled with the training mode, so they stay binary.
# - All other numeric columns are filled with the training mean.
#
# The fill value is always computed from the TRAINING data and reused for the
# test frame, so no test-set statistics leak into preprocessing.
#
# Values are assigned back to the column rather than using the chained
# `df[col].fillna(..., inplace=True)` form, which acts on a temporary Series
# and does not update the frame under pandas Copy-on-Write.
for col in train_data_x.select_dtypes(include=['float', 'int']).columns:
    observed_train_values = set(train_data_x[col].dropna().unique())
    if observed_train_values == {1, 0}:
        fill_value = train_data_x[col].mode()[0]
    else:
        fill_value = train_data_x[col].mean()
    train_data_x[col] = train_data_x[col].fillna(fill_value)
    test_data_x[col] = test_data_x[col].fillna(fill_value)


In [34]:
# Min-max scale the float columns. The scaler learns each column's range from
# the training frame only; the same fitted ranges are then applied to both
# the training and the test frame.
numeric_columns = train_data_x.select_dtypes(include=['float']).columns
scaler = MinMaxScaler()
scaler.fit(train_data_x[numeric_columns])
train_data_x[numeric_columns] = scaler.transform(train_data_x[numeric_columns])
test_data_x[numeric_columns] = scaler.transform(test_data_x[numeric_columns])


In [35]:
# Replace each object-typed column with integer codes.
# One shared LabelEncoder is re-fitted on the training values of each column
# in turn and then used to encode that same column in both frames.
# Note: LabelEncoder.transform raises ValueError if the test column contains
# a category that was not present in the training column.
label_encoder = LabelEncoder()
object_columns = train_data_x.select_dtypes(include='object').columns
for col in object_columns:
    label_encoder.fit(train_data_x[col])
    train_data_x[col] = label_encoder.transform(train_data_x[col])
    test_data_x[col] = label_encoder.transform(test_data_x[col])



In [55]:
model = MultiOutputClassifier(RandomForestClassifier(random_state=42, n_estimators=100))
model.fit(train_data_x, train_data_y.iloc[:,1:])

In [56]:
y_pred = model.predict_proba(test_data_x)
print(y_pred)

[array([[0.81, 0.19],
       [0.83, 0.17],
       [0.59, 0.41],
       ...,
       [0.82, 0.18],
       [0.8 , 0.2 ],
       [0.61, 0.39]]), array([[0.71, 0.29],
       [0.87, 0.13],
       [0.32, 0.68],
       ...,
       [0.61, 0.39],
       [0.59, 0.41],
       [0.49, 0.51]])]


In [57]:
# Keep the test-set identifiers for the submission file.
respondent_ids = test_data_x.loc[:, 'respondent_id']
# Take column 1 (the second class) of each per-target probability array,
# then transpose so rows are samples and columns are targets.
positive_class_probs = [target_probs[:, 1] for target_probs in y_pred]
y_pred_prob_array = np.array(positive_class_probs).T

In [58]:
# Assemble the submission table: one row per respondent, with the predicted
# probability for each target rounded to one decimal place.
xyz_probabilities = y_pred_prob_array[:, 0].round(1)
seasonal_probabilities = y_pred_prob_array[:, 1].round(1)
submission_columns = {
    'respondent_id': respondent_ids,
    'xyz_vaccine': xyz_probabilities,
    'seasonal_vaccine': seasonal_probabilities,
}
final_data = pd.DataFrame(submission_columns)

print(final_data.head())
# Write the submission to CSV without the DataFrame index column.
final_data.to_csv('final_file.csv', index=False)

   respondent_id  xyz_vaccine  seasonal_vaccine
0          26707          0.2               0.3
1          26708          0.2               0.1
2          26709          0.4               0.7
3          26710          0.6               0.8
4          26711          0.3               0.4
