In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

# Load the data
data = pd.read_csv("training_set_labels.csv")

# Separate features and target variables
features = data.drop(["respondent_id", "xyz_vaccine", "seasonal_vaccine"], axis=1)
target_variables = data[["xyz_vaccine", "seasonal_vaccine"]]

# Encode categorical features
label_encoder = LabelEncoder()
for col in features.select_dtypes(include=["object"]):
    features[col] = label_encoder.fit_transform(features[col])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    features, target_variables, test_size=0.2, random_state=42
)

# Build a Multi-Output Random Forest Classifier
model = MultiOutputClassifier(RandomForestClassifier(n_estimators=100))
model.fit(X_train, y_train)

# Predict probabilities for test data
y_pred = model.predict_proba(X_test)

# Prepare submission file with respondent ID and predicted probabilities
submission_data = pd.DataFrame({
    "respondent_id": data["respondent_id"].iloc[X_test.index],
    "xyz_vaccine": y_pred[:, 0],
    "seasonal_vaccine": y_pred[:, 1],
})

# Save predictions to CSV file
submission_data.to_csv("flu_vaccine_predictions.csv", index=False)

print("Model trained and predictions saved!")


ValueError: at least one array or dtype is required