In [392]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score

# Load datasets
df_x = pd.read_csv('training_set_features.csv')
df_y = pd.read_csv('training_set_labels.csv')
df_test = pd.read_csv('test_set_features.csv')

# Fill missing values with the mode for test and train datasets
df_x = df_x.fillna(df_x.mode().iloc[0])
df_test = df_test.fillna(df_test.mode().iloc[0])

# Drop unnecessary columns
columns_to_drop = ['employment_industry', 'employment_status', 'hhs_geo_region', 'census_msa', 'employment_occupation']
df_x = df_x.drop(columns=columns_to_drop)
df_test = df_test.drop(columns=columns_to_drop)

# Replace categorical data with numerical codes
categorical_replacements = {
    'education': {'< 12 Years': 0, '12 Years': 1, 'Some College': 2, 'College Graduate': 3},
    'race': {'White': 0, 'Black': 1, 'Hispanic': 2, 'Other or Multiple': 3},
    'sex': {'Male': 0, 'Female': 1},
    'marital_status': {'Married': 0, 'Not Married': 1},
    'rent_or_own': {'Own': 0, 'Rent': 1},
    'age_group': {'65+ Years': 0, '55 - 64 Years': 1, '45 - 54 Years': 2, '18 - 34 Years': 3, '35 - 44 Years': 4},
    'income_poverty': {'<= $75,000, Above Poverty': 0, '> $75,000': 1, 'Below Poverty': 2}
}

df_x.replace(categorical_replacements, inplace=True)
df_test.replace(categorical_replacements, inplace=True)

# Define the features and target
X_train = df_x
y_train = df_y[['xyz_vaccine', 'seasonal_vaccine']]  # Ensure you have the correct target columns

# Define categorical and numerical columns
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocess the data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# Define the model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42)))
])

# Train the model
model.fit(X_train, y_train)

# Predict probabilities
y_pred_proba = model.predict_proba(df_test)

# Convert predictions to probabilities for each target variable
y_pred_proba_xyz = y_pred_proba[0][:, 1]
y_pred_proba_seasonal = y_pred_proba[1][:, 1]

# Print the predictions
print("Predicted probabilities for xyz_vaccine:", y_pred_proba_xyz)
print("Predicted probabilities for seasonal_vaccine:", y_pred_proba_seasonal)

# Create the submission DataFrame
submission = pd.DataFrame({
    'respondent_id': df_test['respondent_id']
    'xyz_vaccine': y_pred_proba_xyz,
    'seasonal_vaccine': y_pred_proba_seasonal
})

# Save the submission DataFrame to a CSV file
submission.to_csv('submission_format.csv', index=False)

print("Submission file created successfully.")


df_x.columns


SyntaxError: invalid syntax. Perhaps you forgot a comma? (3810220280.py, line 75)