In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

In [2]:

# Load datasets
df_features = pd.read_csv('training_set_features.csv')
df_labels = pd.read_csv('training_set_labels.csv')
df_test = pd.read_csv('test_set_features.csv')

# Merge features and labels on respondent_id for training
df_train = df_features.merge(df_labels, on='respondent_id')

# Preprocessing
X = df_train.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
y = df_train[['xyz_vaccine', 'seasonal_vaccine']]
X_test = df_test.drop(columns=['respondent_id'])

# Identifying categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

# Preprocessing pipelines for numerical and categorical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define the model
model = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42))

# Create and evaluate the pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', model)])

# Split data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
clf.fit(X_train, y_train)

# Validate the model
y_pred = clf.predict_proba(X_valid)

# Calculate ROC AUC score for both targets
roc_auc_xyz = roc_auc_score(y_valid['xyz_vaccine'], y_pred[0][:, 1])
roc_auc_seasonal = roc_auc_score(y_valid['seasonal_vaccine'], y_pred[1][:, 1])
roc_auc_mean = np.mean([roc_auc_xyz, roc_auc_seasonal])

print(f"ROC AUC for xyz_vaccine: {roc_auc_xyz}")
print(f"ROC AUC for seasonal_vaccine: {roc_auc_seasonal}")
print(f"Mean ROC AUC: {roc_auc_mean}")

# Make predictions on the test set
y_test_pred = clf.predict_proba(X_test)
test_preds = pd.DataFrame({
    'respondent_id': df_test['respondent_id'],
    'xyz_vaccine': y_test_pred[0][:, 1],
    'seasonal_vaccine': y_test_pred[1][:, 1]
})

# Save the predictions to a CSV file
test_preds.to_csv('submission.csv', index=False)


ROC AUC for xyz_vaccine: 0.864173999277244
ROC AUC for seasonal_vaccine: 0.8570519011081396
Mean ROC AUC: 0.8606129501926918


In [3]:
# Load datasets
df_features = pd.read_csv('training_set_features.csv')
df_labels = pd.read_csv('training_set_labels.csv')
df_test = pd.read_csv('test_set_features.csv')

# Merge features and labels on respondent_id for training
df_train = df_features.merge(df_labels, on='respondent_id')

# Preprocessing
X = df_train.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
y = df_train[['xyz_vaccine', 'seasonal_vaccine']]
X_test = df_test.drop(columns=['respondent_id'])

# Identifying categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

# Preprocessing pipelines for numerical and categorical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define models to evaluate
models = {
    'RandomForest': MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42)),
    'GradientBoosting': MultiOutputClassifier(GradientBoostingClassifier(n_estimators=100, random_state=42)),
    'LogisticRegression': MultiOutputClassifier(LogisticRegression(max_iter=1000, random_state=42))
}

# Evaluate each model using cross-validation
results = {}
for name, model in models.items():
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', model)])
    # Perform cross-validation and calculate mean ROC AUC score for both targets
    scores = cross_val_score(clf, X, y, cv=5, scoring='roc_auc_ovr_weighted')
    results[name] = np.mean(scores)
    print(f"{name} ROC AUC: {results[name]}")

# Select the best model
best_model_name = max(results, key=results.get)
best_model = models[best_model_name]
print(f"\nBest Model: {best_model_name} with ROC AUC: {results[best_model_name]}")

# Train the best model on the entire training set
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', best_model)])
clf.fit(X, y)

# Make predictions on the test set
y_test_pred = clf.predict_proba(X_test)
test_preds = pd.DataFrame({
    'respondent_id': df_test['respondent_id'],
    'xyz_vaccine': y_test_pred[0][:, 1],
    'seasonal_vaccine': y_test_pred[1][:, 1]
})

# Save the predictions to a CSV file
test_preds.to_csv('submission1.csv', index=False)


RandomForest ROC AUC: 0.8545446965539579
