In [3]:
import pandas as pd

# Load the datasets
submission_format = pd.read_csv('submission_format.csv')
test_set_features = pd.read_csv('test_set_features.csv')
training_set_features = pd.read_csv('training_set_features.csv')
training_set_labels = pd.read_csv('training_set_labels.csv')

# Display the first few rows of each dataset
print(submission_format.head())
print(test_set_features.head())
print(training_set_features.head())
print(training_set_labels.head())

# Get basic information about the datasets
print(training_set_features.info())
print(training_set_labels.info())
print(test_set_features.info())

# Check for missing values
print(training_set_features.isnull().sum())
print(training_set_labels.isnull().sum())
print(test_set_features.isnull().sum())



   respondent_id  h1n1_vaccine  seasonal_vaccine
0          26707           0.5               0.7
1          26708           0.5               0.7
2          26709           0.5               0.7
3          26710           0.5               0.7
4          26711           0.5               0.7
   respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0          26707          2.0            2.0                        0.0   
1          26708          1.0            1.0                        0.0   
2          26709          2.0            2.0                        0.0   
3          26710          1.0            1.0                        0.0   
4          26711          3.0            1.0                        1.0   

   behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
0                   1.0                   0.0                    1.0   
1                   0.0                   0.0                    0.0   
2                   0.0                

In [5]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Merge training features and labels
train_data = training_set_features.merge(training_set_labels, on='respondent_id')

# Separate features and target variables
X = train_data.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
y = train_data[['xyz_vaccine', 'seasonal_vaccine']]

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Define preprocessors for numerical and categorical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessors
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# Apply preprocessing to training data
X_processed = preprocessor.fit_transform(X)


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Define the model with increased max_iter
model = MultiOutputClassifier(LogisticRegression(max_iter=1000))

# Train the model
model.fit(X_train, y_train)

# Make predictions on validation set
y_val_pred = model.predict_proba(X_val)

# Convert predictions to proper format
y_val_pred_proba = pd.DataFrame({
    'xyz_vaccine': y_val_pred[0][:, 1],
    'seasonal_vaccine': y_val_pred[1][:, 1]
})

# Evaluate the model
roc_auc_xyz = roc_auc_score(y_val['xyz_vaccine'], y_val_pred_proba['xyz_vaccine'])
roc_auc_seasonal = roc_auc_score(y_val['seasonal_vaccine'], y_val_pred_proba['seasonal_vaccine'])
average_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

print(f'ROC AUC for xyz_vaccine: {roc_auc_xyz}')
print(f'ROC AUC for seasonal_vaccine: {roc_auc_seasonal}')
print(f'Average ROC AUC: {average_roc_auc}')



ROC AUC for xyz_vaccine: 0.8313566800292465
ROC AUC for seasonal_vaccine: 0.8560630982264491
Average ROC AUC: 0.8437098891278478


In [15]:
# Process the test data
X_test = test_set_features.drop(columns=['respondent_id'])
X_test_processed = preprocessor.transform(X_test)

# Make predictions on test data
test_pred = model.predict_proba(X_test_processed)

# Convert predictions to the required format
test_pred_proba = pd.DataFrame({
    'respondent_id': test_set_features['respondent_id'],
    'xyz_vaccine': test_pred[0][:, 1],
    'seasonal_vaccine': test_pred[1][:, 1]
})

# Ensure the submission file contains the correct columns
submission = submission_format[['respondent_id']].merge(test_pred_proba, on='respondent_id')

# Save the predictions to a CSV file
submission.to_csv('submission.csv', index=False)
