In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score

train_features =pd.read_csv(r"C:\Users\Pauld\Downloads\Traing set features.csv")
train_labels = pd.read_csv(r"C:\Users\Pauld\Downloads\Training set labels.csv")
test_features = pd.read_csv(r"C:\Users\Pauld\Downloads\Test set geatures.csv")



In [2]:
# Ensure correct merge/join if required (e.g., on 'respondent_id')
train_data = train_features.merge(train_labels, on='respondent_id')

# Define features and labels
X = train_data.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
y = train_data[['xyz_vaccine', 'seasonal_vaccine']]
X_test = test_features.drop(columns=['respondent_id'])

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)



In [3]:
# Identify categorical and numerical features
categorical_features = [
    'age_group', 'education', 'race', 'sex', 'income_poverty', 'marital_status',
    'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
    'employment_industry', 'employment_occupation']

numerical_features = [col for col in X_train.columns if col not in categorical_features]



In [4]:
# Fill missing values for categorical and numerical features
for column in categorical_features:
    X_train[column].fillna(X_train[column].mode()[0], inplace=True)
    X_val[column].fillna(X_val[column].mode()[0], inplace=True)
    X_test[column].fillna(X_test[column].mode()[0], inplace=True)
for column in numerical_features:
    X_train[column].fillna(X_train[column].mean(), inplace=True)
    X_val[column].fillna(X_val[column].mean(), inplace=True)
    X_test[column].fillna(X_test[column].mean(), inplace=True)



In [5]:
# One-hot encode categorical features
encoder = OneHotEncoder(sparse_output=False, drop='first')
X_train_encoded = encoder.fit_transform(X_train[categorical_features])
X_val_encoded = encoder.transform(X_val[categorical_features])
X_test_encoded = encoder.transform(X_test[categorical_features])

X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoder.get_feature_names_out(categorical_features))
X_val_encoded_df = pd.DataFrame(X_val_encoded, columns=encoder.get_feature_names_out(categorical_features))
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoder.get_feature_names_out(categorical_features))



In [6]:
# Drop original categorical columns and concatenate encoded columns
X_train = X_train.drop(columns=categorical_features)
X_val = X_val.drop(columns=categorical_features)
X_test = X_test.drop(columns=categorical_features)

X_train = pd.concat([X_train.reset_index(drop=True), X_train_encoded_df.reset_index(drop=True)], axis=1)
X_val = pd.concat([X_val.reset_index(drop=True), X_val_encoded_df.reset_index(drop=True)], axis=1)
X_test = pd.concat([X_test.reset_index(drop=True), X_test_encoded_df.reset_index(drop=True)], axis=1)



In [7]:
# Standardize numerical features
scaler = StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_val[numerical_features] = scaler.transform(X_val[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])



In [8]:
# Train the model
model = MultiOutputClassifier(LogisticRegression(max_iter=1000))
model.fit(X_train, y_train)

# Predict probabilities on the validation set
y_pred_proba_val = model.predict_proba(X_val)
y_pred_proba_val_combined = np.hstack([y_pred_proba_val[i][:, 1].reshape(-1, 1) for i in range(len(y_pred_proba_val))])



In [9]:
# Calculate the ROC AUC score
roc_auc_xyz = roc_auc_score(y_val['xyz_vaccine'], y_pred_proba_val_combined[:, 0])
roc_auc_seasonal = roc_auc_score(y_val['seasonal_vaccine'], y_pred_proba_val_combined[:, 1])
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

print(f"ROC AUC Score for xyz_vaccine: {roc_auc_xyz}")
print(f"ROC AUC Score for seasonal_vaccine: {roc_auc_seasonal}")
print(f"Mean ROC AUC Score: {mean_roc_auc}")



ROC AUC Score for xyz_vaccine: 0.834407172091538
ROC AUC Score for seasonal_vaccine: 0.8564177491422684
Mean ROC AUC Score: 0.8454124606169032


In [12]:
# Predict probabilities on the test set
y_pred_proba_test = model.predict_proba(X_test)
y_pred_proba_test_combined = np.hstack([y_pred_proba_test[i][:, 1].reshape(-1, 1) for i in range(len(y_pred_proba_test))])



In [14]:
# Prepare the submission file
submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': y_pred_proba_test_combined[:, 0],
    'seasonal_vaccine': y_pred_proba_test_combined[:, 1]
})

submission.to_csv('submission.csv', index=False)
print("The Submission File is created")

The Submission File is created
