In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

## LOADING THE GIVEN DATASETS

In [2]:
train_features = pd.read_csv('training_set_features.csv')
train_labels = pd.read_csv('training_set_labels.csv')
test_features = pd.read_csv('test_set_features.csv')
submission_format = pd.read_csv('submission_format.csv')

## MERGING FEATURES AND LABELS

In [3]:
train_data = train_features.merge(train_labels, on='respondent_id')

In [4]:
import csv

def get_column_names(csv_file):
    with open(csv_file, 'r') as f:
        reader = csv.reader(f)
        return next(reader)

csv_file = 'test_set_features.csv'
column_names = get_column_names(csv_file)
print(column_names)

['respondent_id', 'xyz_concern', 'xyz_knowledge', 'behavioral_antiviral_meds', 'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home', 'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal', 'chronic_med_condition', 'child_under_6_months', 'health_worker', 'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk', 'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group', 'education', 'race', 'sex', 'income_poverty', 'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa', 'household_adults', 'household_children', 'employment_industry', 'employment_occupation']


In [5]:
import csv

def get_num_columns(csv_file):
    with open(csv_file, 'r') as f:
        reader = csv.reader(f)
        return len(next(reader))

csv_file = 'test_set_features.csv'
num_columns = get_num_columns(csv_file)
print(f'Number of columns: {num_columns}')

Number of columns: 36


## SEPERATE THE FEATURE COLUMNS

In [6]:
categorical_features = [
    'age_group', 'education', 'race', 'sex', 'income_poverty', 
    'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region', 
    'census_msa', 'employment_industry', 'employment_occupation'
]
numerical_features = [
    'xyz_concern', 'xyz_knowledge', 'behavioral_antiviral_meds', 
    'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands', 
    'behavioral_large_gatherings', 'behavioral_outside_home', 
    'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal', 
    'chronic_med_condition', 'child_under_6_months', 'health_worker', 
    'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk', 
    'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective', 
    'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'household_adults', 
    'household_children'
]

## Preprocessing pipelines for both numerical and categorical data

In [7]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

## Combine both transformers into a single column transformer

In [8]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

## Preprocessing training data

In [9]:
X = train_data.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
y = train_data[['xyz_vaccine', 'seasonal_vaccine']]
X_test = test_features.drop(columns=['respondent_id'])

X_preprocessed = preprocessor.fit_transform(X)
X_test_preprocessed = preprocessor.transform(X_test)

## Combine both transformers into a single column transformer

In [10]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

## Preprocessing training data

In [11]:
X = train_data.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
y = train_data[['xyz_vaccine', 'seasonal_vaccine']]
X_test = test_features.drop(columns=['respondent_id'])

X_preprocessed = preprocessor.fit_transform(X)
X_test_preprocessed = preprocessor.transform(X_test)


## Splitting training data for verification

In [12]:
X_train, X_val, y_train, y_val = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

# Model building and Evaluation

In [13]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score

## Different Models

In [14]:
models = {
    'SVM': SVC(probability=True, random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Naive Bayes': GaussianNB()
}

## Dictionary to socre ROC AUC scores

In [15]:
roc_auc_scores = {}

## Training and evaluating each model 

In [16]:
for model_name, model in models.items():
    print(f'Training {model_name}...')
    # Wrap the model in a MultiOutputClassifier to handle multilabel targets
    clf = MultiOutputClassifier(model)
    clf.fit(X_train, y_train)
    
    # Predict probabilities
    y_pred_proba = clf.predict_proba(X_val)
    
    # Extract probabilities for each label
    y_pred_proba_xyz = y_pred_proba[0][:, 1]
    y_pred_proba_seasonal = y_pred_proba[1][:, 1]
    
    # Calculate ROC AUC scores for each target
    roc_auc_xyz = roc_auc_score(y_val['xyz_vaccine'], y_pred_proba_xyz)
    roc_auc_seasonal = roc_auc_score(y_val['seasonal_vaccine'], y_pred_proba_seasonal)
    
    # Store the mean ROC AUC score
    roc_auc_scores[model_name] = (roc_auc_xyz + roc_auc_seasonal) / 2
    print(f'{model_name} ROC AUC: XYZ Vaccine: {roc_auc_xyz}, Seasonal Vaccine: {roc_auc_seasonal}, Mean: {roc_auc_scores[model_name]}')

Training SVM...
SVM ROC AUC: XYZ Vaccine: 0.8051544680600727, Seasonal Vaccine: 0.8553902070339992, Mean: 0.830272337547036
Training Logistic Regression...
Logistic Regression ROC AUC: XYZ Vaccine: 0.8313867248233029, Seasonal Vaccine: 0.8560581587986521, Mean: 0.8437224418109774
Training Naive Bayes...
Naive Bayes ROC AUC: XYZ Vaccine: 0.7096002571666289, Seasonal Vaccine: 0.7419565299306039, Mean: 0.7257783935486164


# Prediction and Submission

In [18]:
# Choose the best model based on the mean ROC AUC score
best_model_name = max(roc_auc_scores, key=roc_auc_scores.get)
best_model = models[best_model_name]
print(f'Best model: {best_model_name}')

# Train the best model on the full training data
best_clf = MultiOutputClassifier(best_model)
best_clf.fit(X_preprocessed, y)

# Make predictions on the test set
y_test_pred_proba = best_clf.predict_proba(X_test_preprocessed)

# Extract probabilities for each target
y_test_pred_proba_xyz = y_test_pred_proba[0][:, 1]
y_test_pred_proba_seasonal = y_test_pred_proba[1][:, 1]

# Prepare submission DataFrame
submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': y_test_pred_proba_xyz,
    'seasonal_vaccine': y_test_pred_proba_seasonal
})

# Save to CSV
submission.to_csv('submission_230106072_SHREYAS_SAGAR.csv', index=False)
print('submission_230106072_SHREYAS_SAGAR.csv')

Best model: Logistic Regression
submission_230106072_SHREYAS_SAGAR.csv
