In [2]:
#Importing required Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score

In [3]:
#Loading the datasets
training_set_labels = pd.read_csv('/content/training_set_labels.csv')
training_set_features = pd.read_csv('/content/training_set_features.csv')
test_set_features = pd.read_csv('/content/test_set_features.csv')

In [4]:
print(training_set_labels.shape)
training_set_labels

(26707, 3)


Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,0,0,0
1,1,0,1
2,2,0,0
3,3,0,1
4,4,0,0
...,...,...,...
26702,26702,0,0
26703,26703,0,0
26704,26704,0,1
26705,26705,0,0


In [5]:
print(training_set_features.shape)
training_set_features

(26707, 36)


Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Not in Labor Force,qufhixun,Non-MSA,0.0,0.0,,
26703,26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Rent,Employed,lzgpxyit,"MSA, Principle City",1.0,0.0,fcxhlnwr,cmhcxjea
26704,26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,...,,Not Married,Own,,lzgpxyit,"MSA, Not Principle City",0.0,0.0,,
26705,26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,...,"<= $75,000, Above Poverty",Married,Rent,Employed,lrircsnp,Non-MSA,1.0,0.0,fcxhlnwr,haliazsg


In [6]:
training_set_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  int64  
 1   xyz_concern                  26615 non-null  float64
 2   xyz_knowledge                26591 non-null  float64
 3   behavioral_antiviral_meds    26636 non-null  float64
 4   behavioral_avoidance         26499 non-null  float64
 5   behavioral_face_mask         26688 non-null  float64
 6   behavioral_wash_hands        26665 non-null  float64
 7   behavioral_large_gatherings  26620 non-null  float64
 8   behavioral_outside_home      26625 non-null  float64
 9   behavioral_touch_face        26579 non-null  float64
 10  doctor_recc_xyz              24547 non-null  float64
 11  doctor_recc_seasonal         24547 non-null  float64
 12  chronic_med_condition        25736 non-null  float64
 13  child_under_6_mo

In [7]:
training_set_labels = training_set_labels.dropna(subset=['xyz_vaccine', 'seasonal_vaccine'])
training_set_features = training_set_features.loc[training_set_labels.index]
training_set_features.shape

(26707, 36)

In [8]:
X_train = training_set_features.drop(columns=['respondent_id'])
y_xyz = training_set_labels['xyz_vaccine']
y_seasonal = training_set_labels['seasonal_vaccine']
X_test = test_set_features.drop(columns=['respondent_id'])


In [9]:
imputer = SimpleImputer(strategy='most_frequent')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)
categorical_features = X_train.select_dtypes(include=['object']).columns
numerical_features = X_train.select_dtypes(include=[np.number]).columns

In [10]:
categorical_features

Index(['age_group', 'education', 'race', 'sex', 'income_poverty',
       'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region',
       'census_msa', 'employment_industry', 'employment_occupation'],
      dtype='object')

In [11]:
numerical_features

Index(['xyz_concern', 'xyz_knowledge', 'behavioral_antiviral_meds',
       'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
       'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'household_adults',
       'household_children'],
      dtype='object')

In [12]:
encoder = OneHotEncoder(sparse_output=False,handle_unknown='ignore')
X_train_encoded = encoder.fit_transform(pd.DataFrame(X_train_imputed, columns=X_train.columns)[categorical_features])
X_test_encoded = encoder.transform(pd.DataFrame(X_test_imputed, columns=X_test.columns)[categorical_features])

In [13]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(pd.DataFrame(X_train_imputed, columns=X_train.columns)[numerical_features])
X_test_scaled = scaler.transform(pd.DataFrame(X_test_imputed, columns=X_test.columns)[numerical_features])

In [14]:
X_train_preprocessed = np.hstack((X_train_scaled, X_train_encoded))
X_test_preprocessed = np.hstack((X_test_scaled, X_test_encoded))

In [15]:
logreg_xyz = LogisticRegression(random_state=42, max_iter=1000)
logreg_seasonal = LogisticRegression(random_state=42, max_iter=1000)
logreg_xyz.fit(X_train_preprocessed, y_xyz)
logreg_seasonal.fit(X_train_preprocessed, y_seasonal)

In [16]:
y_xyz_train_pred_prob= logreg_xyz.predict_proba(X_train_preprocessed)[:, 1]
y_seasonal_train_pred_prob = logreg_seasonal.predict_proba(X_train_preprocessed)[:, 1]

In [17]:
xyz_auc_roc = roc_auc_score(y_xyz, y_xyz_train_pred_prob)
seasonal_auc_roc = roc_auc_score(y_seasonal, y_seasonal_train_pred_prob)

In [18]:
print(f'xyz Vaccine AUC-ROC: {xyz_auc_roc:.4f}')
print(f'Seasonal Vaccine AUC-ROC: {seasonal_auc_roc:.4f}')

xyz Vaccine AUC-ROC: 0.8380
Seasonal Vaccine AUC-ROC: 0.8559


In [19]:
y_xyz_test_pred_prob = logreg_xyz.predict_proba(X_test_preprocessed)[:, 1]
y_seasonal_test_pred_prob = logreg_seasonal.predict_proba(X_test_preprocessed)[:, 1]

In [22]:
submission = pd.DataFrame({
    'respondent_id': test_set_features['respondent_id'],
    'xyz_vaccine': y_xyz_test_pred_prob,
    'seasonal_vaccine':y_seasonal_test_pred_prob
})

In [23]:
submission.head()

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,26707,0.055649,0.293574
1,26708,0.047062,0.045307
2,26709,0.410282,0.59106
3,26710,0.49847,0.880997
4,26711,0.162588,0.463592
