### Importing Necessary Libraries

In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import OrdinalEncoder
from sklearn.svm import SVC



### Loading Datasets

In [3]:
# Read the three CSV files, using the first column of each file as the row index.
Ftrs_Train=pd.read_csv("training_set_features.csv",index_col=[0])
Labels_Train=pd.read_csv("training_set_labels.csv",index_col=[0])
Ftrs_Test=pd.read_csv("test_set_features.csv",index_col=[0])

# Sanity check: every training feature row must have its label at the same
# index position, otherwise features and targets would be silently mismatched.
assert Ftrs_Train.index.equals(Labels_Train.index), \
    "training features and labels are not row-aligned"

Ftrs_Train.columns

Index(['xyz_concern', 'xyz_knowledge', 'behavioral_antiviral_meds',
       'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
       'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'race', 'sex', 'income_poverty', 'marital_status',
       'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
       'household_adults', 'household_children', 'employment_industry',
       'employment_occupation'],
      dtype='object')

### Analysing Data and Its Types

In [138]:
# Print each column's dtype and non-null count to see where values are missing.
Ftrs_Train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26707 entries, 0 to 26706
Data columns (total 35 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   xyz_concern                  26615 non-null  float64
 1   xyz_knowledge                26591 non-null  float64
 2   behavioral_antiviral_meds    26636 non-null  float64
 3   behavioral_avoidance         26499 non-null  float64
 4   behavioral_face_mask         26688 non-null  float64
 5   behavioral_wash_hands        26665 non-null  float64
 6   behavioral_large_gatherings  26620 non-null  float64
 7   behavioral_outside_home      26625 non-null  float64
 8   behavioral_touch_face        26579 non-null  float64
 9   doctor_recc_xyz              24547 non-null  float64
 10  doctor_recc_seasonal         24547 non-null  float64
 11  chronic_med_condition        25736 non-null  float64
 12  child_under_6_months         25887 non-null  float64
 13  health_worker        

In [139]:
# Count the missing (NaN) entries in every column.
Ftrs_Train.isna().sum()

xyz_concern                       92
xyz_knowledge                    116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_xyz                 2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_xyz_vacc_effective       391
opinion_xyz_risk                 388
opinion_xyz_sick_from_vacc       395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4423
marital_status                  1408
r

### Cleaning Data

In [4]:
#Dropping columns as they have large number of NaN values.
#A non-inplace drop with errors='ignore' keeps this cell re-runnable: running it
#a second time no longer raises KeyError because the columns are already gone.
Ftrs_Train=Ftrs_Train.drop(columns=['employment_occupation','employment_industry'],
                           errors='ignore')


def _fill_with_most_frequent(column):
    """Return ``column`` with its NaNs replaced by its most frequent non-null value.

    A column with no non-null values is returned unchanged, because there is
    no class to impute with.
    """
    counts=column.value_counts()  # sorted by frequency, NaN excluded
    if counts.empty:
        return column
    return column.fillna(counts.index[0])


#Filling NaN values with most frequent class
Ftrs_Train=Ftrs_Train.apply(_fill_with_most_frequent)
Ftrs_Train.head()


Unnamed: 0_level_0,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_xyz,...,race,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,White,Female,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,White,Male,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,White,Male,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,White,Female,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,White,Female,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0


### Categorical Encoding

In [5]:
# Columns holding string categories that must be turned into numbers.
object_column=['age_group',
       'education', 'race', 'sex', 'income_poverty', 'marital_status',
       'rent_or_own', 'employment_status', 'hhs_geo_region','census_msa']

# Fit ONE encoder on all categorical columns (each column is still encoded
# independently) and keep it in `encoder`, so the identical category-to-code
# mapping can later be applied to other data with encoder.transform(...).
encoder=OrdinalEncoder()
encoded=pd.DataFrame(encoder.fit_transform(Ftrs_Train[object_column]),
                     columns=object_column,
                     index=Ftrs_Train.index)

# Replace the original string columns with their numeric codes, keeping column order.
Ftrs_Train=Ftrs_Train.assign(**{column: encoded[column] for column in object_column})

Ftrs_Train.head(20)

Unnamed: 0_level_0,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_xyz,...,race,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,3.0,0.0,2.0,1.0,0.0,1.0,8.0,2.0,0.0,0.0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,3.0,1.0,2.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,1.0,0.0,1.0,0.0,0.0,9.0,0.0,2.0,0.0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,3.0,0.0,2.0,1.0,1.0,1.0,5.0,1.0,0.0,0.0
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,1.0,0.0
5,3.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,3.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,3.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,1.0,0.0,1.0,0.0,0.0,9.0,0.0,0.0,0.0
7,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,3.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,0.0
8,0.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,3.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
9,2.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,3.0,1.0,0.0,1.0,0.0,1.0,9.0,0.0,0.0,0.0


In [32]:
def fit_score(model,X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : estimator exposing ``fit`` and either ``predict_proba`` or
        ``decision_function``.
    X_train, y_train : data passed to ``model.fit``.
    X_test, y_test : held-out data; ``y_test`` is compared against the
        model's scores for ``X_test``.

    Returns
    -------
    The value returned by ``roc_auc_score(y_test, scores)``.

    Raises
    ------
    AttributeError
        If the model offers neither ``predict_proba`` nor ``decision_function``.
    """
    model.fit(X_train, y_train)
    if hasattr(model, "predict_proba"):
        # Column 1 of the probability matrix is used as the ranking score
        # (presumably the positive class for 0/1 labels; verify for other encodings).
        scores = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        # Margin-based estimators: ROC AUC only needs a ranking score.
        scores = model.decision_function(X_test)
    else:
        raise AttributeError(
            f"{type(model).__name__} has neither predict_proba nor decision_function"
        )
    return roc_auc_score(y_test, scores)


### K-Fold Cross-Validation to Choose the Best Model

In [10]:
# Fixed seed: SVC(probability=True) shuffles data internally to calibrate its
# probability estimates, so without a seed the SVM scores change between runs.
SEED=42

kf=StratifiedKFold(n_splits=4)

# Single source of truth for the target, aligned to the feature rows by index
# label (raises KeyError if any feature row has no label).
y_xyz=Labels_Train.loc[Ftrs_Train.index,'xyz_vaccine']

# Factories rather than instances so that every fold trains a fresh model.
candidate_models={
    'logreg': lambda: LogisticRegression(max_iter=1000,solver='lbfgs'),
    'nb': lambda: GaussianNB(),
    'SVM': lambda: SVC(probability=True,C=1.0, kernel='rbf', gamma='scale',
                       random_state=SEED),
}
fold_scores={name: [] for name in candidate_models}

for train_index, test_index in kf.split(Ftrs_Train,y_xyz):

    X_train, X_test=Ftrs_Train.iloc[train_index], Ftrs_Train.iloc[test_index]
    y_train, y_test=y_xyz.iloc[train_index], y_xyz.iloc[test_index]

    for name, make_model in candidate_models.items():
        fold_scores[name].append(fit_score(make_model(),
                                           X_train, X_test, y_train, y_test))
        print('.')  # one dot per fitted model as a progress indicator

# Keep the original per-model result names available to later cells.
score_logreg=fold_scores['logreg']
score_nb=fold_scores['nb']
score_SVM=fold_scores['SVM']

print("Avg ROC AUC Score for Logistic Regression :",np.average(score_logreg))
print("Avg ROC AUC Score for Naive Bayes :",np.average(score_nb))
print("Avg ROC AUC Score for SVM :",np.average(score_SVM))

.
.
.
.
.
.
.
.
.
.
.
.
Avg ROC AUC Score for Logistic Regression : 0.8285117097074851
Avg ROC AUC Score for Naive Bayes : 0.7861034466637946
Avg ROC AUC Score for SVM : 0.8153219538022243


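Logistic Regression achieves the highest average ROC AUC across the four folds (≈0.829), ahead of SVM (≈0.815) and Naive Bayes (≈0.786), so it turns out to be the best model.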