In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

In [2]:
df=pd.read_csv("h1n1_vaccine_prediction.csv")

# Data Exploration

In [3]:
df.shape

(26707, 34)

In [4]:
df.dtypes

unique_id                      int64
h1n1_worry                   float64
h1n1_awareness               float64
antiviral_medication         float64
contact_avoidance            float64
bought_face_mask             float64
wash_hands_frequently        float64
avoid_large_gatherings       float64
reduced_outside_home_cont    float64
avoid_touch_face             float64
dr_recc_h1n1_vacc            float64
dr_recc_seasonal_vacc        float64
chronic_medic_condition      float64
cont_child_undr_6_mnths      float64
is_health_worker             float64
has_health_insur             float64
is_h1n1_vacc_effective       float64
is_h1n1_risky                float64
sick_from_h1n1_vacc          float64
is_seas_vacc_effective       float64
is_seas_risky                float64
sick_from_seas_vacc          float64
age_bracket                   object
qualification                 object
race                          object
sex                           object
income_level                  object
m

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 34 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   unique_id                  26707 non-null  int64  
 1   h1n1_worry                 26615 non-null  float64
 2   h1n1_awareness             26591 non-null  float64
 3   antiviral_medication       26636 non-null  float64
 4   contact_avoidance          26499 non-null  float64
 5   bought_face_mask           26688 non-null  float64
 6   wash_hands_frequently      26665 non-null  float64
 7   avoid_large_gatherings     26620 non-null  float64
 8   reduced_outside_home_cont  26625 non-null  float64
 9   avoid_touch_face           26579 non-null  float64
 10  dr_recc_h1n1_vacc          24547 non-null  float64
 11  dr_recc_seasonal_vacc      24547 non-null  float64
 12  chronic_medic_condition    25736 non-null  float64
 13  cont_child_undr_6_mnths    25887 non-null  flo

# Missing Values

In [6]:
df.isnull().sum()

unique_id                        0
h1n1_worry                      92
h1n1_awareness                 116
antiviral_medication            71
contact_avoidance              208
bought_face_mask                19
wash_hands_frequently           42
avoid_large_gatherings          87
reduced_outside_home_cont       82
avoid_touch_face               128
dr_recc_h1n1_vacc             2160
dr_recc_seasonal_vacc         2160
chronic_medic_condition        971
cont_child_undr_6_mnths        820
is_health_worker               804
has_health_insur             12274
is_h1n1_vacc_effective         391
is_h1n1_risky                  388
sick_from_h1n1_vacc            395
is_seas_vacc_effective         462
is_seas_risky                  514
sick_from_seas_vacc            537
age_bracket                      0
qualification                 1407
race                             0
sex                              0
income_level                  4423
marital_status                1408
housing_status      

In [7]:
df=df.drop(["has_health_insur","income_level","dr_recc_h1n1_vacc","dr_recc_seasonal_vacc"],axis=1)

In [9]:
df.head()

Unnamed: 0,unique_id,h1n1_worry,h1n1_awareness,antiviral_medication,contact_avoidance,bought_face_mask,wash_hands_frequently,avoid_large_gatherings,reduced_outside_home_cont,avoid_touch_face,...,qualification,race,sex,marital_status,housing_status,employment,census_msa,no_of_adults,no_of_children,h1n1_vaccine
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,< 12 Years,White,Female,Not Married,Own,Not in Labor Force,Non-MSA,0.0,0.0,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,12 Years,White,Male,Not Married,Rent,Employed,"MSA, Not Principle City",0.0,0.0,0
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,College Graduate,White,Male,Not Married,Own,Employed,"MSA, Not Principle City",2.0,0.0,0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,12 Years,White,Female,Not Married,Rent,Not in Labor Force,"MSA, Principle City",0.0,0.0,0
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,Some College,White,Female,Married,Own,Employed,"MSA, Not Principle City",1.0,0.0,0


In [15]:
df=df.dropna()

In [16]:
df.shape

(23426, 30)

In [17]:
df.isnull().sum()

unique_id                    0
h1n1_worry                   0
h1n1_awareness               0
antiviral_medication         0
contact_avoidance            0
bought_face_mask             0
wash_hands_frequently        0
avoid_large_gatherings       0
reduced_outside_home_cont    0
avoid_touch_face             0
chronic_medic_condition      0
cont_child_undr_6_mnths      0
is_health_worker             0
is_h1n1_vacc_effective       0
is_h1n1_risky                0
sick_from_h1n1_vacc          0
is_seas_vacc_effective       0
is_seas_risky                0
sick_from_seas_vacc          0
age_bracket                  0
qualification                0
race                         0
sex                          0
marital_status               0
housing_status               0
employment                   0
census_msa                   0
no_of_adults                 0
no_of_children               0
h1n1_vaccine                 0
dtype: int64

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23426 entries, 0 to 26706
Data columns (total 30 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   unique_id                  23426 non-null  int64  
 1   h1n1_worry                 23426 non-null  float64
 2   h1n1_awareness             23426 non-null  float64
 3   antiviral_medication       23426 non-null  float64
 4   contact_avoidance          23426 non-null  float64
 5   bought_face_mask           23426 non-null  float64
 6   wash_hands_frequently      23426 non-null  float64
 7   avoid_large_gatherings     23426 non-null  float64
 8   reduced_outside_home_cont  23426 non-null  float64
 9   avoid_touch_face           23426 non-null  float64
 10  chronic_medic_condition    23426 non-null  float64
 11  cont_child_undr_6_mnths    23426 non-null  float64
 12  is_health_worker           23426 non-null  float64
 13  is_h1n1_vacc_effective     23426 non-null  flo

# Encoding

In [19]:
df=pd.get_dummies(columns=["age_bracket","qualification","race","sex","marital_status","housing_status","employment","census_msa"],data=df)

In [20]:
df["h1n1_vaccine"].value_counts

<bound method IndexOpsMixin.value_counts of 0        0
1        0
2        0
3        0
4        0
        ..
26700    0
26701    0
26702    0
26703    0
26706    0
Name: h1n1_vaccine, Length: 23426, dtype: int64>

In [21]:
x=df.drop(["h1n1_vaccine"],axis=1)
y=df["h1n1_vaccine"]

# Spliting the data

In [22]:
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.3,random_state=11)

# Logistic Regression

In [23]:
model_1=LogisticRegression()

In [24]:
model_1.fit(xtrain,ytrain)

In [25]:
model_1.score(xtrain,ytrain)

0.7997316745944627

In [26]:
pred=model_1.predict(xtest)

In [27]:
from sklearn import metrics
from sklearn.metrics import accuracy_score

In [28]:
accuracy_score(ytest,pred)

0.8020774046670461

In [29]:
accuracy_score(ytest,pred)*100

80.2077404667046

In [31]:
comp=pd.DataFrame({'actual value':ytest, 'predicted value':pred})
comp

Unnamed: 0,actual value,predicted value
24002,0,0
53,0,0
706,0,0
26059,0,0
7814,0,0
...,...,...
20110,1,0
7919,1,0
1511,0,0
13978,0,0


# Decision Tree

In [32]:
from sklearn.tree import DecisionTreeClassifier
model_2=DecisionTreeClassifier(max_depth=3)
model_2.fit(xtrain,ytrain)

In [33]:
model_2.score(xtrain,ytrain)

0.7999756067813147

In [34]:
model_2.score(xtest,ytest)

0.8091918042117245

In [35]:
pred2=model_2.predict(xtest)

In [37]:
accuracy_score(ytest,pred2)*100

80.91918042117246

# Bagging Tree

In [39]:
from sklearn.ensemble import BaggingClassifier
model_3=BaggingClassifier(n_estimators=90,base_estimator=model_2)
model_3.fit(xtrain,ytrain)

In [40]:
model_3.score(xtrain,ytrain)

0.8080253689474326

In [41]:
model_3.score(xtest,ytest)

0.8180136596471258

In [42]:
pred3=model_3.predict(xtest)

In [43]:
accuracy_score(ytest,pred3)*100

81.80136596471257

# Ada Boosting

In [44]:
from sklearn.ensemble import AdaBoostClassifier
model_4=AdaBoostClassifier(n_estimators=27)
model_4.fit(xtrain,ytrain)

In [45]:
model_4.score(xtrain,ytrain)

0.8114404195633614

In [46]:
pred4=model_4.predict(xtest)

In [47]:
accuracy_score(ytest,pred4)*100

81.85828116107

# Gradient Boosting

In [48]:
from sklearn.ensemble import GradientBoostingClassifier
model_5=GradientBoostingClassifier(n_estimators=27)
model_5.fit(xtrain,ytrain)

In [49]:
model_5.score(xtrain,ytrain)

0.8144285888522991

In [50]:
pred5=model_5.predict(xtest)

In [51]:
accuracy_score(ytest,pred5)*100

81.90096755833808

# Random Forest

In [53]:
from sklearn.ensemble import RandomForestClassifier
model_6=RandomForestClassifier(n_estimators=50)
model_6.fit(xtrain,ytrain)

In [54]:
model_6.score(xtrain,ytrain)

0.9995121356262959

In [55]:
model_6.score(xtest,ytest)

0.8168753557199773

In [56]:
pred6=model_6.predict(xtest)

In [57]:
accuracy_score(ytest,pred6)*100

81.68753557199773