In [2]:
#!/usr/bin/env python3
#### Import all the required libraries
import pandas as pd #### Library for working with large tabular datasets
import numpy as np #### Library for performing numerical calculations
import matplotlib.pyplot as plt #### Basic library for plotting graphs
#### Configure Matplotlib to render plots inline, directly below the cell that creates them
%matplotlib inline 
plt.rcParams['figure.figsize'] = (12, 12) ### Default figure size (width, height) in inches for every plot

In [3]:
# Load the dataset from the CSV file; the path is relative to the working directory.
data = pd.read_csv(filepath_or_buffer='h1n1_vaccine_prediction.csv')

In [4]:
# Print each column's name, non-null count and dtype to see which columns have missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 34 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   unique_id                  26707 non-null  int64  
 1   h1n1_worry                 26615 non-null  float64
 2   h1n1_awareness             26591 non-null  float64
 3   antiviral_medication       26636 non-null  float64
 4   contact_avoidance          26499 non-null  float64
 5   bought_face_mask           26688 non-null  float64
 6   wash_hands_frequently      26665 non-null  float64
 7   avoid_large_gatherings     26620 non-null  float64
 8   reduced_outside_home_cont  26625 non-null  float64
 9   avoid_touch_face           26579 non-null  float64
 10  dr_recc_h1n1_vacc          24547 non-null  float64
 11  dr_recc_seasonal_vacc      24547 non-null  float64
 12  chronic_medic_condition    25736 non-null  float64
 13  cont_child_undr_6_mnths    25887 non-null  flo

In [10]:
# Keep only the columns used in the rest of the notebook; the column order
# below is the order of the resulting frame.
data = data[[
    # row identifier
    'unique_id',
    # H1N1 worry / awareness
    'h1n1_worry', 'h1n1_awareness',
    # behavioural indicators
    'antiviral_medication', 'contact_avoidance', 'bought_face_mask',
    'wash_hands_frequently', 'avoid_large_gatherings',
    'reduced_outside_home_cont', 'avoid_touch_face',
    # opinions about the H1N1 vaccine
    'is_h1n1_vacc_effective', 'is_h1n1_risky', 'sick_from_h1n1_vacc',
    # opinions about the seasonal vaccine
    'is_seas_vacc_effective', 'is_seas_risky', 'sick_from_seas_vacc',
    # demographics and household
    'age_bracket', 'race', 'sex', 'census_msa',
    'no_of_adults', 'no_of_children',
    # last column of the selection
    'h1n1_vaccine',
]]


In [22]:
#### Let's ensure our data doesn't contain any null values
def nans(data):
    """Return the rows of `data` that contain at least one null value."""
    return data[data.isnull().any(axis=1)]


# Rows that have a missing value in at least one column.
tmp = nans(data)
# Pass the row labels by keyword: the positional `axis` argument of
# DataFrame.drop (`drop(labels, 0)`) was deprecated in pandas 1.3 and
# raises a TypeError from pandas 2.0 onwards.
data = data.drop(index=tmp.index)


In [32]:
# Re-inspect the frame after removing rows with nulls: every column should now
# report the same non-null count as the number of entries.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25238 entries, 0 to 26706
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   unique_id                  25238 non-null  int64  
 1   h1n1_worry                 25238 non-null  float64
 2   h1n1_awareness             25238 non-null  float64
 3   antiviral_medication       25238 non-null  float64
 4   contact_avoidance          25238 non-null  float64
 5   bought_face_mask           25238 non-null  float64
 6   wash_hands_frequently      25238 non-null  float64
 7   avoid_large_gatherings     25238 non-null  float64
 8   reduced_outside_home_cont  25238 non-null  float64
 9   avoid_touch_face           25238 non-null  float64
 10  is_h1n1_vacc_effective     25238 non-null  float64
 11  is_h1n1_risky              25238 non-null  float64
 12  sick_from_h1n1_vacc        25238 non-null  float64
 13  is_seas_vacc_effective     25238 non-null  flo

In [30]:
# List the distinct labels in the census_msa column (needed verbatim for the encoding step).
data['census_msa'].unique()

array(['Non-MSA', 'MSA, Not Principle  City', 'MSA, Principle City'],
      dtype=object)

In [31]:
### Encode the string-valued demographic columns (age_bracket, race, sex,
### census_msa) as integer codes. The keys must match the labels in the data
### exactly -- note the double space in 'MSA, Not Principle  City'.
### NOTE(review): the race codes are arbitrary labels, not an ordering; consider
### one-hot encoding if a downstream model would treat them as magnitudes.
clean_up_categoricals = {
    'age_bracket': {
        '18 - 34 Years': 1,
        '35 - 44 Years': 2,
        '45 - 54 Years': 3,
        '55 - 64 Years': 4,
        '65+ Years': 5,
    },
    'race': {"White": 1, "Black": 2, "Hispanic": 3, "Other or Multiple": 4},
    'sex': {'Female': 0, 'Male': 1},
    'census_msa': {
        'Non-MSA': 1,
        'MSA, Not Principle  City': 2,
        'MSA, Principle City': 3,
    },
}
# Reassign instead of using inplace=True so the step does not mutate a frame
# that other names may still reference. infer_objects() makes the
# object -> integer conversion explicit: the implicit downcasting that
# replace() used to perform on object columns is deprecated since pandas 2.2.
data = data.replace(clean_up_categoricals).infer_objects()
data.head(10)

Unnamed: 0,unique_id,h1n1_worry,h1n1_awareness,antiviral_medication,contact_avoidance,bought_face_mask,wash_hands_frequently,avoid_large_gatherings,reduced_outside_home_cont,avoid_touch_face,...,is_seas_vacc_effective,is_seas_risky,sick_from_seas_vacc,age_bracket,race,sex,census_msa,no_of_adults,no_of_children,h1n1_vaccine
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,2.0,1.0,2.0,4,1,0,1,0.0,0.0,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,4.0,2.0,4.0,2,1,1,2,0.0,0.0,0
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,4.0,1.0,2.0,1,1,1,2,2.0,0.0,0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,5.0,4.0,1.0,5,1,0,3,0.0,0.0,0
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,3.0,1.0,4.0,3,1,0,2,1.0,0.0,0
5,5,3.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,5.0,4.0,4.0,5,1,1,3,2.0,3.0,0
6,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,2.0,1.0,4,1,1,2,0.0,0.0,0
7,7,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,4.0,2.0,1.0,3,1,0,1,2.0,0.0,1
8,8,0.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,4.0,2.0,1.0,3,1,1,2,1.0,0.0,0
9,9,2.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,4.0,2.0,2.0,4,1,1,2,0.0,0.0,0
