In [99]:
#!/usr/bin/env python3
#### Import all the required libraries
import pandas as pd #### Library for working with large datasets
import numpy as np #### Library for performing numerical calculations
import matplotlib.pyplot as plt #### Basic library for plotting graphs
#### Configure Matplotlib to render plots inline in the notebook
%matplotlib inline 
plt.rcParams['figure.figsize'] = (12, 12) ### Default figure size applied to every plot

In [100]:
#### Load the H1N1 vaccine dataset from the CSV file (path relative to the working directory)
data = pd.read_csv(filepath_or_buffer='h1n1_vaccine_prediction.csv')

In [101]:
#### Summarise the freshly loaded frame: column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 34 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   unique_id                  26707 non-null  int64  
 1   h1n1_worry                 26615 non-null  float64
 2   h1n1_awareness             26591 non-null  float64
 3   antiviral_medication       26636 non-null  float64
 4   contact_avoidance          26499 non-null  float64
 5   bought_face_mask           26688 non-null  float64
 6   wash_hands_frequently      26665 non-null  float64
 7   avoid_large_gatherings     26620 non-null  float64
 8   reduced_outside_home_cont  26625 non-null  float64
 9   avoid_touch_face           26579 non-null  float64
 10  dr_recc_h1n1_vacc          24547 non-null  float64
 11  dr_recc_seasonal_vacc      24547 non-null  float64
 12  chronic_medic_condition    25736 non-null  float64
 13  cont_child_undr_6_mnths    25887 non-null  flo

In [102]:
#### Keep only the columns used in this analysis: the row identifier,
#### the candidate predictor columns and the 'h1n1_vaccine' column.
data = data.loc[:, [
    'unique_id',
    'h1n1_worry', 'h1n1_awareness',
    'antiviral_medication', 'contact_avoidance', 'bought_face_mask',
    'wash_hands_frequently', 'avoid_large_gatherings',
    'reduced_outside_home_cont', 'avoid_touch_face',
    'is_h1n1_vacc_effective', 'is_h1n1_risky', 'sick_from_h1n1_vacc',
    'is_seas_vacc_effective', 'is_seas_risky', 'sick_from_seas_vacc',
    'age_bracket', 'race', 'sex', 'census_msa',
    'no_of_adults', 'no_of_children',
    'h1n1_vaccine',
]]


In [103]:
#### Make sure the data contains no null values: drop every row that has a
#### missing value in any column. dropna() keeps the original index labels,
#### just like dropping the offending rows by index did, but it does not rely
#### on passing `axis` positionally to DataFrame.drop, which current pandas
#### versions reject with a TypeError.
data = data.dropna(axis=0, how='any')


In [104]:
#### Re-check the frame after column selection and null removal:
#### every remaining column should report the same non-null count
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25238 entries, 0 to 26706
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   unique_id                  25238 non-null  int64  
 1   h1n1_worry                 25238 non-null  float64
 2   h1n1_awareness             25238 non-null  float64
 3   antiviral_medication       25238 non-null  float64
 4   contact_avoidance          25238 non-null  float64
 5   bought_face_mask           25238 non-null  float64
 6   wash_hands_frequently      25238 non-null  float64
 7   avoid_large_gatherings     25238 non-null  float64
 8   reduced_outside_home_cont  25238 non-null  float64
 9   avoid_touch_face           25238 non-null  float64
 10  is_h1n1_vacc_effective     25238 non-null  float64
 11  is_h1n1_risky              25238 non-null  float64
 12  sick_from_h1n1_vacc        25238 non-null  float64
 13  is_seas_vacc_effective     25238 non-null  flo

In [105]:
#### List the distinct raw values of census_msa so the encoding map can match them exactly
data['census_msa'].unique()

array(['Non-MSA', 'MSA, Not Principle  City', 'MSA, Principle City'],
      dtype=object)

In [106]:
### Encode the string-valued categorical columns as integer codes.
### The keys must match the raw values exactly; note the double space in
### 'MSA, Not Principle  City', which is how the value appears in the data.
### NOTE(review): 'race' and 'census_msa' receive arbitrary ordinal codes;
### confirm that an ordinal encoding is intended rather than one-hot columns.
clean_up_categoricals = {
    'age_bracket': {
        '18 - 34 Years': 1,
        '35 - 44 Years': 2,
        '45 - 54 Years': 3,
        '55 - 64 Years': 4,
        '65+ Years': 5,
    },
    'race': {"White": 1, "Black": 2, "Hispanic": 3, "Other or Multiple": 4},
    'sex': {'Female': 0, 'Male': 1},
    'census_msa': {
        'Non-MSA': 1,
        'MSA, Not Principle  City': 2,
        'MSA, Principle City': 3,
    },
}
### Rebind instead of mutating in place; the resulting frame is the same
data = data.replace(to_replace=clean_up_categoricals)
### Preview the first ten encoded rows
data.head(10)

Unnamed: 0,unique_id,h1n1_worry,h1n1_awareness,antiviral_medication,contact_avoidance,bought_face_mask,wash_hands_frequently,avoid_large_gatherings,reduced_outside_home_cont,avoid_touch_face,...,is_seas_vacc_effective,is_seas_risky,sick_from_seas_vacc,age_bracket,race,sex,census_msa,no_of_adults,no_of_children,h1n1_vaccine
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,2.0,1.0,2.0,4,1,0,1,0.0,0.0,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,4.0,2.0,4.0,2,1,1,2,0.0,0.0,0
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,4.0,1.0,2.0,1,1,1,2,2.0,0.0,0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,5.0,4.0,1.0,5,1,0,3,0.0,0.0,0
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,3.0,1.0,4.0,3,1,0,2,1.0,0.0,0
5,5,3.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,5.0,4.0,4.0,5,1,1,3,2.0,3.0,0
6,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,2.0,1.0,4,1,1,2,0.0,0.0,0
7,7,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,4.0,2.0,1.0,3,1,0,1,2.0,0.0,1
8,8,0.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,4.0,2.0,1.0,3,1,1,2,1.0,0.0,0
9,9,2.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,4.0,2.0,2.0,4,1,1,2,0.0,0.0,0


In [107]:
#### Extract the target vector y and the feature matrix X from the data
y = data['h1n1_vaccine'].values
#### Features are every column except the row identifier and the target.
#### Use the `columns` keyword: passing the axis positionally (`drop(labels, 1)`)
#### is rejected with a TypeError by current pandas versions.
X = data.drop(columns=['unique_id', 'h1n1_vaccine'])
#### Keep the feature names so selection results can be mapped back to columns
names = X.columns

In [108]:
#### Feature scaling: standardise every feature column of X
from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()
sc_X.fit(X)                  # learn the per-column scaling parameters
X_train = sc_X.transform(X)  # apply them to the same data
#### NOTE(review): the RFE cell fits on the unscaled X, and X_train is
#### reassigned by the later train/test split, so this scaled matrix
#### appears to be unused -- confirm whether RFE was meant to consume it.


In [112]:
#### Run RFE to find the top 5 features to use
from sklearn.feature_selection import RFE
#### LogisticRegression is imported here because this cell runs before the
#### cell that otherwise imports it; on a fresh kernel (Restart & Run All)
#### the missing import raises NameError.
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs')
#### n_features_to_select has to be passed by keyword: current scikit-learn
#### releases no longer accept it as a positional argument.
rfe = RFE(model, n_features_to_select=5)
#### NOTE(review): RFE is fitted on the unscaled X; the ranking is driven by
#### coefficient magnitudes, so confirm whether scaled features were intended.
fit = rfe.fit(X, y)
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)
Selcted_features = fit.support_  # boolean mask, one entry per column of X
RANKS = fit.ranking_             # rank 1 marks a selected feature
#### Collect (and print) the names of the selected columns, in column order
Selected = []
for name, keep, rank in zip(names, Selcted_features, RANKS):
    if keep:
        print(name, rank)
        Selected.append(name)



Num Features: 5
Selected Features: [False  True False False  True False  True False False  True  True False
 False False False False False False False False False]
Feature Ranking: [ 5  1  3 13  1 12  1  9  8  1  1 14 10  2  6  4 17  7 15 11 16]
h1n1_awareness 1
bought_face_mask 1
avoid_large_gatherings 1
is_h1n1_vacc_effective 1
is_h1n1_risky 1


In [113]:
#### Build the modelling data from the selected features and split it 80/20
from sklearn.model_selection import train_test_split

#### Target vector and the feature frame restricted to the selected columns
#### NOTE(review): X is rebound to the reduced frame here, so re-running the
#### RFE cell afterwards would operate on these columns only.
Y = data.loc[:, 'h1n1_vaccine'].values
X = data.loc[:, Selected]

#### Fixed random_state keeps the split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [114]:
#### Feature scaling: standardise the features using statistics learned
#### from the training split only, then apply the same transform to the
#### test split.
from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [115]:
# Create the logistic-regression classifier that is trained in the next cell;
# random_state is pinned so repeated runs behave the same way.
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(random_state=0)

In [116]:
#### Train the classifier on the scaled training features and their labels
classifier.fit(X=X_train, y=y_train)

LogisticRegression(random_state=0)

In [117]:
### Predict class labels for the held-out test set using the classifier's own decision rule
y_pred = classifier.predict(X=X_test)

In [118]:
### Predict with a custom decision threshold: label a sample 1 when the
### predicted probability of class 1 (second predict_proba column) is
### strictly greater than THRESHOLD, otherwise 0.
THRESHOLD = 0.18
y_pred1 = (classifier.predict_proba(X_test)[:, 1] > THRESHOLD).astype(int)

In [119]:
### Confusion matrices for the default predictions and for the custom
### threshold, to see how far the accuracy score can be trusted.
### Each printed row is one true class; the columns are the predicted classes.
from sklearn.metrics import confusion_matrix

labels = ['Pr 0', 'Pr 1']
cm = confusion_matrix(y_test, y_pred)
cm1 = confusion_matrix(y_test, y_pred1)

### Same layout for both matrices; only the second one gets a banner line
for banner, matrix in ((None, cm), ("###################-Custom Threshold", cm1)):
    if banner is not None:
        print(banner)
    print(*labels)
    for line in matrix:
        print(*line)

Pr 0 Pr 1
3742 239
756 311
###################-Custom Threshold
Pr 0 Pr 1
2505 1476
251 816


In [120]:
#### Classification reports (precision / recall / f1 per class) for the
#### default predictions and then for the custom-threshold predictions
from sklearn.metrics import classification_report

for banner, predictions in ((None, y_pred), ("###################-Custom Threshold", y_pred1)):
    if banner is not None:
        print(banner)
    print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       0.83      0.94      0.88      3981
           1       0.57      0.29      0.38      1067

    accuracy                           0.80      5048
   macro avg       0.70      0.62      0.63      5048
weighted avg       0.78      0.80      0.78      5048

###################-Custom Threshold
              precision    recall  f1-score   support

           0       0.91      0.63      0.74      3981
           1       0.36      0.76      0.49      1067

    accuracy                           0.66      5048
   macro avg       0.63      0.70      0.61      5048
weighted avg       0.79      0.66      0.69      5048



In [98]:
#### Classification reports for the default and custom-threshold predictions.
#### NOTE(review): this cell repeats the previous classification-report cell;
#### its execution count (98) is lower than that of the cells above and its
#### saved output differs slightly, so it looks like a leftover from an
#### earlier run -- confirm whether it can be removed.
from sklearn.metrics import classification_report

default_report = classification_report(y_test, y_pred)
custom_report = classification_report(y_test, y_pred1)
print(default_report)
print("###################-Custom Threshold")
print(custom_report)


              precision    recall  f1-score   support

           0       0.83      0.95      0.88      3981
           1       0.58      0.27      0.37      1067

    accuracy                           0.80      5048
   macro avg       0.70      0.61      0.63      5048
weighted avg       0.78      0.80      0.78      5048

###################-Custom Threshold
              precision    recall  f1-score   support

           0       0.91      0.65      0.76      3981
           1       0.37      0.77      0.50      1067

    accuracy                           0.67      5048
   macro avg       0.64      0.71      0.63      5048
weighted avg       0.80      0.67      0.70      5048

