###Subjects receiving the same vaccine often show different levels of immune responses 
and some may even present adverse side effects to the vaccine. Systems vaccinology can combine omics data and machine learning techniques to obtain highly predictive signatures of vaccine immunogenicity and reactogenicity. Currently, several machine learning methods are already available to researchers with no background in bioinformatics.

#Problem Statement:

###Predict how likely it is that the people will take an H1N1 flu vaccine using Logistic Regression.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from statsmodels.tools.eval_measures import rmse
from scipy.stats import shapiro
from statsmodels.stats.outliers_influence import variance_inflation_factor
from warnings import filterwarnings

In [3]:
filterwarnings("ignore")
pd.options.display.max_columns = None

# Data Import

In [4]:
df = pd.read_csv("h1n1_vaccine_prediction.csv")
df.head()

Unnamed: 0,unique_id,h1n1_worry,h1n1_awareness,antiviral_medication,contact_avoidance,bought_face_mask,wash_hands_frequently,avoid_large_gatherings,reduced_outside_home_cont,avoid_touch_face,dr_recc_h1n1_vacc,dr_recc_seasonal_vacc,chronic_medic_condition,cont_child_undr_6_mnths,is_health_worker,has_health_insur,is_h1n1_vacc_effective,is_h1n1_risky,sick_from_h1n1_vacc,is_seas_vacc_effective,is_seas_risky,sick_from_seas_vacc,age_bracket,qualification,race,sex,income_level,marital_status,housing_status,employment,census_msa,no_of_adults,no_of_children,h1n1_vaccine
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,1.0,2.0,2.0,1.0,2.0,55 - 64 Years,< 12 Years,White,Female,Below Poverty,Not Married,Own,Not in Labor Force,Non-MSA,0.0,0.0,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0,4.0,4.0,4.0,2.0,4.0,35 - 44 Years,12 Years,White,Male,Below Poverty,Not Married,Rent,Employed,"MSA, Not Principle City",0.0,0.0,0
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,,1.0,0.0,0.0,,3.0,1.0,1.0,4.0,1.0,2.0,18 - 34 Years,College Graduate,White,Male,"<= $75,000, Above Poverty",Not Married,Own,Employed,"MSA, Not Principle City",2.0,0.0,0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,,3.0,3.0,5.0,5.0,4.0,1.0,65+ Years,12 Years,White,Female,Below Poverty,Not Married,Rent,Not in Labor Force,"MSA, Principle City",0.0,0.0,0
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,3.0,3.0,2.0,3.0,1.0,4.0,45 - 54 Years,Some College,White,Female,"<= $75,000, Above Poverty",Married,Own,Employed,"MSA, Not Principle City",1.0,0.0,0


# Data Cleaning

## No of records

In [5]:
len(df)

26707

## Null Values

In [6]:
df.isnull().sum()

unique_id                        0
h1n1_worry                      92
h1n1_awareness                 116
antiviral_medication            71
contact_avoidance              208
bought_face_mask                19
wash_hands_frequently           42
avoid_large_gatherings          87
reduced_outside_home_cont       82
avoid_touch_face               128
dr_recc_h1n1_vacc             2160
dr_recc_seasonal_vacc         2160
chronic_medic_condition        971
cont_child_undr_6_mnths        820
is_health_worker               804
has_health_insur             12274
is_h1n1_vacc_effective         391
is_h1n1_risky                  388
sick_from_h1n1_vacc            395
is_seas_vacc_effective         462
is_seas_risky                  514
sick_from_seas_vacc            537
age_bracket                      0
qualification                 1407
race                             0
sex                              0
income_level                  4423
marital_status                1408
housing_status      

## Filling null values with mode

In [7]:
df['h1n1_worry'].fillna(float(df.h1n1_worry.mode()),inplace=True)

In [8]:
df['h1n1_awareness'].fillna(float(df.h1n1_awareness.mode()),inplace=True)

In [9]:
df['antiviral_medication'].fillna(float(df.antiviral_medication.mode()),inplace=True)

In [10]:
df['contact_avoidance'].fillna(float(df.contact_avoidance.mode()),inplace=True)

In [11]:
df['bought_face_mask'].fillna(float(df.bought_face_mask.mode()),inplace=True)

In [12]:
df['wash_hands_frequently'].fillna(float(df.wash_hands_frequently.mode()),inplace=True)

In [13]:
df['avoid_large_gatherings'].fillna(float(df.avoid_large_gatherings.mode()),inplace=True)

In [14]:
df['reduced_outside_home_cont'].fillna(float(df.reduced_outside_home_cont.mode()),inplace=True)

In [15]:
df['avoid_touch_face'].fillna(float(df.avoid_touch_face.mode()),inplace=True)

In [16]:
df['dr_recc_h1n1_vacc'].fillna(float(df.dr_recc_h1n1_vacc.mode()),inplace=True)

In [17]:
df['dr_recc_seasonal_vacc'].fillna(float(df.dr_recc_seasonal_vacc.mode()),inplace=True)

In [18]:
df['chronic_medic_condition'].fillna(float(df.chronic_medic_condition.mode()),inplace=True)

In [19]:
df['cont_child_undr_6_mnths'].fillna(float(df.cont_child_undr_6_mnths.mode()),inplace=True)

In [20]:
df['is_health_worker'].fillna(float(df.is_health_worker.mode()),inplace=True)

## Dropping column due to very large number of missing values

In [21]:
df = df.drop('has_health_insur', 1)

## Null Check

In [22]:
df.isnull().sum()

unique_id                       0
h1n1_worry                      0
h1n1_awareness                  0
antiviral_medication            0
contact_avoidance               0
bought_face_mask                0
wash_hands_frequently           0
avoid_large_gatherings          0
reduced_outside_home_cont       0
avoid_touch_face                0
dr_recc_h1n1_vacc               0
dr_recc_seasonal_vacc           0
chronic_medic_condition         0
cont_child_undr_6_mnths         0
is_health_worker                0
is_h1n1_vacc_effective        391
is_h1n1_risky                 388
sick_from_h1n1_vacc           395
is_seas_vacc_effective        462
is_seas_risky                 514
sick_from_seas_vacc           537
age_bracket                     0
qualification                1407
race                            0
sex                             0
income_level                 4423
marital_status               1408
housing_status               2042
employment                   1463
census_msa    

In [23]:
df.head()

Unnamed: 0,unique_id,h1n1_worry,h1n1_awareness,antiviral_medication,contact_avoidance,bought_face_mask,wash_hands_frequently,avoid_large_gatherings,reduced_outside_home_cont,avoid_touch_face,dr_recc_h1n1_vacc,dr_recc_seasonal_vacc,chronic_medic_condition,cont_child_undr_6_mnths,is_health_worker,is_h1n1_vacc_effective,is_h1n1_risky,sick_from_h1n1_vacc,is_seas_vacc_effective,is_seas_risky,sick_from_seas_vacc,age_bracket,qualification,race,sex,income_level,marital_status,housing_status,employment,census_msa,no_of_adults,no_of_children,h1n1_vaccine
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,2.0,2.0,1.0,2.0,55 - 64 Years,< 12 Years,White,Female,Below Poverty,Not Married,Own,Not in Labor Force,Non-MSA,0.0,0.0,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,5.0,4.0,4.0,4.0,2.0,4.0,35 - 44 Years,12 Years,White,Male,Below Poverty,Not Married,Rent,Employed,"MSA, Not Principle City",0.0,0.0,0
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,1.0,1.0,4.0,1.0,2.0,18 - 34 Years,College Graduate,White,Male,"<= $75,000, Above Poverty",Not Married,Own,Employed,"MSA, Not Principle City",2.0,0.0,0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,3.0,3.0,5.0,5.0,4.0,1.0,65+ Years,12 Years,White,Female,Below Poverty,Not Married,Rent,Not in Labor Force,"MSA, Principle City",0.0,0.0,0
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,2.0,3.0,1.0,4.0,45 - 54 Years,Some College,White,Female,"<= $75,000, Above Poverty",Married,Own,Employed,"MSA, Not Principle City",1.0,0.0,0


## Replacing missing values again

In [24]:
df['is_h1n1_vacc_effective'].fillna(float(df.is_h1n1_vacc_effective.mode()),inplace=True)

In [25]:
df['is_h1n1_risky'].fillna(float(df.is_h1n1_risky.mode()),inplace=True)

In [26]:
df['sick_from_h1n1_vacc'].fillna(float(df.sick_from_h1n1_vacc.mode()),inplace=True)

In [27]:
df['is_seas_vacc_effective'].fillna(float(df.is_seas_vacc_effective.mode()),inplace=True)

In [28]:
df['is_seas_risky'].fillna(float(df.is_seas_risky.mode()),inplace=True)

In [29]:
df['sick_from_seas_vacc'].fillna(float(df.sick_from_seas_vacc.mode()),inplace=True)

## Replacing categorical data with float

In [30]:
df["qualification"].replace({"< 12 Years": "1.0", "12 Years": "2.0", "Some College":"3.0", "College Graduate":"4.0"}, inplace=True)

In [31]:
df['qualification'].fillna(float(df.qualification.mode()),inplace=True)    

In [32]:
df["age_bracket"].replace({"18 - 34 Years": "1.0", "35 - 44 Years": "2.0", "45 - 54 Years":"3.0", "55 - 64 Years":"4.0", "65+ Years":"5.0"}, inplace=True)

In [33]:
df['age_bracket'].fillna(float(df.age_bracket.mode()),inplace=True)    

In [34]:
df["income_level"].replace({"Below Poverty": "1.0", "<= $75,000, Above Poverty": "2.0", "> $75,000":"3.0"}, inplace=True)

In [35]:
df['income_level'].fillna(float(df.income_level.mode()),inplace=True)

In [36]:
df["marital_status"].replace({"Married": "2.0", "Not Married": "1.0"}, inplace=True)

In [37]:
df['marital_status'].fillna(float(df.marital_status.mode()),inplace=True)

In [38]:
df["housing_status"].replace({"Own": "2.0", "Rent": "1.0"}, inplace=True)

In [39]:
df['housing_status'].fillna(float(df.housing_status.mode()),inplace=True)

In [40]:
df["employment"].replace({"Not in Labor Force": "1.0", "Unemployed": "2.0", "Employed" : "3.0"}, inplace=True)

In [41]:
df['employment'].fillna(float(df.employment.mode()),inplace=True)

In [42]:
df["race"].replace({"White": "1.0", "Black": "2.0", "Other or Multiple" : "3.0","Hispanic":"4.0"}, inplace=True)

In [43]:
df['race'].fillna(float(df.race.mode()),inplace=True)

In [44]:
df["sex"].replace({"Female": "1.0", "Male": "2.0"}, inplace=True)

In [45]:
df['sex'].fillna(float(df.sex.mode()),inplace=True)

In [46]:
df['no_of_adults'].fillna(float(df.no_of_adults.mode()),inplace=True)

In [47]:
df['no_of_children'].fillna(float(df.no_of_children.mode()),inplace=True)

In [48]:
df.isnull().sum()

unique_id                    0
h1n1_worry                   0
h1n1_awareness               0
antiviral_medication         0
contact_avoidance            0
bought_face_mask             0
wash_hands_frequently        0
avoid_large_gatherings       0
reduced_outside_home_cont    0
avoid_touch_face             0
dr_recc_h1n1_vacc            0
dr_recc_seasonal_vacc        0
chronic_medic_condition      0
cont_child_undr_6_mnths      0
is_health_worker             0
is_h1n1_vacc_effective       0
is_h1n1_risky                0
sick_from_h1n1_vacc          0
is_seas_vacc_effective       0
is_seas_risky                0
sick_from_seas_vacc          0
age_bracket                  0
qualification                0
race                         0
sex                          0
income_level                 0
marital_status               0
housing_status               0
employment                   0
census_msa                   0
no_of_adults                 0
no_of_children               0
h1n1_vac

In [49]:
df.head()

Unnamed: 0,unique_id,h1n1_worry,h1n1_awareness,antiviral_medication,contact_avoidance,bought_face_mask,wash_hands_frequently,avoid_large_gatherings,reduced_outside_home_cont,avoid_touch_face,dr_recc_h1n1_vacc,dr_recc_seasonal_vacc,chronic_medic_condition,cont_child_undr_6_mnths,is_health_worker,is_h1n1_vacc_effective,is_h1n1_risky,sick_from_h1n1_vacc,is_seas_vacc_effective,is_seas_risky,sick_from_seas_vacc,age_bracket,qualification,race,sex,income_level,marital_status,housing_status,employment,census_msa,no_of_adults,no_of_children,h1n1_vaccine
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,2.0,2.0,1.0,2.0,4.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,Non-MSA,0.0,0.0,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,5.0,4.0,4.0,4.0,2.0,4.0,2.0,2.0,1.0,2.0,1.0,1.0,1.0,3.0,"MSA, Not Principle City",0.0,0.0,0
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,1.0,1.0,4.0,1.0,2.0,1.0,4.0,1.0,2.0,2.0,1.0,2.0,3.0,"MSA, Not Principle City",2.0,0.0,0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,3.0,3.0,5.0,5.0,4.0,1.0,5.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,"MSA, Principle City",0.0,0.0,0
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,2.0,3.0,1.0,4.0,3.0,3.0,1.0,1.0,2.0,2.0,2.0,3.0,"MSA, Not Principle City",1.0,0.0,0


In [50]:
df.h1n1_vaccine.isnull().sum()

0

In [51]:
df = df.drop('census_msa', 1)

# Modeling

In [52]:
X=df.drop("h1n1_vaccine",axis=1)
y=df["h1n1_vaccine"]

In [53]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.70, random_state=3)

In [54]:
model=LogisticRegression()
model.fit(X_train,y_train)

LogisticRegression()

In [55]:
predictions=model.predict(X_test)

In [56]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,predictions)

0.8048141214228404

In [57]:
from sklearn import metrics

In [58]:
print(metrics.classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.83      0.94      0.88     14735
           1       0.58      0.30      0.39      3960

    accuracy                           0.80     18695
   macro avg       0.70      0.62      0.64     18695
weighted avg       0.78      0.80      0.78     18695



In [59]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,predictions)

array([[13860,   875],
       [ 2774,  1186]], dtype=int64)