## EDA for data understanding and visualisations

In [None]:
# Imports and notebook-wide settings

import sys
# Make the parent folder importable so that `modeling.config` below resolves
# (only affects this kernel's sys.path, not any system variables)
sys.path.append("..")
import pandas as pd
import numpy as np
import warnings
import mlflow
from modeling.config import TRACKING_URI, EXPERIMENT_NAME

pd.set_option('display.max_columns', None)

RSEED = 42
# Plotting libraries

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px  # requires plotly to be installed (pip install plotly)
import plotly.graph_objects as go

In [None]:
# We need to add plotly to our requirements

In [None]:
# Reading in the cleaned data set (path is relative to this notebook's folder)
df = pd.read_csv('../data/Flu_Shot_Data_cleaned_1.csv')

In [None]:
# Overview of the loaded frame: columns, dtypes and non-null counts
df.info()

In [None]:
# Relative frequency of each class for both target variables

for target_col in ('h1n1_vaccine', 'seasonal_vaccine'):
    print(df[target_col].value_counts(normalize=True))

In [None]:
# Absolute number of rows per H1N1 vaccine status
sns.countplot(data=df, x='h1n1_vaccine')

In [None]:
# Creating plots for the distribution of our target variables

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(11,3))

# One panel per target variable: (axis, column, title)
target_panels = [
    (ax1, 'h1n1_vaccine', 'Distribution of H1N1 vaccinations (%)'),
    (ax2, 'seasonal_vaccine', 'Distribution of seasonal flu vaccinations (%)'),
]

for panel_ax, target_col, panel_title in target_panels:
    # value_counts() orders by frequency, so the bar order would flip as soon as
    # "vaccinated" became the majority class. Pinning the order to 0 -> 1 keeps
    # the hard-coded tick labels below attached to the right bars.
    target_shares = (
        df[target_col]
        .value_counts(normalize=True)
        .reindex([0, 1], fill_value=0)
    )
    target_shares.plot(kind='bar', ax=panel_ax, color=['dimgrey', 'forestgreen'], rot=0)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('')
    panel_ax.set_xticklabels(['Not vaccinated', 'Vaccinated'])
    panel_ax.set_ylim([0,1]) # setting the limits for the y-axis

fig.subplots_adjust(hspace=0.3)
plt.show()

In [None]:
# Absolute counts of seasonal vaccine status within each H1N1 vaccine status
df.groupby('h1n1_vaccine')['seasonal_vaccine'].value_counts(sort=False)

In [None]:
# Creating plots for the distribution of overlaps between H1N1 and seasonal flu vaccines
#
# Each panel shows, for every status of the grouping vaccine (x-axis), the share
# of people without / with the other vaccine (bar colour, see legend). The shares
# within one x-axis group add up to 1.

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,3))

status_names = {0: 'Not vaccinated', 1: 'Vaccinated'}

# (axis, grouping column, plotted column, title, x-axis caption, legend caption)
overlap_panels = [
    (ax1, 'h1n1_vaccine', 'seasonal_vaccine',
     'Seasonal vaccinations by H1N1 vaccine status', 'H1N1 vaccine', 'Seasonal vaccine'),
    (ax2, 'seasonal_vaccine', 'h1n1_vaccine',
     'H1N1 vaccinations by seasonal vaccine status', 'Seasonal vaccine', 'H1N1 vaccine'),
]

for panel_ax, group_col, value_col, panel_title, group_caption, value_caption in overlap_panels:
    # Row-normalised crosstab: one row per group, one column per plotted status.
    # reindex() fixes the 0 -> 1 order so labels and colours never depend on how
    # value_counts()/groupby happen to order the result.
    overlap_shares = (
        pd.crosstab(df[group_col], df[value_col], normalize='index')
        .reindex(index=[0, 1], columns=[0, 1], fill_value=0)
        .rename(index=status_names, columns=status_names)
    )
    overlap_shares.plot(kind='bar', ax=panel_ax, color=['dimgrey', 'forestgreen'], rot=0)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(group_caption)
    panel_ax.set_ylim([0,1])
    panel_ax.legend(title=value_caption)

fig.subplots_adjust(hspace=0.3)
plt.show()

- The share of people with seasonal flu vaccination is higher among people that did get an H1N1 vaccine.
- The share of people with H1N1 vaccination is higher among people that also got the seasonal flu vaccine.
- The graphs show that there seems to be a correlation between both types of vaccinations. 

- The graphs are not self-explanatory. There needs to be a clearer indication of which bars belong to which group.

## Influence of doctor's recommendations 

In [None]:
# Creating plots for showing if vaccinations have been recommended by a doctor
#
# Each panel shows, per recommendation status (x-axis), the share of people
# without / with the respective vaccine (bar colour, see legend). Both panels
# use the same labelling scheme.

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,3))

status_names = {0: 'Not vaccinated', 1: 'Vaccinated'}
# assumes the recommendation columns are coded 0 = no, 1 = yes - TODO confirm;
# any other code is left unrenamed and stays visible on the x-axis
recommendation_names = {0: 'No recommendation', 1: 'Recommendation'}

# (axis, recommendation column, vaccine column, title)
recommendation_panels = [
    (ax1, 'doctor_recc_h1n1', 'h1n1_vaccine', 'H1N1 vaccine status by doctor recommendation'),
    (ax2, 'doctor_recc_seasonal', 'seasonal_vaccine', 'Seasonal vaccine status by doctor recommendation'),
]

for panel_ax, reco_col, vaccine_col, panel_title in recommendation_panels:
    # Row-normalised crosstab: shares within one recommendation group sum to 1.
    # The vaccine columns are pinned to 0 -> 1 so the colours keep their meaning.
    reco_shares = (
        pd.crosstab(df[reco_col], df[vaccine_col], normalize='index')
        .reindex(columns=[0, 1], fill_value=0)
        .rename(index=recommendation_names, columns=status_names)
    )
    reco_shares.plot(kind='bar', ax=panel_ax, color=['dimgrey', 'forestgreen'], rot=0)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('')
    panel_ax.set_ylim([0,1])
    panel_ax.legend(title='Vaccine status')

fig.subplots_adjust(hspace=0.3)
plt.show()

- Recommendations of the respective vaccine are important for getting the vaccine.
- Only a small proportion of the people who did not get a recommendation got the vaccine.
- For seasonal flu vaccinations, the share of people that did not get a recommendation from a doctor is higher than for H1N1.

- Some of the columns are converted to ordered categoricals so that their levels are always displayed in the same order.
- Otherwise, the plots adopt different strategies for ordering the levels (`sort=False` alone does not work).

In [None]:
# Turn the opinion / concern / knowledge columns into ordered categoricals
ordered_columns = [
    'opinion_h1n1_risk',
    'opinion_seas_risk',
    'h1n1_concern',
    'h1n1_knowledge',
]

for column_name in ordered_columns:
    df[column_name] = pd.Categorical(df[column_name], ordered=True)

### Investigating Vaccinations by risk awareness

In [None]:
# Splitting the dataset into vaccinated and non vaccinated for each vaccine type
plot_h1n1_no = df.loc[df['h1n1_vaccine'] == 0]
plot_h1n1_yes = df.loc[df['h1n1_vaccine'] == 1]
plot_seasonal_no = df.loc[df['seasonal_vaccine'] == 0]
plot_seasonal_yes = df.loc[df['seasonal_vaccine'] == 1]

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15,6))

# One entry per panel: (axis, subset, risk column, bar colour, title).
# Top row: H1N1, bottom row: seasonal flu; left: not vaccinated, right: vaccinated.
risk_panels = [
    (ax1, plot_h1n1_no, 'opinion_h1n1_risk', 'dimgrey', 'Non H1N1 vaccinated: Opinion of risk (%)'),
    (ax2, plot_h1n1_yes, 'opinion_h1n1_risk', 'forestgreen', 'H1N1 vaccinated: Opinion of risk (%)'),
    (ax3, plot_seasonal_no, 'opinion_seas_risk', 'dimgrey', 'Seasonal non vaccinated: Opinion of risk (%)'),
    (ax4, plot_seasonal_yes, 'opinion_seas_risk', 'forestgreen', 'Seasonal vaccinated: Opinion of risk  (%)'),
]

for panel_ax, subset, risk_col, bar_color, panel_title in risk_panels:
    risk_shares = subset[risk_col].value_counts(normalize=True, sort=False)
    risk_shares.plot(kind='bar', ax=panel_ax, rot=0, color=bar_color)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('')
    panel_ax.set_ylim([0,1])  # same y-range on every panel so they can be compared

fig.subplots_adjust(hspace=0.3)
plt.show()

In [None]:
# Plotting the overall risk perception for H1N1 and for seasonal flu

fig, (ax1, ax2)  = plt.subplots(1, 2, figsize=(15,5))

# Shared by both panels so labels and colours cannot drift apart
risk_level_labels = ["1 - Very low", "2 - Somewhat low", "3 - Don't know",
                     "4 - Somewhat high", "5 - Very high"]
risk_level_colors = ['peachpuff', 'lightpink', 'lightgray', 'mediumorchid', 'indigo']

# (axis, column, title)
perception_panels = [
    (ax1, 'opinion_h1n1_risk', 'Perceived risk of H1N1 (%)'),
    (ax2, 'opinion_seas_risk', 'Perceived risk of seasonal flu (%)'),
]

for panel_ax, risk_col, panel_title in perception_panels:
    # sort_index() guarantees ascending level order (1 -> 5), which the
    # positional labels and colours rely on, even if the column has not been
    # converted to an ordered categorical beforehand.
    risk_shares = df[risk_col].value_counts(normalize=True, sort=False).sort_index()
    risk_shares.plot(kind='bar', ax=panel_ax, rot=0, color=risk_level_colors)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('')
    panel_ax.set_xticklabels(risk_level_labels, fontsize='small')
    panel_ax.set_ylim([0,0.45]) # setting the limits for the y-axis


fig.subplots_adjust(hspace=0.3)
plt.show()

- Surprisingly, the risk for seasonal flu is estimated higher than the risk for H1N1. 
- Next step: investigate the level of knowledge and the level of concern about H1N1.

In [None]:
# H1N1 risk opinion counts, leaving out the rows whose answer is coded 3.0
data_x = df.query('opinion_h1n1_risk != 3.0')
sns.countplot(data=data_x, x='opinion_h1n1_risk')

In [None]:
# Plotting concerns about H1N1
# value_counts() orders the shares by frequency, while the labels below are in
# ascending answer order. sort_index() aligns the two; without it the slices
# are labelled with the wrong answer.
x_concern = df.h1n1_concern.value_counts(normalize=True).sort_index()
# assumes the ascending codes run from "not at all" to "very concerned" - TODO confirm
x_concern_labels = ['Not at all concerned', 'Not very concerned', 'Somewhat concerned', 'Very concerned']

# values/names are passed as plain arrays, so no data frame argument is needed
fig = px.pie(values=x_concern.to_numpy(), names=x_concern_labels, title='Concerns about H1N1', template='ggplot2', hole=0.3, width=600, height=500)
fig.show()

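- About every tenth respondent is very concerned about H1N1; 16% are somewhat concerned.
- The vast majority is not at all or not very concerned.
- This is in line with the findings on risk perception, where we could see that risk awareness for H1N1 lags behind that for seasonal flu.
- **TODO:** re-check these percentages against the chart: the pie labels only match the shares if the shares are in ascending answer order (`value_counts()` orders by frequency by default).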

In [None]:
# Plotting knowledge about H1N1
# value_counts() orders the shares by frequency, while the labels below are in
# ascending answer order. sort_index() aligns the two; without it the slices
# are labelled with the wrong answer.
x_knowledge = df.h1n1_knowledge.value_counts(normalize=True).sort_index()
# assumes the ascending codes run from "no" to "a lot of" knowledge - TODO confirm
x_knowledge_labels = ['No knowledge', 'A little knowledge', 'A lot of knowledge']

# values/names are passed as plain arrays, so no data frame argument is needed
fig = px.pie(values=x_knowledge.to_numpy(), names=x_knowledge_labels, title='Knowledge about H1N1', template='ggplot2', hole=0.3, width=600, height=500)
fig.show()

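- The majority of people state that they have no knowledge about the H1N1 flu variant.
- 37% have a little knowledge.
- Overall, there seems to be little literacy about this new pandemic disease.
- **TODO:** re-check these percentages against the chart: the pie labels only match the shares if the shares are in ascending answer order (`value_counts()` orders by frequency by default).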

In [None]:
# Number of rows per age group
df.age_group.value_counts()

In [None]:
# Convert age_group to an ordered categorical as well
df['age_group'] = pd.Categorical(values=df['age_group'], ordered=True)

In [None]:
# FIXME: unfinished draft, not yet adapted to this notebook.
#
# The previous content of this cell was a loop copied from another analysis: it
# iterated over `king_county_data.yr_renovated`, appended to the undefined lists
# `yr_changed` / `xxx`, and compared against a string written with typographic
# quotes, which is a SyntaxError. The cell therefore could not run and stopped
# "Restart Kernel -> Run All".
#
# Apparent intent (to be confirmed): build the `opinion` list from one of the
# opinion columns of `df`, handling the "3 - Don't know" answers separately.

# Empty list that is meant to collect the derived values
opinion = []

In [None]:
# Share of each class per target variable
values_h1n1 = df.h1n1_vaccine.value_counts(normalize=True)
values_seasonal = df.seasonal_vaccine.value_counts(normalize=True)

# Long format: one row per (person, vaccine type) with columns `variable` / `value`
categories = pd.melt(df, value_vars=['h1n1_vaccine', 'seasonal_vaccine'])

# TODO: grouped plotly bar chart of both vaccines. The former draft here was a
# triple-quoted px.bar(...) template using the columns "sex", "total_bill" and
# "smoker", which do not belong to this analysis; as a bare string it was only
# echoed as the cell output.

In [None]:
### Not needed because this does not help:

# Readable labels for the numeric codes of opinion_seas_risk and opinion_h1n1_risk.
# Both columns share the same five-level scale; each column gets its own copy.
risk_level_names = {
    1.0: "1 - Very low",
    2.0: "2 - Somewhat low",
    3.0: "3 - Don't know",
    4.0: "4 - Somewhat high",
    5.0: "5 - Very high",
}

labels_risk = {
    risk_column: dict(risk_level_names)
    for risk_column in ("opinion_h1n1_risk", "opinion_seas_risk")
}

In [None]:
### WILL NOT BE APPLIED
# Copy of df in which the codes listed in labels_risk are swapped for their labels
df_label = df.replace(to_replace=labels_risk)