## EDA for data understanding and visualisations

In [None]:
# Importing the relevant libraries

import sys
# adding to the path variables the one folder higher (locally, not changing system variables)
sys.path.append("..")
import pandas as pd
import numpy as np
import warnings
import mlflow
from modeling.config import TRACKING_URI, EXPERIMENT_NAME

pd.set_option('display.max_columns', None)

RSEED = 42
# Modeling Libraries

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px  # pip install plotly needs to executed
import plotly.graph_objects as go

In [None]:
# We need to add plotly to our requirements

In [None]:
# Reading in the cleaned survey data (the path is relative to this notebook's folder)
df = pd.read_csv('../data/Flu_Shot_Data_cleaned_1.csv')

In [None]:
# Column names, dtypes and non-null counts of the loaded frame
df.info()

In [None]:
# Relative class frequencies of the two target variables

for target in ('h1n1_vaccine', 'seasonal_vaccine'):
    print(df[target].value_counts(normalize=True))

In [None]:
# Absolute number of rows per value of the H1N1 target
sns.countplot(x='h1n1_vaccine', data=df)

In [None]:
# Creating plots for the distribution of our target variables

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(11,3))

target_panels = [
    (ax1, 'h1n1_vaccine', 'Distribution of H1N1 vaccinations (%)'),
    (ax2, 'seasonal_vaccine', 'Distribution of seasonal flu vaccinations (%)'),
]

for ax, target, title in target_panels:
    # value_counts() orders the rows by frequency. Without sort_index() the bar
    # order would flip as soon as class 1 became the majority, while the tick
    # labels and colours below stay fixed. Sorting pins the order to 0, 1.
    shares = df[target].value_counts(normalize=True).sort_index()
    shares.plot(kind='bar', ax=ax, color=['dimgrey', 'forestgreen'], rot=0)
    ax.set_title(title)
    ax.set_xlabel('')
    # assumes 0 = not vaccinated and 1 = vaccinated
    ax.set_xticklabels(['Not vaccinated', 'Vaccinated'])
    ax.set_ylim([0,1]) # shares are fractions, so the y-axis is fixed to 0..1

fig.subplots_adjust(hspace=0.3)
plt.savefig('../images/distribution_target.jpg')
plt.show()


In [None]:
# Number of rows per seasonal vaccine value within each H1N1 vaccine group
df.groupby('h1n1_vaccine')['seasonal_vaccine'].value_counts(sort=False)

In [None]:
# Creating plots for the distribution of overlaps between H1N1 and seasonal flu vaccines

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,3))

# Each panel shows, within every group of `group_col`, the share of each value of
# `value_col`. The tick labels name BOTH dimensions (group first, value second;
# "-" = not vaccinated, "+" = vaccinated) so the four bars can be told apart.
overlap_panels = [
    (ax1, 'h1n1_vaccine', 'seasonal_vaccine', 'Seasonal vaccinations by H1N1 vaccine status',
     ['-H1N1 / -seasonal', '-H1N1 / +seasonal', '+H1N1 / -seasonal', '+H1N1 / +seasonal']),
    (ax2, 'seasonal_vaccine', 'h1n1_vaccine', 'H1N1 vaccinations by seasonal vaccine status',
     ['-seasonal / -H1N1', '-seasonal / +H1N1', '+seasonal / -H1N1', '+seasonal / +H1N1']),
]

for ax, group_col, value_col, title, tick_labels in overlap_panels:
    shares = df.groupby(group_col)[value_col].value_counts(normalize=True, sort=False)
    # sort_index() pins the bar order to (0,0), (0,1), (1,0), (1,1) so that it
    # always matches the hard-coded tick labels (assumes both columns are coded 0/1).
    shares = shares.sort_index()
    shares.plot(kind='bar', color=['dimgrey', 'forestgreen'], ax=ax, rot=0)
    ax.set_title(title)
    ax.set_xlabel('')
    ax.set_xticklabels(tick_labels)
    ax.set_ylim([0,1])

fig.subplots_adjust(hspace=0.3)
plt.savefig('../images/distribution_target_compared.jpg')
plt.show()


- The share of people with seasonal flu vaccination is higher among people that did get an H1N1 vaccine.
- The share of people with H1N1 vaccination is higher among people that also got the seasonal flu vaccine.
- The graphs show that there seems to be a correlation between both types of vaccinations. 

- The graphs are not self explanatory. There needs to be a better indication. 

## Influence of doctor's recommendations 

In [None]:
# Vaccination shares split by whether a doctor recommended the respective vaccine

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,3))

# The same four tick labels are used on both panels (reco = recommendation, vacc = vaccine)
reco_ticks = ['-reco / -vacc', '-reco / +vacc', '+reco / -vacc', '+reco / +vacc']
reco_panels = [
    (ax1, 'doctor_recc_h1n1', 'h1n1_vaccine', 'H1N1 vaccine status by doctor recommendation'),
    (ax2, 'doctor_recc_seasonal', 'seasonal_vaccine', 'Seasonal vaccine status by doctor recommendation'),
]

for ax, reco_col, vacc_col, title in reco_panels:
    # share of each vaccine status within every recommendation group
    shares = df.groupby(reco_col)[vacc_col].value_counts(normalize=True, sort=False)
    shares.plot(kind='bar', color=['dimgrey', 'forestgreen'], ax=ax, rot=0)
    ax.set_title(title)
    ax.set_xlabel('')
    ax.set_xticklabels(reco_ticks)
    ax.set_ylim([0,1])

fig.subplots_adjust(hspace=0.3)

plt.savefig('../images/target_recommendations.jpg')
plt.show()


- Recommendations of the respective vaccine are important for getting the vaccine.
- Only a small proportion of the people who did not get a recommendation got the vaccine.
- For seasonal flu vaccinations, the share of people that did not get a recommendation from a doctor is higher than for H1N1.

- Some of the columns are converted to an ordered Categorical dtype.
- Otherwise, the plots adopt different strategies for ordering the levels (sort=False does not work).

In [None]:
# Convert the rating columns to ordered categoricals, one column after the other
for rating_col in ('opinion_h1n1_risk', 'opinion_seas_risk', 'h1n1_concern', 'h1n1_knowledge'):
    df[rating_col] = pd.Categorical(df[rating_col], ordered=True)

### Investigating Vaccinations by risk awareness

In [None]:
# One subset per vaccine type and vaccination status (0 / 1 in the target column)
h1n1_flag = df['h1n1_vaccine']
seasonal_flag = df['seasonal_vaccine']
plot_h1n1_no = df.loc[h1n1_flag == 0]
plot_h1n1_yes = df.loc[h1n1_flag == 1]
plot_seasonal_no = df.loc[seasonal_flag == 0]
plot_seasonal_yes = df.loc[seasonal_flag == 1]

fig, ((ax1, ax2), (ax3, ax4))  = plt.subplots(2, 2, figsize=(15,6))

# (axis, subset, risk column, bar colour, title) for each of the four panels
risk_panels = [
    (ax1, plot_h1n1_no, 'opinion_h1n1_risk', 'dimgrey', 'Non H1N1 vaccinated: Opinion of risk (%)'),
    (ax2, plot_h1n1_yes, 'opinion_h1n1_risk', 'forestgreen', 'H1N1 vaccinated: Opinion of risk (%)'),
    (ax3, plot_seasonal_no, 'opinion_seas_risk', 'dimgrey', 'Seasonal non vaccinated: Opinion of risk (%)'),
    (ax4, plot_seasonal_yes, 'opinion_seas_risk', 'forestgreen', 'Seasonal vaccinated: Opinion of risk  (%)'),
]

for ax, subset, risk_col, bar_color, title in risk_panels:
    risk_shares = subset[risk_col].value_counts(normalize=True, sort=False)
    risk_shares.plot(kind='bar', ax=ax, rot=0, color=bar_color)
    ax.set_title(title)
    ax.set_xlabel('')
    ax.set_ylim([0,1]) # same 0..1 y-range on every panel

fig.subplots_adjust(hspace=0.3)
plt.show()

In [None]:
# Plotting the overall risk perception for H1N1 and for seasonal flu

fig, (ax1, ax2)  = plt.subplots(1, 2, figsize=(15,5))

# Labels and colours are defined once and shared by both panels so the two charts
# cannot drift apart (the H1N1 panel previously read "3 - Don't_know").
risk_tick_labels = ["1 - Very low", "2 - Somewhat low", "3 - Don't know", "4 - Somewhat high",
                    "5 - Very high"]
risk_colors = ['peachpuff', 'lightpink', 'lightgray', 'mediumorchid', 'indigo']
risk_panels = [
    (ax1, 'opinion_h1n1_risk', 'Perceived risk of H1N1 (%)'),
    (ax2, 'opinion_seas_risk', 'Perceived risk of seasonal flu (%)'),
]

for ax, risk_col, title in risk_panels:
    # sort_index() puts the shares into ascending level order so they line up with
    # the tick labels, whether or not the column is an ordered categorical.
    risk_shares = df[risk_col].value_counts(normalize=True, sort=False).sort_index()
    risk_shares.plot(kind='bar', ax=ax, rot=0, color=risk_colors)
    ax.set_title(title)
    ax.set_xlabel('')
    ax.set_xticklabels(risk_tick_labels, fontsize='small')
    ax.set_ylim([0,0.45]) # same y-range on both panels so they can be compared

fig.subplots_adjust(hspace=0.3)

plt.savefig('../images/risk_perception.jpg')
plt.show()


- Surprisingly, the risk for seasonal flu is estimated higher than the risk for H1N1. 
- Next step: investigate the level of knowledge and the level of concern for H1N1.

In [None]:
# Count of H1N1 risk opinions after dropping the rows with level 3.0
# NOTE(review): if opinion_h1n1_risk is a categorical column at this point, the
# level 3.0 presumably still appears on the x-axis as an empty bar — confirm.
data_x = df.query('opinion_h1n1_risk != 3.0')
sns.countplot(x='opinion_h1n1_risk', data=data_x)

In [None]:
# Plotting concerns about H1N1
# note that an image can only be written once the kaleido package is installed: pip install -U kaleido

# value_counts() returns the shares ordered by frequency, whereas the labels below
# are listed by concern level. sort_index() puts the shares into ascending level
# order so that every slice is paired with the label that belongs to it.
x_concern = df.h1n1_concern.value_counts(normalize=True).sort_index()
# assumes four concern levels coded in ascending order — TODO confirm against the codebook
x_concern_labels = ['Not at all concerned', 'Not very concerned', 'Somewhat concerned', 'Very concerned']

fig = px.pie(df, values=x_concern, names=x_concern_labels, title='Concerns about H1N1', template='ggplot2', hole=0.3, width=600, height=500)
fig.show()
fig.write_image("../images/concerns_h1n1.png")

- About every 10th is very concerned about H1N1, 16% are somewhat concerned.
- The vast majority is not at all or not very concerned. 
- This is in line with the findings for risk perception where we could see that risk awareness for H1N1 falls behind with regard to seasonal flu. 

In [None]:
# Plotting knowledge about H1N1

# value_counts() returns the shares ordered by frequency, whereas the labels below
# are listed by knowledge level. sort_index() puts the shares into ascending level
# order so that every slice is paired with the label that belongs to it.
x_knowledge = df.h1n1_knowledge.value_counts(normalize=True).sort_index()
# assumes three knowledge levels coded in ascending order — TODO confirm against the codebook
x_knowledge_labels = ['No knowledge', 'A little knowledge', 'A lot of knowledge']

fig = px.pie(df, values=x_knowledge, names=x_knowledge_labels, title='Knowledge about H1N1', template='ggplot2', hole=0.3, width=600, height=500)
fig.show()
fig.write_image("../images/knowledge_h1n1.png")


- The majority of people states to have no knowledge about the H1N1 flu variant. 
- 37% have a little knowledge.
- Overall, there seems to be little literacy about this new pandemic disease. 

In [None]:
# Share of each H1N1 concern level (rounded to two decimals) as a two-column table
df_concerns_h1n1 = (
    df['h1n1_concern']
    .value_counts(normalize=True)
    .round(2)
    .rename_axis('Level')
    .reset_index(name='counts')
)
df_concerns_h1n1

In [None]:
# Share of each H1N1 knowledge level (rounded to two decimals) as a two-column table
df_knowledge_h1n1 = (
    df['h1n1_knowledge']
    .value_counts(normalize=True)
    .round(2)
    .rename_axis('Level')
    .reset_index(name='counts')
)
df_knowledge_h1n1

In [None]:
from plotly.subplots import make_subplots

# Two bar charts side by side on a shared y-axis: concern (left) and knowledge (right)
fig = make_subplots(
    rows=1,
    cols=2,
    shared_yaxes=True,
    subplot_titles=("Concern about H1N1", "Knowledge about H1N1"),
)

# one bar trace per summary table, placed left to right
for col_idx, shares in enumerate([df_concerns_h1n1, df_knowledge_h1n1], start=1):
    fig.add_trace(go.Bar(x=shares['Level'], y=shares['counts']), row=1, col=col_idx)

fig.update_layout(
    height=400,
    width=800,
    coloraxis=dict(colorscale='Bluered_r'),
    showlegend=False,
)

fig.show()

In [None]:
# Share of each age group in the sample (rounded to two decimals)
df_age = (
    df['age_group']
    .value_counts(normalize=True)
    .round(2)
    .rename_axis('groups')
    .reset_index(name='counts')
)
df_age

In [None]:
# adding the US census data to the frame (see xlsx in data)
# NOTE(review): the values are assigned by row position, so they only line up with
# the right age group while df_age keeps its current row order — verify the
# pairing against the table displayed below.
df_age['counts_us'] = [0.17, 0.16, 0.20, 0.28, 0.19]
df_age

In [None]:
# we will also change the age_group column into an ordered categorical
# (df_age was built from the column before this conversion)
df['age_group'] = pd.Categorical(df['age_group'], ordered=True)

In [None]:
# Bars per age group: sample share vs. US census share
age_traces = [
    go.Bar(name="Sample", x=df_age['groups'], y=df_age['counts'], marker_color='rgb(65,105,225)'),
    go.Bar(name="US Census", x=df_age['groups'], y=df_age['counts_us'], marker_color='rgb(238,232,170)'),
]
fig = go.Figure(data=age_traces)

fig.update_layout(title='Age distribution', yaxis_title='Share within sample')
# order the age groups alphabetically on the x-axis
fig.update_xaxes(categoryorder='category ascending')
fig.show()

In [None]:
# Bubble Chart: one marker per age group for the sample and for the US census
fig = go.Figure()

# (legend name, share column in df_age, marker colour, marker size)
bubble_specs = [
    ("Sample", 'counts', 'rgb(65,105,225)', 20),
    ("US Census", 'counts_us', 'rgb(238,232,170)', 15),
]
for legend_name, share_col, marker_color, marker_size in bubble_specs:
    fig.add_trace(
        go.Scatter(
            name=legend_name,
            x=df_age['groups'],
            y=df_age[share_col],
            mode='markers',
            marker=dict(color=marker_color, size=marker_size),
        )
    )

fig.update_layout(title='Age distribution', yaxis_title='Share within sample')
fig.update_xaxes(categoryorder='category ascending')
fig.update_yaxes(range=[0, 0.4])
fig.show()


In [None]:
# Stacked bar chart: sample share and US census share stacked per age group

fig = go.Figure()
for legend_name, share_col, bar_color in [
    ("Sample", 'counts', 'rgb(65,105,225)'),
    ("US Census", 'counts_us', 'rgb(238,232,170)'),
]:
    fig.add_trace(go.Bar(name=legend_name, x=df_age['groups'], y=df_age[share_col], marker_color=bar_color))

fig.update_layout(
    title='Age distribution',
    yaxis_title='Share within sample',
    barmode='stack',
)
fig.update_xaxes(categoryorder='category ascending')
fig.show()

### Distribution of gender

In [None]:
# Share of each value of `sex` in the sample (rounded to two decimals)
df_sex = (
    df['sex']
    .value_counts(normalize=True)
    .round(2)
    .rename_axis('groups')
    .reset_index(name='counts')
)
df_sex

In [None]:
# Reference shares for comparison with the sample
# NOTE(review): assigned by row position, so the values follow whatever row order
# df_sex currently has — verify the pairing against the table displayed below.
df_sex['sex_us'] = [0.51, 0.49]
df_sex

In [None]:
# Stacked bars per gender: sample share and US census share
fig = go.Figure()
# text must be the column of values; the string 'counts' would be printed
# literally on every bar instead of the share it represents.
fig.add_trace(go.Bar(name="Sample", x=df_sex.groups, y=df_sex.counts, marker_color='rgb(65,105,225)', text=df_sex.counts))
fig.add_trace(go.Bar(name="US Census", x=df_sex.groups, y=df_sex.sex_us, marker_color='rgb(238,232,170)'))

fig.update_layout(title='Gender distribution',
                   yaxis_title='Share within sample',
                   barmode='stack')
fig.show()

### Distribution of Ethnicities

In [None]:
# Share of each value of `race` in the sample (rounded to two decimals)
df_eth = (
    df['race']
    .value_counts(normalize=True)
    .round(2)
    .rename_axis('groups')
    .reset_index(name='counts')
)
df_eth

In [None]:
# Reference shares for comparison with the sample
# NOTE(review): assigned by row position, so the values follow whatever row order
# df_eth currently has — verify the pairing against the table displayed below.
df_eth['counts_us'] = [0.71, 0.12, 0.15, 0.02]
df_eth

In [None]:
# Bars per ethnicity: sample share vs. US census share
eth_traces = [
    go.Bar(name="Sample", x=df_eth['groups'], y=df_eth['counts'], marker_color='rgb(65,105,225)'),
    go.Bar(name="US Census", x=df_eth['groups'], y=df_eth['counts_us'], marker_color='rgb(238,232,170)'),
]
fig = go.Figure(data=eth_traces)

fig.update_layout(title='Distribution of Ethnicities', yaxis_title='Share within sample')
# reverse-alphabetical order of the groups on the x-axis
fig.update_xaxes(categoryorder='category descending')
fig.show()

In [None]:
# These values are different from the ones we had. 

### Trying to create more plots with plotly

In [None]:
# Age distribution with plotly express: y comes from df_age['counts'], x from a hand-written label list
# NOTE(review): the labels are paired with the rows of df_age by position, so they
# presumably mirror df_age's current row order — verify before relying on the chart.
fig_age = px.bar(df_age, x=['65+ years', '55 - 64 Years', '45 - 54 Years', '18 - 34 Years', '35 - 44 Years'], y='counts',
            #barmode='group',
            height=400)
fig_age.update_layout(title='Age distribution',
                   xaxis_title='Age group',
                   yaxis_title='Share within sample')
# reverse-alphabetical order of the labels on the x-axis
fig_age.update_xaxes(categoryorder='category descending')
fig_age.show()

In [None]:
# Two-column subset: H1N1 target and H1N1 concern level
df_plot = df[['h1n1_vaccine', 'h1n1_concern']]

In [None]:
# Number of rows per H1N1 vaccine value within each concern level
df_x = df.groupby('h1n1_concern').h1n1_vaccine.value_counts()
df_x

In [None]:
#non_vacc = df.query('h1n1_vaccine==0').groupby('h1n1_concern', as_index=True).agg('count')['h1n1_vaccine']

# Rows with h1n1_vaccine == 0, counted per concern level; result columns: h1n1_concern, count
non_acc = df.query('h1n1_vaccine==0').groupby('h1n1_concern', as_index=True)["h1n1_vaccine"].count().reset_index(name="count")


In [None]:
# Add the vaccine value this table was filtered on as a constant column
non_acc['h1n1_vaccine'] = 0
non_acc

In [None]:
# Row count per (concern level, H1N1 vaccine value) pair; result columns: h1n1_concern, h1n1_vaccine, count
data = df.groupby(["h1n1_concern","h1n1_vaccine"],as_index=True)["h1n1_concern"].count().reset_index(name="count")

In [None]:
# Display the grouped counts
data

In [None]:
# Grouped bars: row count per H1N1 concern level, split by H1N1 vaccine status.
# (An earlier px-style attempt that lived here as a dead string literal was removed.)

# Split once instead of re-running the same query for x and y
not_vaccinated = data.query('h1n1_vaccine == 0')
vaccinated = data.query('h1n1_vaccine == 1')

fig = go.Figure(data=[
    go.Bar(name='Not vaccinated', x=not_vaccinated['h1n1_concern'], y=not_vaccinated['count']),
    go.Bar(name='Vaccinated', x=vaccinated['h1n1_concern'], y=vaccinated['count'])
])
# Change the bar mode
fig.update_layout(barmode='group')
# assumes the concern levels are coded 0..3 so that they match tickvals — TODO confirm
fig.update_xaxes(title='Concern of H1N1',
    ticktext=['Not at all concerned', 'Not very concerned', 'Somewhat concerned', 'Very concerned'],
    tickmode='array', tickvals = [0,1, 2, 3])
fig.update_yaxes(title='Count of vaccinations')
fig.show()
fig.write_image("../images/concerns_vs_h1n1_vaccines.png") #the saved image does not look good. Need to see about which options there are for saving.

In [None]:
### Not needed because this does not help:

# Label mapping for the columns opinion_seas_risk and opinion_h1n1_risk.
# Both columns share the same level-to-text mapping; each gets its own dict.

_risk_levels = (1.0, 2.0, 3.0, 4.0, 5.0)
_risk_texts = ("1 - Very low", "2 - Somewhat low", "3 - Don't know", "4 - Somewhat high", "5 - Very high")

labels_risk = {
    column: dict(zip(_risk_levels, _risk_texts))
    for column in ("opinion_h1n1_risk", "opinion_seas_risk")
}

### Additional visualisations

In [None]:
# FluNet report; the first three rows of the file are skipped (skiprows=3).
# Read relative to the notebook, like the survey data, instead of from a
# user-specific absolute path, so the notebook runs on any checkout of the repo.
df_report = pd.read_csv('../data/FluNetInteractiveReport.csv', skiprows=3)

In [None]:
# Column names, dtypes and non-null counts of the report
df_report.info()

In [None]:
# Encode year and week as the integer YYYYWWd with the last digit d fixed to 0,
# then parse it with %Y (year), %W (week of the year) and %w (weekday, 0 = Sunday).
# NOTE(review): %W counts weeks that start on Monday and is not the ISO week; if
# the report's Week column is an ISO week the dates may be slightly shifted — confirm.
df_report['formatted_date'] = df_report.Year * 1000 + df_report.Week * 10 + 0
df_report['date'] = pd.to_datetime(df_report['formatted_date'], format='%Y%W%w')
df_report.head()

In [None]:
# Plotting the weekly H1N1 cases (column AH1N12009) over the parsed date column
fig = px.line(df_report, x="date", y="AH1N12009", title='H1N1 Confirmed Cases (weekly)')
fig.show()

In [None]:
# H1N1 cases plotted against all influenza cases (all types A and B)

# columns of the report used for the chart
date = df_report['date']
h1n1 = df_report['AH1N12009']
all_a = df_report['INF_A']
all_inf = df_report['ALL_INF']

# A 'lines+markers' trace for `all_a` ('All type A influenza cases') was tried
# here and is intentionally left out of the figure.
fig = go.Figure()
for series, legend_name in [(h1n1, 'H1N1 cases'), (all_inf, 'Total number influenza cases')]:
    fig.add_trace(go.Scatter(x=date, y=series, mode='lines', name=legend_name))

fig.update_layout(
    title='Weekly cases of H1N1 vs. all influenza cases',
    xaxis_title='Month',
    yaxis_title='Number of confirmed cases',
    # legend anchored inside the plot area, top-left corner
    legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
)
fig.show()
fig.write_image("../images/weekly_cases.png")