## EDA for data understanding and visualisations

In [1]:
# Importing the relevant libraries

import sys
# adding to the path variables the one folder higher (locally, not changing system variables)
sys.path.append("..")
import pandas as pd
import numpy as np
import warnings
import mlflow
from modeling.config import TRACKING_URI, EXPERIMENT_NAME

pd.set_option('display.max_columns', None)

RSEED = 42
# Modeling Libraries

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px  # pip install plotly needs to executed
import plotly.graph_objects as go

In [60]:
# Reading in the data 
df = pd.read_csv('../data/Flu_Shot_Data_cleaned_2.csv')

## Presentation plots (in Plotly go)

In [61]:
df_dist_h1n1 = df.h1n1_vaccine.value_counts(normalize=True).round(2).rename_axis('Vacc').reset_index(name='counts')
df_dist_h1n1

Unnamed: 0,Vacc,counts
0,0,0.79
1,1,0.21


In [62]:
df_dist_seas = df.seasonal_vaccine.value_counts(normalize=True).round(2).rename_axis('Vacc').reset_index(name='counts')
df_dist_seas

Unnamed: 0,Vacc,counts
0,0,0.53
1,1,0.47


In [63]:
fig = go.Figure()
fig.add_trace(go.Bar(name="H1N1 vaccines", x=df_dist_h1n1.Vacc, y=df_dist_h1n1.counts, 
    marker_color='rgb(25,25,112)', text=df_dist_h1n1.counts,
            textposition='auto'))
fig.add_trace(go.Bar(name="Seasonal flu vaccine", x=df_dist_seas.Vacc, y=df_dist_seas.counts, 
    marker_color='rgb(188,143,143)', text=df_dist_seas.counts,
            textposition='auto'))


fig.layout = dict(title='Vaccination Status in Sample', 
    # This code removes the 3.0 from the plot (which was shown although value was dropped):
    #xaxis = dict(type="category", categoryorder='category ascending')
    )
fig.update_xaxes( 
    ticktext=["Not vaccinated", "Vaccinated"], 
    tickmode='array', tickvals = [0,1],
    tickangle=0,tickfont_size=14
    )
fig.update_yaxes(title='Share within sample')

fig.show()
fig.write_image("../images/distribution_target.png", height=400, width=800) 

In [19]:
df_concerns_h1n1 = df.h1n1_concern.value_counts(normalize=True).round(2).rename_axis('Level').reset_index(name='counts')
df_concerns_h1n1

Unnamed: 0,Level,counts
0,2.0,0.4
1,1.0,0.31
2,3.0,0.17
3,0.0,0.12


In [20]:
df_knowledge_h1n1 = df.h1n1_knowledge.value_counts(normalize=True).round(2).rename_axis('Level').reset_index(name='counts')
df_knowledge_h1n1

Unnamed: 0,Level,counts
0,1.0,0.55
1,2.0,0.36
2,0.0,0.09


In [21]:
from plotly.subplots import make_subplots
fig = make_subplots(rows=1, cols=2, shared_yaxes=True,
    subplot_titles=("Concern about H1N1", "Knowledge about H1N1"))

fig.add_trace(
    go.Bar(x=df_concerns_h1n1.Level, y=df_concerns_h1n1.counts),
    row=1, col=1
)

fig.add_trace(
    go.Bar(x=df_knowledge_h1n1.Level, y=df_knowledge_h1n1.counts),
    row=1, col=2
)

fig.update_layout(height=500, width=900)
fig.update_layout(coloraxis=dict(colorscale='Bluered_r'), showlegend=False)
fig.update_xaxes(ticktext=['Not at all concerned', 'Not very concerned', 'Somewhat concerned', 'Very concerned'], 
    tickmode='array', tickvals = [0,1, 2, 3],
    tickangle=0,tickfont_size=7,
    row=1, col=1)
fig.update_xaxes(ticktext=['No knowledge', 'A little knowledge', 'A lot of knowledge'], 
    tickmode='array', tickvals = [0,1, 2],
    tickangle=0,tickfont_size=8,
    row=1, col=2)
#fig.update_traces(marker_color=['rgb(192,192,192)', 'rgb(230,230,250)','rgb(219,112,147)', 'rgb(255,0,255)'], row=1, col=1)
#fig.update_xaxes(categoryorder='category ascending')
fig.show()

In [22]:
df_age = df.age_group.value_counts(normalize=True).round(2).rename_axis('groups').reset_index(name='counts')
df_age

Unnamed: 0,groups,counts
0,65+ Years,0.26
1,55 - 64 Years,0.21
2,45 - 54 Years,0.2
3,18 - 34 Years,0.2
4,35 - 44 Years,0.14


In [23]:
# adding the US census data to the frame (see xlxs in data)
df_age['counts_us'] = [0.17, 0.16, 0.20, 0.28, 0.19]
df_age

Unnamed: 0,groups,counts,counts_us
0,65+ Years,0.26,0.17
1,55 - 64 Years,0.21,0.16
2,45 - 54 Years,0.2,0.2
3,18 - 34 Years,0.2,0.28
4,35 - 44 Years,0.14,0.19


In [24]:
# we will also change the age_group column into categorical 
df['age_group'] = pd.Categorical(df['age_group'], ordered=True)

In [25]:
fig = go.Figure()
fig.add_trace(go.Bar(name="Sample", x=df_age.groups, y=df_age.counts, marker_color='rgb(65,105,225)'))
fig.add_trace(go.Bar(name="US Census", x=df_age.groups, y=df_age.counts_us, marker_color='rgb(240,230,140)'))

fig.update_layout(title='Age distribution',
                   yaxis_title='Share within sample')
fig.update_xaxes(categoryorder='category ascending')
fig.show()

In [26]:
# Bubble Chart
#size_sample = df_age['counts'] 
fig = go.Figure()
fig.add_trace(go.Scatter(name="Sample", x=df_age.groups, 
    y=df_age.counts, 
    mode='markers',
    marker=dict(
        color='rgb(65,105,225)',
        size=20,
        )
))
fig.add_trace(go.Scatter(name="US Census", x=df_age.groups, 
    y=df_age.counts_us, 
    mode='markers', 
    marker=dict(
        color='rgb(240,230,140)',
        size=15,
        )
 ))

fig.update_layout(title='Age distribution',
                   yaxis_title='Share within sample')
fig.update_xaxes(categoryorder='category ascending')
fig.update_yaxes(range=[0, 0.4])
fig.show()


In [27]:
fig = go.Figure()
fig.add_trace(go.Scatter(name="Sample", x=df_age.counts, 
    y=df_age.groups, 
    mode='markers',
    marker=dict(
        color='rgb(65,105,225)',
        size=20,
        )
))
fig.add_trace(go.Scatter(name="US Census", x=df_age.counts_us, 
    y=df_age.groups, 
    mode='markers', 
    marker=dict(
        color='rgb(240,230,140)',
        size=15,
        )
 ))

fig.update_layout(title='Age distribution',
                   yaxis_title='Share within sample',
                )
fig.update_yaxes(categoryorder='category ascending')
fig.update_xaxes(range=[0, 0.5])
fig.show()

In [28]:
# Stacked bar chart 

fig = go.Figure()
fig.add_trace(go.Bar(name="Sample", x=df_age.groups, y=df_age.counts, marker_color='rgb(65,105,225)'))
fig.add_trace(go.Bar(name="US Census", x=df_age.groups, y=df_age.counts_us, marker_color='rgb(240,230,140)'))

fig.update_layout(title='Age distribution',
                   yaxis_title='Share within sample',
                   barmode='stack')
fig.update_xaxes(categoryorder='category ascending')
fig.show()

### Distribution of gender

In [29]:
df_sex = df.sex.value_counts(normalize=True).round(2).rename_axis('groups').reset_index(name='counts')
df_sex

Unnamed: 0,groups,counts
0,Female,0.59
1,Male,0.41


In [30]:
df_sex['sex_us'] = [0.51, 0.49]
df_sex

Unnamed: 0,groups,counts,sex_us
0,Female,0.59,0.51
1,Male,0.41,0.49


In [31]:
fig = go.Figure()
fig.add_trace(go.Bar(name="Sample", x=df_sex.groups, y=df_sex.counts, marker_color='rgb(65,105,225)', text='counts'))
fig.add_trace(go.Bar(name="US Census", x=df_sex.groups, y=df_sex.sex_us, marker_color='rgb(240,230,140)'))

fig.update_layout(title='Gender distribution',
                   yaxis_title='Share within sample',
                   barmode='stack')
fig.show()

Distribution of Ethnicities

In [32]:
df_eth = df.race.value_counts(normalize=True).round(2).rename_axis('groups').reset_index(name='counts')
df_eth

Unnamed: 0,groups,counts
0,White,0.79
1,Black,0.08
2,Hispanic,0.07
3,Other or Multiple,0.06


In [33]:
df_eth['counts_us'] = [0.71, 0.12, 0.15, 0.02]
df_eth

Unnamed: 0,groups,counts,counts_us
0,White,0.79,0.71
1,Black,0.08,0.12
2,Hispanic,0.07,0.15
3,Other or Multiple,0.06,0.02


In [34]:
fig = go.Figure()
fig.add_trace(go.Bar(name="Sample", x=df_eth.groups, y=df_eth.counts, marker_color='rgb(65,105,225)'))
fig.add_trace(go.Bar(name="US Census", x=df_eth.groups, y=df_eth.counts_us, marker_color='rgb(240,230,140)'))

fig.update_layout(title='Distribution of Ethnicities',
                   yaxis_title='Share within sample')
fig.update_xaxes(categoryorder='category descending')
fig.show()

In [35]:
# These values are different from the ones we had. 

### Trying to create more plots with plotly

In [36]:
fig_age = px.bar(df_age, x=['65+ years', '55 - 64 Years', '45 - 54 Years', '18 - 34 Years', '35 - 44 Years'], y='counts',
            #barmode='group',
            height=400)
fig_age.update_layout(title='Age distribution',
                   xaxis_title='Age group',
                   yaxis_title='Share within sample')
fig_age.update_xaxes(categoryorder='category descending')
fig_age.show()

In [37]:
df_plot = df[['h1n1_vaccine', 'h1n1_concern']]

In [38]:
df_x = df.groupby('h1n1_concern').h1n1_vaccine.value_counts()
df_x

h1n1_concern  h1n1_vaccine
0.0           0               2849
              1                447
1.0           0               6756
              1               1397
2.0           0               8102
              1               2473
3.0           0               3250
              1               1341
Name: h1n1_vaccine, dtype: int64

In [39]:
#non_vacc = df.query('h1n1_vaccine==0').groupby('h1n1_concern', as_index=True).agg('count')['h1n1_vaccine']

non_acc = df.query('h1n1_vaccine==0').groupby('h1n1_concern', as_index=True)["h1n1_vaccine"].count().reset_index(name="count")


In [40]:
non_acc['h1n1_vaccine'] = 0
non_acc

Unnamed: 0,h1n1_concern,count,h1n1_vaccine
0,0.0,2849,0
1,1.0,6756,0
2,2.0,8102,0
3,3.0,3250,0


In [41]:
data = df.groupby(["h1n1_concern","h1n1_vaccine"],as_index=True)["h1n1_concern"].count().reset_index(name="count")

In [42]:
data

Unnamed: 0,h1n1_concern,h1n1_vaccine,count
0,0.0,0,2849
1,0.0,1,447
2,1.0,0,6756
3,1.0,1,1397
4,2.0,0,8102
5,2.0,1,2473
6,3.0,0,3250
7,3.0,1,1341


In [43]:
'''fig = go.bar(data, x="h1n1_concern", y="count",
             height=400, color="h1n1_vaccine")
            
fig.update_layout(barmode='group')
fig.show()'''

fig = go.Figure(data=[
    go.Bar(name='Not vaccinated', x=data.query('h1n1_vaccine == 0')['h1n1_concern'], 
        y=data.query('h1n1_vaccine == 0')['count'], marker_color='rgb(72,61,139)'),
    go.Bar(name='Vaccinated', x=data.query('h1n1_vaccine == 1')['h1n1_concern'], 
        y=data.query('h1n1_vaccine == 1')['count'], marker_color='rgb(60,179,113)')
])
# Change the bar mode
fig.update_layout(barmode='group', title='Concerns about H1N1', barnorm='fraction')
fig.update_xaxes(
    ticktext=['Not at all concerned', 'Not very concerned', 'Somewhat concerned', 'Very concerned'], 
    tickmode='array', tickvals = [0,1, 2, 3])
fig.update_yaxes(title='Share of vaccinations')
fig.show()
fig.write_image("../images/concerns_vs_h1n1_vaccines.png", height=400, width=800) #the saved image does not look good. Need to see about which options there are for saving.

### Plotting perceived risk of H1N1 against risk of seasonal flu

In [44]:
df_risk_h1n1 = df.opinion_h1n1_risk.value_counts(normalize=True).round(2).rename_axis('rating').reset_index(name='counts')

# we are dropping rating 3.0 because this is 'dont know' and we sort the df by rating
df_risk_h1n1.drop(df_risk_h1n1[df_risk_h1n1.rating == 3.0].index, inplace=True)
df_risk_h1n1.sort_values(by=['rating']).reset_index(drop=True)

Unnamed: 0,rating,counts
0,1.0,0.31
1,2.0,0.38
2,4.0,0.2
3,5.0,0.07


In [45]:
df_risk_seas = df.opinion_seas_risk.value_counts(normalize=True).round(2).rename_axis('rating').reset_index(name='counts')

# we are dropping rating 3.0 because this is 'dont know' and we sort the df by rating
df_risk_seas.drop(df_risk_seas[df_risk_seas.rating == 3.0].index, inplace=True)
df_risk_seas.sort_values(by=['rating']).reset_index(drop=True)

Unnamed: 0,rating,counts
0,1.0,0.23
1,2.0,0.34
2,4.0,0.29
3,5.0,0.11


In [46]:
fig = go.Figure()
fig.add_trace(go.Bar(name="H1N1 risk", x=df_risk_h1n1.rating, y=df_risk_h1n1.counts, marker_color='rgb(25,25,112)'))
fig.add_trace(go.Bar(name="Seasonal flu risk", x=df_risk_seas.rating, y=df_risk_seas.counts, marker_color='rgb(188,143,143)'))


fig.layout = dict(title='Opinion of Risk of Flu Types', 
    # This code removes the 3.0 from the plot (which was shown although value was dropped):
    xaxis = dict(type="category", categoryorder='category ascending')
    )
fig.update_xaxes( 
    ticktext=["Very low", "Somewhat low", "Somewhat high","Very high"], 
    tickmode='array', tickvals = [1,2, 4, 5],
    tickangle=0,tickfont_size=12
    )
fig.update_yaxes(title='Share within sample')

fig.show()
fig.write_image("../images/opinion_of_risk_both.png", height=400, width=800) 

### Opinion of vaccine effectiveness against vaccination status

In [47]:
df_eff_h1n1 = df.opinion_h1n1_vacc_effective.value_counts(normalize=True).round(2).rename_axis('rating_h1n1').reset_index(name='counts')

# we are dropping rating 3.0 because this is 'dont know' and we sort the df by rating
df_eff_h1n1.drop(df_eff_h1n1[df_eff_h1n1.rating_h1n1 == 3.0].index, inplace=True)
df_eff_h1n1.sort_values(by=['rating_h1n1']).reset_index(drop=True)

Unnamed: 0,rating_h1n1,counts
0,1.0,0.03
1,2.0,0.07
2,4.0,0.44
3,5.0,0.27


In [48]:
df_eff_seas = df.opinion_seas_vacc_effective.value_counts(normalize=True).round(2).rename_axis('rating_seas').reset_index(name='counts')

# we are dropping rating 3.0 because this is 'dont know' and we sort the df by rating
df_eff_seas.drop(df_eff_seas[df_eff_seas.rating_seas == 3.0].index, inplace=True)
df_eff_seas.sort_values(by=['rating_seas']).reset_index(drop=True)

Unnamed: 0,rating_seas,counts
0,1.0,0.05
1,2.0,0.08
2,4.0,0.44
3,5.0,0.38


In [49]:
fig = go.Figure()
fig.add_trace(go.Bar(name="H1N1 effective", x=df_eff_h1n1.rating_h1n1, y=df_eff_h1n1.counts, marker_color='rgb(25,25,112)'))
fig.add_trace(go.Bar(name="Seasonal effective", x=df_eff_seas.rating_seas, y=df_eff_seas.counts, marker_color='rgb(188,143,143)'))


fig.layout = dict(title='Opinion Vaccine Effectiveness', 
    # This code removes the 3.0 from the plot (which was shown although value was dropped):
    xaxis = dict(type="category", categoryorder='category ascending')
    )
fig.update_xaxes( 
    ticktext=["Not at all effective", "Not very effective", "Somewhat effective","Very effective"], 
    tickmode='array', tickvals = [1,2, 4, 5],
    tickangle=0,tickfont_size=12
    )
fig.update_yaxes(title='Share within sample')

fig.show()
fig.write_image("../images/opinion_vaccine_effectiveness.png", height=400, width=800) 

### Plots for age groups

In [50]:
data_age = df.groupby(["age_group","h1n1_vaccine"],as_index=True)["age_group"].count().reset_index(name="count")
data_age

Unnamed: 0,age_group,h1n1_vaccine,count
0,18 - 34 Years,0,4224
1,18 - 34 Years,1,991
2,35 - 44 Years,0,3087
3,35 - 44 Years,1,761
4,45 - 54 Years,0,4218
5,45 - 54 Years,1,1020
6,55 - 64 Years,0,4212
7,55 - 64 Years,1,1351
8,65+ Years,0,5292
9,65+ Years,1,1551


In [51]:
fig = go.Figure(data=[
    go.Bar(name='Not vaccinated', x=data_age.query('h1n1_vaccine == 0')['age_group'], 
        y=data_age.query('h1n1_vaccine == 0')['count'], marker_color='rgb(72,61,139)'),
    go.Bar(name='Vaccinated', x=data_age.query('h1n1_vaccine == 1')['age_group'], 
        y=data_age.query('h1n1_vaccine == 1')['count'], marker_color='rgb(60,179,113)')
])
# Change the bar mode
fig.update_layout(barmode='group', title='H1N1 Vaccinations by Age Group', barnorm='fraction', yaxis_range=[0,0.9])
fig.update_xaxes(
    ticktext=['18 - 34 Years', '35 - 44 Years', '45 - 54 Years', '55 - 64 Years', '65+ Years'], 
    tickmode='array', tickvals = [0,1, 2, 3, 4])
fig.update_yaxes(title='Share of vaccinations')
fig.show()
fig.write_image("../images/h1n1_age_groups.png", height=400, width=800) 

In [52]:
data_age_seas = df.groupby(["age_group","seasonal_vaccine"],as_index=True)["age_group"].count().reset_index(name="count")
data_age_seas

Unnamed: 0,age_group,seasonal_vaccine,count
0,18 - 34 Years,0,3731
1,18 - 34 Years,1,1484
2,35 - 44 Years,0,2453
3,35 - 44 Years,1,1395
4,45 - 54 Years,0,3136
5,45 - 54 Years,1,2102
6,55 - 64 Years,0,2719
7,55 - 64 Years,1,2844
8,65+ Years,0,2233
9,65+ Years,1,4610


In [53]:
fig = go.Figure(data=[
    go.Bar(name='Not vaccinated', x=data_age_seas.query('seasonal_vaccine == 0')['age_group'], 
        y=data_age_seas.query('seasonal_vaccine == 0')['count'], marker_color='rgb(72,61,139)'),
    go.Bar(name='Vaccinated', x=data_age_seas.query('seasonal_vaccine == 1')['age_group'], 
        y=data_age_seas.query('seasonal_vaccine == 1')['count'], marker_color='rgb(60,179,113)')
])
# Change the bar mode
fig.update_layout(barmode='group', title='Seasonal Vaccinations by Age Group', barnorm='fraction', yaxis_range=[0,0.9])
fig.update_xaxes(
    ticktext=['18 - 34 Years', '35 - 44 Years', '45 - 54 Years', '55 - 64 Years', '65+ Years'], 
    tickmode='array', tickvals = [0,1, 2, 3, 4])
fig.update_yaxes(title='Share of vaccinations')
fig.show()
fig.write_image("../images/seasonal_age_groups.png", height=400, width=800) 

### Additional visualisations

In [54]:
df_report = pd.read_csv('/Users/julianeberek/neuefische/TheFluShot/data/FluNetInteractiveReport.csv', skiprows=3)

In [55]:
df_report.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to 74
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Country            75 non-null     object 
 1   WHOREGION          75 non-null     object 
 2   FLUREGION          75 non-null     object 
 3   Year               75 non-null     int64  
 4   Week               75 non-null     int64  
 5   SDATE              75 non-null     object 
 6   EDATE              75 non-null     object 
 7   SPEC_RECEIVED_NB   75 non-null     int64  
 8   SPEC_PROCESSED_NB  75 non-null     int64  
 9   AH1                75 non-null     int64  
 10  AH1N12009          61 non-null     float64
 11  AH3                75 non-null     int64  
 12  AH5                40 non-null     float64
 13  ANOTSUBTYPED       75 non-null     int64  
 14  INF_A              75 non-null     int64  
 15  BYAMAGATA          40 non-null     float64
 16  BVICTORIA          40 non-nu

In [56]:
df_report['formatted_date'] = df_report.Year * 1000 + df_report.Week * 10 + 0
df_report['date'] = pd.to_datetime(df_report['formatted_date'], format='%Y%W%w')
df_report.head()

Unnamed: 0,Country,WHOREGION,FLUREGION,Year,Week,SDATE,EDATE,SPEC_RECEIVED_NB,SPEC_PROCESSED_NB,AH1,AH1N12009,AH3,AH5,ANOTSUBTYPED,INF_A,BYAMAGATA,BVICTORIA,BNOTDETERMINED,INF_B,ALL_INF,ALL_INF2,TITLE,formatted_date,date
0,United States of America,Region of the Americas of WHO,North America,2009,1,2008-12-29,2009-01-04,6613,6613,111,,19,,177,307,,,50,50,357,,Sporadic,2009010,2009-01-11
1,United States of America,Region of the Americas of WHO,North America,2009,2,2009-01-05,2009-01-11,6980,6980,163,,27,,300,490,,,78,78,568,,Sporadic,2009020,2009-01-18
2,United States of America,Region of the Americas of WHO,North America,2009,3,2009-01-12,2009-01-18,7024,7024,195,,22,,421,638,,,104,104,742,,Sporadic,2009030,2009-01-25
3,United States of America,Region of the Americas of WHO,North America,2009,4,2009-01-19,2009-01-25,7959,7959,353,,46,,666,1065,,,189,189,1254,,Local Outbreak,2009040,2009-02-01
4,United States of America,Region of the Americas of WHO,North America,2009,5,2009-01-26,2009-02-01,9496,9496,555,,78,,940,1573,,,380,380,1953,,Regional Outbreak,2009050,2009-02-08


In [57]:
# Plotting the weekly H1N1 cases 
fig = px.line(df_report, x="date", y="AH1N12009", title='H1N1 Confirmed Cases (weekly)')
fig.show()

In [58]:
# Plotting H1N1 cases against all influenza cases (all types A and B)

fig = go.Figure()
date = df_report['date']
h1n1 = df_report['AH1N12009']
all_a = df_report['INF_A']
all_inf = df_report['ALL_INF']

fig.add_trace(go.Scatter(x=date, y=h1n1,
                    mode='lines',
                    name='H1N1 cases'))
'''fig.add_trace(go.Scatter(x=date, y=all_a,
                    mode='lines+markers',
                    name='All type A influenza cases'))'''
fig.add_trace(go.Scatter(x=date, y=all_inf,
                    mode='lines', name='Total number influenza cases',
                    marker_color='rgb(25,25,112)'))

fig.update_layout(title='Weekly cases of H1N1 vs. all influenza cases (US)',
                   xaxis_title='Month',
                   yaxis_title='Number of confirmed cases')
fig.update_layout(legend=dict(
    yanchor="top",
    y=0.99,
    xanchor="left",
    x=0.01
))
fig.show()
fig.write_image("../images/weekly_cases.png", height=400, width=700)