# Immunization survey
I'll be lookin at 2018 data on immunization from the Centers for Desease Control and Prevention. 

# Education level
The function, called `proportion_of_education`, returns the proportion of children in the dataset who had a mother with the education levels equal to less than high school (<12), high school (12), more than high school but not a college graduate (>12) and college degree.

In [1]:
import pandas as pd
df_1 = pd.read_csv("data_immunization.csv", index_col=0)
df_1

Unnamed: 0,SEQNUMC,SEQNUMHH,PDAT,PROVWT_D,RDDWT_D,STRATUM,YEAR,AGECPOXR,HAD_CPOX,AGEGRP,...,XVRCTY2,XVRCTY3,XVRCTY4,XVRCTY5,XVRCTY6,XVRCTY7,XVRCTY8,XVRCTY9,INS_STAT2_I,INS_BREAK_I
1,128521,12852,2,,235.916956,1031,2017,,2,1,...,,,,,,,,,,
2,10741,1074,2,,957.353840,1068,2017,,2,1,...,,,,,,,,,,
3,220011,22001,2,,189.611299,1050,2017,,2,3,...,,,,,,,,,,
4,86131,8613,1,675.430817,333.447418,1040,2017,,2,1,...,,,,,,,,,1.0,2.0
5,227141,22714,1,482.617748,278.768063,1008,2017,,2,1,...,,,,,,,,,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28461,19141,1914,2,,21.188088,2062,2017,,2,3,...,,,,,,,,,,
28462,121401,12140,2,,26.503010,2004,2017,,2,3,...,,,,,,,,,,
28463,128831,12883,2,,27.520652,2004,2017,,2,3,...,,,,,,,,,,
28464,82921,8292,2,,22.390587,2062,2017,,2,3,...,,,,,,,,,,


In [4]:
pd.set_option('display.max_columns', None)
print(df_1.columns)

Index(['SEQNUMC', 'SEQNUMHH', 'PDAT', 'PROVWT_D', 'RDDWT_D', 'STRATUM', 'YEAR',
       'AGECPOXR', 'HAD_CPOX', 'AGEGRP',
       ...
       'XVRCTY2', 'XVRCTY3', 'XVRCTY4', 'XVRCTY5', 'XVRCTY6', 'XVRCTY7',
       'XVRCTY8', 'XVRCTY9', 'INS_STAT2_I', 'INS_BREAK_I'],
      dtype='object', length=453)


In [5]:
df_1.info

<bound method DataFrame.info of        SEQNUMC  SEQNUMHH  PDAT    PROVWT_D     RDDWT_D  STRATUM  YEAR  \
1       128521     12852     2         NaN  235.916956     1031  2017   
2        10741      1074     2         NaN  957.353840     1068  2017   
3       220011     22001     2         NaN  189.611299     1050  2017   
4        86131      8613     1  675.430817  333.447418     1040  2017   
5       227141     22714     1  482.617748  278.768063     1008  2017   
...        ...       ...   ...         ...         ...      ...   ...   
28461    19141      1914     2         NaN   21.188088     2062  2017   
28462   121401     12140     2         NaN   26.503010     2004  2017   
28463   128831     12883     2         NaN   27.520652     2004  2017   
28464    82921      8292     2         NaN   22.390587     2062  2017   
28465   244621     24462     2         NaN   27.520652     2004  2017   

       AGECPOXR  HAD_CPOX  AGEGRP  BF_ENDR06  BF_EXCLR06  BF_FORMR08  \
1           NaN    

In [6]:
import pandas as pd
import numpy as np

def proportion_of_education():
    ''' 
    Returns the proportion of children in the dataset who 
    had a mother with the education levels equal to less than 
    high school (<12), high school (12), more than high school 
    but not a college graduate (>12) and college degree.
    '''
    df = pd.read_csv("data_immunization.csv", index_col=0)
    less_hs = df[df['EDUC1'] == 1]['EDUC1'].count()
    hs = df[df['EDUC1'] == 2]['EDUC1'].count()
    more_hs = df[df['EDUC1'] == 3]['EDUC1'].count()
    college = df[df['EDUC1'] == 4]['EDUC1'].count()
    
    total = less_hs + hs + more_hs + college
    less_hs_r = less_hs/total
    hs_r = hs/total
    more_hs_r = more_hs/total
    college_r = college/total 
    
    
    result = {"less than high school": less_hs_r,
    "high school": hs_r,
    "more than high school but not college": more_hs_r,
    "college": college_r}
    return result

df = proportion_of_education()
df

{'less than high school': 0.10202002459160373,
 'high school': 0.172352011241876,
 'more than high school but not college': 0.24588090637625154,
 'college': 0.47974705779026877}

# Consequences of getting milk
Let's explore the relationship between being fed breastmilk as a child and getting a seasonal influenza vaccine from a healthcare provider. 

In [7]:
def average_influenza_doses():
    '''
    Return a tuple of the average number of influenza 
    vaccines for those children we know received breastmilk 
    as a child and those who know did not.
    '''
    df = pd.read_csv("data_immunization.csv", index_col=0)
    fed_m_v = df[df['CBF_01']==1][['CBF_01', 'P_NUMFLU']]
    fed_m_v.dropna(inplace=True)
    notfed_m_v = df[df['CBF_01']==2][['CBF_01', 'P_NUMFLU']]
    notfed_m_v.dropna(inplace=True)
    fed_milk = fed_m_v[fed_m_v['CBF_01']==1]['CBF_01'].count()
    notfed_milk = notfed_m_v[notfed_m_v['CBF_01']==2]['CBF_01'].count()

    vac_for_milk = fed_m_v['P_NUMFLU'].mean()
    vac_for_notmilk = notfed_m_v['P_NUMFLU'].mean()

    return (vac_for_milk, vac_for_notmilk)
average_influenza_doses()

(1.8799187420058687, 1.5963945918878317)

# Gender and vaccine effectiveness
It would be interesting to see if there is any evidence of a link between vaccine effectiveness and sex of the child.

In [8]:
def chickenpox_by_sex():
    df = pd.read_csv("data_immunization.csv", index_col=0)
    df_varca = df[df['P_NUMVRC'] >= 1][['SEX', 'HAD_CPOX']]
    
    male = df_varca[df_varca['SEX'] == 1]['HAD_CPOX']
    female = df_varca[df_varca['SEX'] == 2]['HAD_CPOX']
    male[male == 1].count()
    ratio_m = male[male == 1].count() / male[male == 2].count()
    ratio_f = female[female == 1].count() / female[female == 2].count()
    return {"male": ratio_m, "female": ratio_f}

chickenpox_by_sex()

{'male': 0.009675583380762664, 'female': 0.0077918259335489565}

# Vaccine effectiveness
Let's look at the correlation between the use of the vaccine and whether it results in prevention of the infection. A correlation is a statistical relationship between two variables.

The 'had_chickenpox_column' is either '1' (for yes) or '2' (for no) and the 'num_chickenpox_vaccine_column' is the number of doses a child has been given of the varicella vaccine. 
A positive correlation (e.g., 'corr > 0') means that an increase in 'had_chickenpox_column' (which means more no's) would also increase the values of 'num_chickenpox_vaccine_column' (which means more doses of vaccine). If there is negative correlation (e.g., 'corr < 0'), it indicates that having had chickenpox is related to an increase in the number of vaccine doses.

In [9]:
def corr_chickenpox():
    import scipy.stats as stats
    import numpy as np
    import pandas as pd
    
    data = pd.read_csv("data_immunization.csv", index_col=0)

    cpox_num_vac = data[(data['HAD_CPOX'] == 1)|(data['HAD_CPOX'] == 2)][['HAD_CPOX', 'P_NUMVRC']]
    cpox_num_vac.dropna(inplace=True)
    had_cpox = cpox_num_vac['HAD_CPOX']
    num_vac = cpox_num_vac['P_NUMVRC']
    
    df = pd.DataFrame({"had_chickenpox_column": had_cpox,
                   "num_chickenpox_vaccine_column": num_vac})

    corr, pval = stats.pearsonr(df["had_chickenpox_column"],df["num_chickenpox_vaccine_column"])
    return corr

corr_chickenpox()

0.07044873460148118

This isn't really the full picture, since there are not data at when the dose was given. It's possible that children had chickenpox and then their parents went to get them the vaccine.