# US births analysis

## Dataset 1: US births 1994-2003 (CDC)

In [1]:
input_str = open('US_births_1994-2003_CDC_NCHS.csv','r').read()
input_lst = input_str.split('\n')
input_lst[0:10]

['year,month,date_of_month,day_of_week,births',
 '1994,1,1,6,8096',
 '1994,1,2,7,7772',
 '1994,1,3,1,10142',
 '1994,1,4,2,11248',
 '1994,1,5,3,11053',
 '1994,1,6,4,11406',
 '1994,1,7,5,11251',
 '1994,1,8,6,8653',
 '1994,1,9,7,7910']

## Read csv

In [2]:
def read_csv(filename):
    str = open(filename,'r').read()
    string_list = str.split('\n')[1:]
    final_list = []
    for row in string_list:
        int_fields = []
        string_fields = row.split(',')
        for field in string_fields:
            int_fields.append(int(field))
        final_list.append(int_fields)
    return final_list

In [3]:
cdc_list = read_csv("US_births_1994-2003_CDC_NCHS.csv")
cdc_list[0:10]

[[1994, 1, 1, 6, 8096],
 [1994, 1, 2, 7, 7772],
 [1994, 1, 3, 1, 10142],
 [1994, 1, 4, 2, 11248],
 [1994, 1, 5, 3, 11053],
 [1994, 1, 6, 4, 11406],
 [1994, 1, 7, 5, 11251],
 [1994, 1, 8, 6, 8653],
 [1994, 1, 9, 7, 7910],
 [1994, 1, 10, 1, 10498]]

## Birth counts by column

In [4]:
def calc_counts(datalist,column):
    dico = {}
    for data in datalist:
        col = data[column]
        births = data[4]
        if col in dico:
            dico[col] += births
        else:
            dico[col] = births
    return dico

In [5]:
# US birth count by year
cdc_year_births = calc_counts(cdc_list,0)
cdc_year_births

{1994: 3952767,
 1995: 3899589,
 1996: 3891494,
 1997: 3880894,
 1998: 3941553,
 1999: 3959417,
 2000: 4058814,
 2001: 4025933,
 2002: 4021726,
 2003: 4089950}

In [6]:
# US birth count by month
cdc_month_births = calc_counts(cdc_list,1)
cdc_month_births

{1: 3232517,
 2: 3018140,
 3: 3322069,
 4: 3185314,
 5: 3350907,
 6: 3296530,
 7: 3498783,
 8: 3525858,
 9: 3439698,
 10: 3378814,
 11: 3171647,
 12: 3301860}

In [7]:
# US birth count by day of month
cdc_dom_births = calc_counts(cdc_list,2)
cdc_dom_births

{1: 1276557,
 2: 1288739,
 3: 1304499,
 4: 1288154,
 5: 1299953,
 6: 1304474,
 7: 1310459,
 8: 1312297,
 9: 1303292,
 10: 1320764,
 11: 1314361,
 12: 1318437,
 13: 1277684,
 14: 1320153,
 15: 1319171,
 16: 1315192,
 17: 1324953,
 18: 1326855,
 19: 1318727,
 20: 1324821,
 21: 1322897,
 22: 1317381,
 23: 1293290,
 24: 1288083,
 25: 1272116,
 26: 1284796,
 27: 1294395,
 28: 1307685,
 29: 1223161,
 30: 1202095,
 31: 746696}

In [8]:
# US birth count by day of week
cdc_dow_births = calc_counts(cdc_list,3)
cdc_dow_births

{1: 5789166,
 2: 6446196,
 3: 6322855,
 4: 6288429,
 5: 6233657,
 6: 4562111,
 7: 4079723}

## Least and most births by column

In [16]:
# calculate min and max value for a dictionary of int values
def min_max(dico):
    key_lst = list(dico.keys())
    min_val = dico[key_lst[0]]
    min_key = key_lst[0]
    max_val = dico[key_lst[0]]
    max_key = key_lst[0]
    for key in key_lst[1:]:
        if dico[key] > max_val:
            max_val = dico[key]
            max_key = key
        if dico[key] < min_val:
            min_val = dico[key]
            min_key = key
    return {'min':{min_key:min_val},'max':{max_key:max_val}}

In [17]:
# Least 
min_max(cdc_year_births)

{'max': {2003: 4089950}, 'min': {1997: 3880894}}

In [18]:
min_max(cdc_month_births)

{'max': {8: 3525858}, 'min': {2: 3018140}}

In [19]:
min_max(cdc_dom_births)

{'max': {18: 1326855}, 'min': {31: 746696}}

In [20]:
min_max(cdc_dow_births)

{'max': {2: 6446196}, 'min': {7: 4079723}}

## Trend analysis: extract same values from different periods

In [21]:
def calc_counts_conditional(datalist, column, cond_column, cond_value):
    dico = {}
    for data in datalist:
        col = data[column]
        births = data[4]
        if data[cond_column] == cond_value:
            if col in dico:
                dico[col] += births
            else:
                dico[col] = births
    return dico

### Birth counts on Saturday each year:

In [23]:
saturday_birth_count = calc_counts_conditional(cdc_list,0,3,7)
saturday_birth_count

{1994: 428752,
 1995: 425790,
 1996: 413336,
 1997: 404478,
 1998: 407129,
 1999: 401991,
 2000: 416454,
 2001: 397119,
 2002: 391375,
 2003: 393299}

In [29]:
def list_change(dico):
    outlist = []
    keys = sorted(dico.keys())
    for key in keys[1:]:
        outlist.append(dico[key]-dico[key-1])
    return outlist

### How did the number of births on Saturday change each year between 1994 and 2003?

In [30]:
list_change(saturday_birth_count)

[-2962, -12454, -8858, 2651, -5138, 14463, -19335, -5744, 1924]

# Dataset 2: US births 2000-2014 (SSA)

In [31]:
ssa_list = read_csv('US_births_2000-2014_SSA.csv')
ssa_list[0:10]

[[2000, 1, 1, 6, 9083],
 [2000, 1, 2, 7, 8006],
 [2000, 1, 3, 1, 11363],
 [2000, 1, 4, 2, 13032],
 [2000, 1, 5, 3, 12558],
 [2000, 1, 6, 4, 12466],
 [2000, 1, 7, 5, 12516],
 [2000, 1, 8, 6, 8934],
 [2000, 1, 9, 7, 7949],
 [2000, 1, 10, 1, 11668]]

In [32]:
ssa_year_births = calc_counts(ssa_list,0)
ssa_year_births

{2000: 4149598,
 2001: 4110963,
 2002: 4099313,
 2003: 4163060,
 2004: 4186863,
 2005: 4211941,
 2006: 4335154,
 2007: 4380784,
 2008: 4310737,
 2009: 4190991,
 2010: 4055975,
 2011: 4006908,
 2012: 4000868,
 2013: 3973337,
 2014: 4010532}

## Compare difference between 2 datasets

In [35]:
def dict_compare_percentage(dico1, dico2):
    out = {}
    for key in sorted(dico1.keys()):
        if key in dico2:
            diff = float((dico2[key]-dico1[key]))/dico1[key]*100
            out[key] = diff
    return out

In [37]:
dict_compare_percentage(cdc_year_births,ssa_year_births)

{2000: -2.187778189598125,
 2001: -2.0683718145845633,
 2002: -1.892682993467442,
 2003: -1.7561601322104412}