# Dataquest Curriculum Project 1: Birth Dates in the United States

Here, we familiarize ourselves with the very basics of Jupyter by exploring data on births in the United States from 1994 to 2003. The data comes from [FiveThirtyEight](https://github.com/fivethirtyeight) ([source](https://github.com/fivethirtyeight/data/tree/master/births)), who originally obtained it from the Centers for Disease Control and Prevention's National Center for Health Statistics. This is the structure of the data set:

- `year` - Year
- `month` - Month
- `date_of_month` - Day number of the month
- `day_of_week` - Day of week, where 1 is Monday and 7 is Sunday
- `births` - Number of births

# Practice parsing a CSV

In [7]:
# Read the whole CSV into memory. The with-block closes the file handle as
# soon as the read finishes instead of leaving it open for the session.
with open("US_births_1994-2003_CDC_NCHS.csv", 'r') as f:
    text = f.read()
# Preview the first 1000 characters of the raw text.
text[:1000]

'year,month,date_of_month,day_of_week,births\n1994,1,1,6,8096\n1994,1,2,7,7772\n1994,1,3,1,10142\n1994,1,4,2,11248\n1994,1,5,3,11053\n1994,1,6,4,11406\n1994,1,7,5,11251\n1994,1,8,6,8653\n1994,1,9,7,7910\n1994,1,10,1,10498\n1994,1,11,2,11706\n1994,1,12,3,11567\n1994,1,13,4,11212\n1994,1,14,5,11570\n1994,1,15,6,8660\n1994,1,16,7,8123\n1994,1,17,1,10567\n1994,1,18,2,11541\n1994,1,19,3,11257\n1994,1,20,4,11682\n1994,1,21,5,11811\n1994,1,22,6,8833\n1994,1,23,7,8310\n1994,1,24,1,11125\n1994,1,25,2,11981\n1994,1,26,3,11514\n1994,1,27,4,11702\n1994,1,28,5,11666\n1994,1,29,6,8988\n1994,1,30,7,8096\n1994,1,31,1,10765\n1994,2,1,2,11755\n1994,2,2,3,11483\n1994,2,3,4,11523\n1994,2,4,5,11677\n1994,2,5,6,8991\n1994,2,6,7,8309\n1994,2,7,1,10984\n1994,2,8,2,12152\n1994,2,9,3,11515\n1994,2,10,4,11623\n1994,2,11,5,11517\n1994,2,12,6,8945\n1994,2,13,7,8171\n1994,2,14,1,11551\n1994,2,15,2,12164\n1994,2,16,3,12009\n1994,2,17,4,11674\n1994,2,18,5,11887\n1994,2,19,6,8946\n1994,2,20,7,8402\n1994,2,21,1,10617\n

In [8]:
# Split the raw text on newlines and print the first 10 lines as one flat list.
print(text.split('\n')[:10])

['year,month,date_of_month,day_of_week,births', '1994,1,1,6,8096', '1994,1,2,7,7772', '1994,1,3,1,10142', '1994,1,4,2,11248', '1994,1,5,3,11053', '1994,1,6,4,11406', '1994,1,7,5,11251', '1994,1,8,6,8653', '1994,1,9,7,7910']


In [9]:
# Same slice as a bare expression, so the notebook displays it one entry per line.
text.split('\n')[:10]

['year,month,date_of_month,day_of_week,births',
 '1994,1,1,6,8096',
 '1994,1,2,7,7772',
 '1994,1,3,1,10142',
 '1994,1,4,2,11248',
 '1994,1,5,3,11053',
 '1994,1,6,4,11406',
 '1994,1,7,5,11251',
 '1994,1,8,6,8653',
 '1994,1,9,7,7910']

In [11]:
# Break every line of the raw text into its comma-separated fields,
# giving one list of strings per line (the header line included).
data = [line.split(',') for line in text.split('\n')]

data[:10]

[['year', 'month', 'date_of_month', 'day_of_week', 'births'],
 ['1994', '1', '1', '6', '8096'],
 ['1994', '1', '2', '7', '7772'],
 ['1994', '1', '3', '1', '10142'],
 ['1994', '1', '4', '2', '11248'],
 ['1994', '1', '5', '3', '11053'],
 ['1994', '1', '6', '4', '11406'],
 ['1994', '1', '7', '5', '11251'],
 ['1994', '1', '8', '6', '8653'],
 ['1994', '1', '9', '7', '7910']]

# Calculate Day-of-Week Totals

In [5]:
# Sum the births for each day of the week. Keys stay as the raw strings
# from the CSV; only the birth counts are converted to int.
data_no_header = data[1:]
day_of_week_totals = {}

for row in data_no_header:
    # column 3 holds the day of the week, column 4 the number of births
    weekday = row[3]
    day_of_week_totals[weekday] = day_of_week_totals.get(weekday, 0) + int(row[4])

day_of_week_totals

{'1': 5789166,
 '2': 6446196,
 '3': 6322855,
 '4': 6288429,
 '5': 6233657,
 '6': 4562111,
 '7': 4079723}

# Parsing a CSV as a function

In [13]:
def read_csv(filename_str, header_row=False):
    """Parse a CSV file of integers into a list of rows.

    Rows are separated by newlines and columns by commas; every field is
    converted to int.

    Args:
        filename_str: Path of the CSV file to read.
        header_row: Set to True when the first line is a header so that it
            is left out of the result. Defaults to False (no header).

    Returns:
        A list of lists of ints, one inner list per non-blank data line.

    Raises:
        ValueError: If a field on a non-blank line is not a valid integer.
    """
    # The with-block guarantees the file handle is closed, even on error.
    with open(filename_str, 'r') as csv_file:
        lines = csv_file.read().split('\n')

    if header_row:
        lines = lines[1:]

    # Skip blank lines (for example the empty string left behind by a
    # trailing newline) so they never reach int() and raise ValueError.
    return [
        [int(field) for field in line.split(',')]
        for line in lines
        if line.strip()
    ]


# Parse the CDC file, dropping its header row, then preview the first 10 parsed rows.
cdc_list = read_csv("US_births_1994-2003_CDC_NCHS.csv", header_row=True)
cdc_list[:10]

[[1994, 1, 1, 6, 8096],
 [1994, 1, 2, 7, 7772],
 [1994, 1, 3, 1, 10142],
 [1994, 1, 4, 2, 11248],
 [1994, 1, 5, 3, 11053],
 [1994, 1, 6, 4, 11406],
 [1994, 1, 7, 5, 11251],
 [1994, 1, 8, 6, 8653],
 [1994, 1, 9, 7, 7910],
 [1994, 1, 10, 1, 10498]]

# Calculate Monthly Totals as a function

In [15]:
def month_births(list_of_lists):
    """Return a dict mapping each month (column 1) to its summed births (column 4)."""
    totals = {}
    for record in list_of_lists:
        month_key, count = record[1], record[4]
        try:
            totals[month_key] += count
        except KeyError:
            # first time this month is seen: start its running total
            totals[month_key] = count
    return totals

# Total the CDC births for each month and display the resulting dict.
cdc_months_births = month_births(cdc_list)
cdc_months_births

{1: 3232517,
 2: 3018140,
 3: 3322069,
 4: 3185314,
 5: 3350907,
 6: 3296530,
 7: 3498783,
 8: 3525858,
 9: 3439698,
 10: 3378814,
 11: 3171647,
 12: 3301860}

# Calculate Day-of-Week Totals as a function

In [16]:
def dow_births(list_of_lists):
    """Return a dict mapping each day of week (column 3) to its summed births (column 4)."""
    totals = {}
    for record in list_of_lists:
        weekday, count = record[3], record[4]
        if weekday not in totals:
            # first row for this weekday seeds the running total
            totals[weekday] = count
            continue
        totals[weekday] += count
    return totals

# Total the CDC births for each day of the week and display the resulting dict.
cdc_day_births = dow_births(cdc_list)
cdc_day_births

{1: 5789166,
 2: 6446196,
 3: 6322855,
 4: 6288429,
 5: 6233657,
 6: 4562111,
 7: 4079723}

# Abstraction: Calculate any category's totals as a function

In [17]:
def calc_counts(data, column, value_column=4):
    """Sum a value column for every distinct entry of a category column.

    Despite the name, this totals the values (births by default) rather
    than counting rows.

    Args:
        data: A list of lists (one inner list per row).
        column: Index of the column whose distinct values become the keys.
        value_column: Index of the column to sum. Defaults to 4, the births
            column of the data sets used in this notebook.

    Returns:
        A dict mapping each category value to the sum of ``value_column``
        over the rows that carry it. Empty input yields an empty dict.
    """
    totals_per_category = {}
    for row in data:
        category = row[column]
        totals_per_category[category] = (
            totals_per_category.get(category, 0) + row[value_column]
        )
    return totals_per_category

# Birth totals grouped by each of the four date columns:
# 0 = year, 1 = month, 2 = day of month, 3 = day of week.
cdc_year_births = calc_counts(cdc_list, 0)
cdc_month_births = calc_counts(cdc_list, 1)
cdc_dom_births = calc_counts(cdc_list, 2)
cdc_dow_births = calc_counts(cdc_list, 3)

In [18]:
# Display the birth totals keyed by year.
cdc_year_births

{1994: 3952767,
 1995: 3899589,
 1996: 3891494,
 1997: 3880894,
 1998: 3941553,
 1999: 3959417,
 2000: 4058814,
 2001: 4025933,
 2002: 4021726,
 2003: 4089950}

In [19]:
# Display the birth totals keyed by month.
cdc_month_births

{1: 3232517,
 2: 3018140,
 3: 3322069,
 4: 3185314,
 5: 3350907,
 6: 3296530,
 7: 3498783,
 8: 3525858,
 9: 3439698,
 10: 3378814,
 11: 3171647,
 12: 3301860}

In [20]:
# Display the birth totals keyed by day of the month.
cdc_dom_births

{1: 1276557,
 2: 1288739,
 3: 1304499,
 4: 1288154,
 5: 1299953,
 6: 1304474,
 7: 1310459,
 8: 1312297,
 9: 1303292,
 10: 1320764,
 11: 1314361,
 12: 1318437,
 13: 1277684,
 14: 1320153,
 15: 1319171,
 16: 1315192,
 17: 1324953,
 18: 1326855,
 19: 1318727,
 20: 1324821,
 21: 1322897,
 22: 1317381,
 23: 1293290,
 24: 1288083,
 25: 1272116,
 26: 1284796,
 27: 1294395,
 28: 1307685,
 29: 1223161,
 30: 1202095,
 31: 746696}

In [21]:
# Display the birth totals keyed by day of the week.
cdc_dow_births

{1: 5789166,
 2: 6446196,
 3: 6322855,
 4: 6288429,
 5: 6233657,
 6: 4562111,
 7: 4079723}

# Calculate max & min for a given dictionary

In [24]:
def max_min(dictionary):
    """Return ``[largest, smallest]`` of the dictionary's values.

    An empty dictionary yields ``[float('-inf'), float('inf')]`` because
    the infinite sentinels are never replaced.
    """
    highest, lowest = float('-inf'), float('inf')

    for amount in dictionary.values():
        highest = amount if amount > highest else highest
        lowest = amount if amount < lowest else lowest

    return [highest, lowest]

# Largest and smallest day-of-week birth totals, returned as [max, min].
max_min(cdc_dow_births)

[6446196, 4079723]

# Calculate changes across years

In [25]:
def category_totals(data, column, category):
    """Total the births per year for rows matching one category value.

    Args:
        data: A list of lists; column 0 is read as the year and column 4
            as the birth count.
        column: Index of the column in which the category is located.
        category: The value to match in that column (e.g. 6 for Saturday,
            1 for January, or 2 for the 2nd of each month, as ints).

    Returns:
        A dict relating each year to its birth total for the given
        category; years with no matching row are absent.
    """
    per_year = {}
    for record in data:
        yr, count = record[0], record[4]
        if record[column] != category:
            continue
        try:
            per_year[yr] += count
        except KeyError:
            # first matching row for this year
            per_year[yr] = count
    return per_year

In [26]:
# Total births on Saturdays for each year.
# Day of week is column 3; Saturday is the int 6 (1 is Monday, 7 is Sunday).
category_totals(cdc_list, 3, 6)

{1994: 474732,
 1995: 459580,
 1996: 456261,
 1997: 450840,
 1998: 453776,
 1999: 449985,
 2000: 469794,
 2001: 453928,
 2002: 445770,
 2003: 447445}

In [27]:
# Total births in January for each year.
# Month is column 1; January is the int 1.
category_totals(cdc_list, 1, 1)

{1994: 320705,
 1995: 316013,
 1996: 314283,
 1997: 317211,
 1998: 319340,
 1999: 319182,
 2000: 330108,
 2001: 335198,
 2002: 330674,
 2003: 329803}

In [28]:
# Total births on the 2nd day of every month, for each year.
# Day of month is column 2; the 2nd is the int 2.
category_totals(cdc_list, 2, 2)

{1994: 128688,
 1995: 123816,
 1996: 126392,
 1997: 124563,
 1998: 132042,
 1999: 134071,
 2000: 127447,
 2001: 132651,
 2002: 127603,
 2003: 131466}

# How did the number of births on Saturday change each year between 1994 and 2003?

In [33]:
def increasing_or_decreasing(dictionary):
    """Label each year's total relative to the year before it.

    Args:
        dictionary: A dict relating int year to int total.

    Returns:
        A dict relating each year to "increasing", "decreasing" or
        "no change" compared with the previous year's total, or "unknown"
        when the previous year is not in the dictionary.
    """
    trend = {}
    for year, total in dictionary.items():
        prior_year = year - 1
        if prior_year in dictionary:
            prior_total = dictionary[prior_year]
            if prior_total < total:
                label = "increasing"
            elif prior_total > total:
                label = "decreasing"
            else:
                # rare case: both years have exactly the same total
                label = "no change"
        else:
            label = "unknown"
        trend[year] = label
    return trend

# Per-year Saturday totals (day of week is column 3, Saturday is 6), then display them.
saturday_totals = category_totals(cdc_list, 3, 6)
saturday_totals

{1994: 474732,
 1995: 459580,
 1996: 456261,
 1997: 450840,
 1998: 453776,
 1999: 449985,
 2000: 469794,
 2001: 453928,
 2002: 445770,
 2003: 447445}

In [34]:
# Label each year's Saturday total as increasing/decreasing versus the previous year.
increasing_or_decreasing(saturday_totals)

{1994: 'unknown',
 1995: 'decreasing',
 1996: 'decreasing',
 1997: 'decreasing',
 1998: 'increasing',
 1999: 'decreasing',
 2000: 'increasing',
 2001: 'decreasing',
 2002: 'decreasing',
 2003: 'increasing'}

# Combine CDC birth data with Social Security Administration birth data
The [source](https://github.com/fivethirtyeight/data/tree/master/births) of this data is again [FiveThirtyEight](https://github.com/fivethirtyeight), who this time obtained it from the Social Security Administration (SSA).

The SSA data covers 2000-2014. For overlapping time periods, the birth counts will be averaged between the two sources (decimals truncated).

In [35]:
# Parse the SSA file, dropping its header row, then preview the first 10 parsed rows.
ssa_list = read_csv("US_births_2000-2014_SSA.csv", header_row=True)
ssa_list[:10]

[[2000, 1, 1, 6, 9083],
 [2000, 1, 2, 7, 8006],
 [2000, 1, 3, 1, 11363],
 [2000, 1, 4, 2, 13032],
 [2000, 1, 5, 3, 12558],
 [2000, 1, 6, 4, 12466],
 [2000, 1, 7, 5, 12516],
 [2000, 1, 8, 6, 8934],
 [2000, 1, 9, 7, 7949],
 [2000, 1, 10, 1, 11668]]

In [44]:
# Build combined_list from the two sources:
#   - before 2000: CDC rows only
#   - 2000-2003:   one row per day, births averaged between SSA and CDC
#   - after 2003:  SSA rows only
# Every row placed in combined_list is a copy, so later edits to the
# combined data can never alter cdc_list or ssa_list.
combined_list = [] # the combined data
did_it_work = []   # one human-readable record per averaged row

cdc_index = 0 # keep track for the overlap
ssa_index = 0 # keep track for after the overlap

# add all rows before year 2000 - CDC only
for row in cdc_list:
    year = row[0]
    if year >= 2000:
        break
    combined_list.append(row.copy())
    cdc_index += 1

# add all rows between 2000 and 2003 - average the births
for row in ssa_list:
    year = row[0]
    if year >= 2004:
        break

    # The two sources are paired purely by position, so confirm that a CDC
    # row exists and describes the same calendar day before averaging.
    if cdc_index >= len(cdc_list):
        raise ValueError(
            "CDC data ends before SSA date {}/{}/{}".format(row[0], row[1], row[2])
        )
    cdc_row = cdc_list[cdc_index]
    if cdc_row[:3] != row[:3]:
        raise ValueError(
            "Date mismatch: SSA {}/{}/{} vs CDC {}/{}/{}".format(
                row[0], row[1], row[2], cdc_row[0], cdc_row[1], cdc_row[2]
            )
        )

    # the respective birth counts
    ssa_births = row[4]
    cdc_births = cdc_row[4]

    # average the two in the new row; integer division drops the decimals
    # without a round trip through float (counts are non-negative)
    new_row = row.copy()
    new_row[4] = (ssa_births + cdc_births) // 2
    combined_list.append(new_row)

    # keep a record to ensure it's correct
    str_record = (
        "Date SSA: {}/{}/{}, Date CDC: {}/{}/{}, SSA: {}, CDC: {}, Avg: {}"
    ).format(
        row[0], row[1], row[2],
        cdc_row[0], cdc_row[1], cdc_row[2],
        ssa_births, cdc_births, new_row[4],
    )
    did_it_work.append(str_record)

    # keep track of indices
    cdc_index += 1
    ssa_index += 1

# add all rows after year 2003 - SSA only
for row in ssa_list[ssa_index:]:
    combined_list.append(row.copy())

# check if it worked
did_it_work[:10]

['Date SSA: 2000/1/1, Date CDC: 2000/1/1, SSA: 9083, CDC: 8843, Avg: 8963',
 'Date SSA: 2000/1/2, Date CDC: 2000/1/2, SSA: 8006, CDC: 7816, Avg: 7911',
 'Date SSA: 2000/1/3, Date CDC: 2000/1/3, SSA: 11363, CDC: 11123, Avg: 11243',
 'Date SSA: 2000/1/4, Date CDC: 2000/1/4, SSA: 13032, CDC: 12703, Avg: 12867',
 'Date SSA: 2000/1/5, Date CDC: 2000/1/5, SSA: 12558, CDC: 12240, Avg: 12399',
 'Date SSA: 2000/1/6, Date CDC: 2000/1/6, SSA: 12466, CDC: 12260, Avg: 12363',
 'Date SSA: 2000/1/7, Date CDC: 2000/1/7, SSA: 12516, CDC: 12280, Avg: 12398',
 'Date SSA: 2000/1/8, Date CDC: 2000/1/8, SSA: 8934, CDC: 8750, Avg: 8842',
 'Date SSA: 2000/1/9, Date CDC: 2000/1/9, SSA: 7949, CDC: 7736, Avg: 7842',
 'Date SSA: 2000/1/10, Date CDC: 2000/1/10, SSA: 11668, CDC: 11418, Avg: 11543']

## Check if it worked - examine Dec 1999 & Jan 2000

In [49]:
# Print the CDC rows on both sides of the 1999/2000 boundary.
print("CDC from Dec 1999 to Jan 2000, inclusive")
for row in cdc_list:
    year, month = row[0], row[1]
    if (year, month) in ((1999, 12), (2000, 1)):
        print(row)

CDC from Dec 1999 to Jan 2000, inclusive
[1999, 12, 1, 3, 12282]
[1999, 12, 2, 4, 11976]
[1999, 12, 3, 5, 11591]
[1999, 12, 4, 6, 8333]
[1999, 12, 5, 7, 7503]
[1999, 12, 6, 1, 10956]
[1999, 12, 7, 2, 12329]
[1999, 12, 8, 3, 11849]
[1999, 12, 9, 4, 11728]
[1999, 12, 10, 5, 11459]
[1999, 12, 11, 6, 8442]
[1999, 12, 12, 7, 7528]
[1999, 12, 13, 1, 10830]
[1999, 12, 14, 2, 12667]
[1999, 12, 15, 3, 12623]
[1999, 12, 16, 4, 12441]
[1999, 12, 17, 5, 12453]
[1999, 12, 18, 6, 8979]
[1999, 12, 19, 7, 7732]
[1999, 12, 20, 1, 12497]
[1999, 12, 21, 2, 13508]
[1999, 12, 22, 3, 12166]
[1999, 12, 23, 4, 10591]
[1999, 12, 24, 5, 8046]
[1999, 12, 25, 6, 6674]
[1999, 12, 26, 7, 7432]
[1999, 12, 27, 1, 11579]
[1999, 12, 28, 2, 13158]
[1999, 12, 29, 3, 12629]
[1999, 12, 30, 4, 11935]
[1999, 12, 31, 5, 9335]
[2000, 1, 1, 6, 8843]
[2000, 1, 2, 7, 7816]
[2000, 1, 3, 1, 11123]
[2000, 1, 4, 2, 12703]
[2000, 1, 5, 3, 12240]
[2000, 1, 6, 4, 12260]
[2000, 1, 7, 5, 12280]
[2000, 1, 8, 6, 8750]
[2000, 1, 9, 7, 7736]


In [50]:
# Print the combined rows on both sides of the 1999/2000 boundary.
print("Combined from Dec 1999 to Jan 2000, inclusive")
for row in combined_list:
    year, month = row[0], row[1]
    if (year, month) in ((1999, 12), (2000, 1)):
        print(row)

Combined from Dec 1999 to Jan 2000, inclusive
[1999, 12, 1, 3, 12282]
[1999, 12, 2, 4, 11976]
[1999, 12, 3, 5, 11591]
[1999, 12, 4, 6, 8333]
[1999, 12, 5, 7, 7503]
[1999, 12, 6, 1, 10956]
[1999, 12, 7, 2, 12329]
[1999, 12, 8, 3, 11849]
[1999, 12, 9, 4, 11728]
[1999, 12, 10, 5, 11459]
[1999, 12, 11, 6, 8442]
[1999, 12, 12, 7, 7528]
[1999, 12, 13, 1, 10830]
[1999, 12, 14, 2, 12667]
[1999, 12, 15, 3, 12623]
[1999, 12, 16, 4, 12441]
[1999, 12, 17, 5, 12453]
[1999, 12, 18, 6, 8979]
[1999, 12, 19, 7, 7732]
[1999, 12, 20, 1, 12497]
[1999, 12, 21, 2, 13508]
[1999, 12, 22, 3, 12166]
[1999, 12, 23, 4, 10591]
[1999, 12, 24, 5, 8046]
[1999, 12, 25, 6, 6674]
[1999, 12, 26, 7, 7432]
[1999, 12, 27, 1, 11579]
[1999, 12, 28, 2, 13158]
[1999, 12, 29, 3, 12629]
[1999, 12, 30, 4, 11935]
[1999, 12, 31, 5, 9335]
[2000, 1, 1, 6, 8963]
[2000, 1, 2, 7, 7911]
[2000, 1, 3, 1, 11243]
[2000, 1, 4, 2, 12867]
[2000, 1, 5, 3, 12399]
[2000, 1, 6, 4, 12363]
[2000, 1, 7, 5, 12398]
[2000, 1, 8, 6, 8842]
[2000, 1, 9, 7, 7842]

**We've demonstrated that 1999 (no overlap) is unchanged, but 2000 (overlap) is changed**

## Check if it worked - examine Dec 2003 & Jan 2004

In [51]:
# Print the SSA rows on both sides of the 2003/2004 boundary.
print("SSA from Dec 2003 to Jan 2004, inclusive")
for row in ssa_list:
    year, month = row[0], row[1]
    if (year, month) in ((2003, 12), (2004, 1)):
        print(row)

SSA from Dec 2003 to Jan 2004, inclusive
[2003, 12, 1, 1, 12213]
[2003, 12, 2, 2, 13454]
[2003, 12, 3, 3, 13199]
[2003, 12, 4, 4, 12631]
[2003, 12, 5, 5, 12180]
[2003, 12, 6, 6, 8247]
[2003, 12, 7, 7, 7383]
[2003, 12, 8, 1, 11708]
[2003, 12, 9, 2, 13001]
[2003, 12, 10, 3, 12739]
[2003, 12, 11, 4, 12502]
[2003, 12, 12, 5, 12484]
[2003, 12, 13, 6, 8303]
[2003, 12, 14, 7, 7462]
[2003, 12, 15, 1, 12319]
[2003, 12, 16, 2, 13649]
[2003, 12, 17, 3, 13454]
[2003, 12, 18, 4, 13865]
[2003, 12, 19, 5, 13693]
[2003, 12, 20, 6, 8953]
[2003, 12, 21, 7, 7657]
[2003, 12, 22, 1, 13249]
[2003, 12, 23, 2, 12939]
[2003, 12, 24, 3, 9272]
[2003, 12, 25, 4, 6744]
[2003, 12, 26, 5, 10431]
[2003, 12, 27, 6, 8785]
[2003, 12, 28, 7, 7763]
[2003, 12, 29, 1, 13125]
[2003, 12, 30, 2, 14700]
[2003, 12, 31, 3, 12540]
[2004, 1, 1, 4, 8205]
[2004, 1, 2, 5, 10586]
[2004, 1, 3, 6, 8337]
[2004, 1, 4, 7, 7359]
[2004, 1, 5, 1, 11489]
[2004, 1, 6, 2, 12755]
[2004, 1, 7, 3, 12716]
[2004, 1, 8, 4, 12781]
[2004, 1, 9, 5, 12469]

In [53]:
# Print the combined rows on both sides of the 2003/2004 boundary.
print("Combined from Dec 2003 to Jan 2004, inclusive")
for row in combined_list:
    year, month = row[0], row[1]
    if (year, month) in ((2003, 12), (2004, 1)):
        print(row)

Combined from Dec 2003 to Jan 2004, inclusive
[2003, 12, 1, 1, 12093]
[2003, 12, 2, 2, 13288]
[2003, 12, 3, 3, 13041]
[2003, 12, 4, 4, 12501]
[2003, 12, 5, 5, 12040]
[2003, 12, 6, 6, 8189]
[2003, 12, 7, 7, 7312]
[2003, 12, 8, 1, 11602]
[2003, 12, 9, 2, 12850]
[2003, 12, 10, 3, 12604]
[2003, 12, 11, 4, 12372]
[2003, 12, 12, 5, 12360]
[2003, 12, 13, 6, 8226]
[2003, 12, 14, 7, 7399]
[2003, 12, 15, 1, 12191]
[2003, 12, 16, 2, 13499]
[2003, 12, 17, 3, 13280]
[2003, 12, 18, 4, 13729]
[2003, 12, 19, 5, 13563]
[2003, 12, 20, 6, 8874]
[2003, 12, 21, 7, 7602]
[2003, 12, 22, 1, 13108]
[2003, 12, 23, 2, 12768]
[2003, 12, 24, 3, 9184]
[2003, 12, 25, 4, 6686]
[2003, 12, 26, 5, 10324]
[2003, 12, 27, 6, 8715]
[2003, 12, 28, 7, 7704]
[2003, 12, 29, 1, 12974]
[2003, 12, 30, 2, 14569]
[2003, 12, 31, 3, 12457]
[2004, 1, 1, 4, 8205]
[2004, 1, 2, 5, 10586]
[2004, 1, 3, 6, 8337]
[2004, 1, 4, 7, 7359]
[2004, 1, 5, 1, 11489]
[2004, 1, 6, 2, 12755]
[2004, 1, 7, 3, 12716]
[2004, 1, 8, 4, 12781]
[2004, 1, 9, 5, 12469]

**We've demonstrated that 2003 (overlap) is changed, but 2004 (no overlap) is unchanged**