
# Visualize Data - Transform JSON data to dataframe

**Table of Contents**

1. [Intro](#1.-Intro)
2. [Functions](#2.-Functions)

## 1. Intro

This notebook transform the raw data from both the US Census and Department of Education and transform it from JSON to a csv file. The objective is to flatten de data files so that they can be loaded to a PostgreSQL database on the next phase of the data pipeline.

In [2]:
#%load_ext watermark
import os
import pandas as pd
import numpy as np
import altair as alt
from adjustText import adjust_text 
from matplotlib import pyplot as plt

In [3]:
os.chdir(r"C:\Users\Leo\Documents\Masters\MADS\591 - MIlestone I\aggregated")

In [4]:
census = pd.read_csv('census_agg_zipch.csv')

In [5]:
def population_bubbles(data, variables, theme):
    population_bystate = pd.pivot_table(data[data['year']==2019], index = ['state'], values = variables, aggfunc=np.sum)
    population_bystate['Teens as % of Pop'] = population_bystate[variables[1]] / population_bystate[variables[0]]*100 
    format_dict = {'teen_pop': '{0:,}', 'total_pop': '{0:,}', 'pop_hhi150': '{0:,}', 'pop_hhi220': '{0:,}' , "Teens as % of Pop": '{:.2f}%'}
    population_bystate.reset_index(inplace=True)

    base = alt.Chart(population_bystate)

    bubbles = base.mark_circle().encode(
        alt.X('total_pop:Q', scale = alt.Scale(zero=False), axis= alt.Axis(title= 'Total Population by US State',titleFontSize=8)),
        alt.Y('Teens as % of Pop:Q', scale=alt.Scale(domain = [3,5]),
                                     axis = alt.Axis(title= ["{} Teens 15-17 as % of total population".format(theme)],
                                                     titleFontSize=8
                                                     )
             ),
        alt.Color('state:N', legend=None, scale=alt.Scale(scheme='rainbow')),
        alt.Size('total_pop:Q', scale = alt.Scale(domain = [100000, 8000000]), title='Total Population'),
        tooltip = ['state', alt.Tooltip(field = 'total_pop', type='quantitative', format = ','),
        alt.Tooltip('Teens as % of Pop:Q', format= '.2'),
        alt.Tooltip('teen_pop:Q', format= ',')]
    ).properties(
        width= 300,
        height=130
    ).interactive()#.configure_axisY(labelFontSize=2)
    
    text = base.mark_text(baseline='middle', fontSize=5, fontWeight='bold').encode(
        x = ('total_pop'),
        y=('Teens as % of Pop:Q'),
        text= alt.Text('state')
        )     

    final = alt.hconcat(bubbles+text).properties(title = {'text' : ['Total US Population by state with',
                            'proportion of {} teenagers ages 15-17 for 2019'.format(theme)], 'fontSize': 10, 'anchor':'middle'})#.configure_axisY(labelFontSize=2)

    return final

overall_teencomposition = population_bubbles(census, ['total_pop','teen_pop'], 'Overall')
hispanic_teencomposition = population_bubbles(census, ['total_pop','count_hteens'],'Hispanic' )
black_teencomposition = population_bubbles(census, ['total_pop','count_bteens' ], 'Black')
white_teencomposition = population_bubbles(census, ['total_pop','count_wteens' ], 'White')
upper_byrace = alt.hconcat(overall_teencomposition, hispanic_teencomposition).resolve_scale(color='independent')
lower_byrace = alt.hconcat(black_teencomposition, white_teencomposition).resolve_scale(color='independent')

upper_byrace & lower_byrace

In [6]:
census.head(1)
#teens = census_raw[census_raw['year']==2019].groupby('year')[['Males 15-17', 'Females 15-17']].sum()


Unnamed: 0,year,state,city,zip_code,total_pop,pop_hhi150,pop_hhi220,teen_pop,teen_males,teen_females,count_hteens,hisp_perc,count_bteens,black_perc,count_wteens,white_perc,last_teenpop,avg_teenpopchperc
0,2015,NY,New York,10001,23537,1168.0,7908.0,373,48,325,56,15,172,46,121,32,,-4.75


In [68]:
def teens_by_sex_bar(data):
    teen_by_sex = pd.pivot_table(data[data['year']==2019], index=['state'], values=['teen_pop', 'teen_males', 'teen_females','total_pop'], aggfunc=np.sum)
    
    teen_by_sex['Male % of Teen Pop'] = teen_by_sex['teen_males'] / teen_by_sex['teen_pop'] * 100
    teen_by_sex.reset_index(inplace=True)
    #print(teen_by_sex.sort_values(by='teen_pop', ascending=False))
    #teen_by_sex=teen_by_sex.sort_values(by='total_pop').head(5)
        
    teen_by_race = pd.pivot_table(data[data['year']==2019], index=['state'], values=['teen_males', 'teen_females','count_hteens','count_bteens','count_wteens','teen_pop'], aggfunc=np.sum)
    teen_by_race['total'] = teen_by_race['count_hteens'] + teen_by_race['count_bteens'] + teen_by_race['count_wteens']
    teen_by_race['count_other'] = (teen_by_race['teen_males']+teen_by_race['teen_females']) - teen_by_race['count_hteens'] - teen_by_race['count_bteens'] - teen_by_race['count_wteens']
    teen_by_race.reset_index(inplace=True)
    #print(teen_by_race.sort_values(by='teen_pop', ascending=False))

    #We decided to omit the breakdown by race due to data inconsistencies that would need additional investigation 
    left = alt.Chart(teen_by_race).transform_fold(
        ['count_hteens', 'count_bteens', 'count_wteens'],
        as_= ['column', 'value']
        ).mark_bar().encode(
        x=alt.X('value:Q', sort='descending', title='# of teenagers 15-17'),
        y=alt.Y('state:N', sort='-x', title=None, axis=alt.Axis(labels=False, ticks=False)), 
        tooltip = [alt.Tooltip('count_hteens:Q', format=','), alt.Tooltip('count_bteens:Q', format=','), alt.Tooltip('count_wteens:Q', format=',')],
        color=alt.Color('column:N', 
                scale=alt.Scale(scheme='bluepurple')
                )
        ).transform_window(
                    rank = 'rank(total)',
                    sort = [alt.SortField('sum(total)', order='descending')]
        ).transform_filter(
                    alt.datum.total > 500000
        ).properties(
            title = ['Teenage population 15-17 by race',
                                'by US states']
        )

    mid = alt.Chart(teen_by_sex).transform_fold(
        ['teen_males', 'teen_females'],
        as_=['column', 'value']
        ).mark_text().encode(   
        x = alt.X('value:Q', title=None),   
        y = alt.Y('state:N', sort='-x', title='State', axis=alt.Axis(ticks=False, grid = False))
        #text = alt.Text('state:N') 
        ).transform_window(
                        rank = 'rank(value)',
                        sort = [alt.SortField('sum(value)', order='descending')]
        ).transform_filter(
                        alt.datum.value > 250000
        ).properties(
        width=1
        )   


    right  = alt.Chart(teen_by_sex).transform_fold(
        ['teen_males', 'teen_females'],
        as_=['column', 'value']
        ).mark_bar().encode(       
        x = alt.X('value:Q', title='# of teenagers 15-17'),
        y = alt.Y('state:N', sort='-x' , title = 'state'),
        tooltip = [alt.Tooltip('teen_males:Q', format=','),alt.Tooltip('Male % of Teen Pop', format = '.3'), alt.Tooltip('teen_females:Q', format=',')],
        color = alt.Color('column:N', title='Sex',
                scale=alt.Scale(scheme='bluepurple'))
        ).transform_window(
                        rank = 'rank(value)',
                        sort = [alt.SortField('sum(value))', order='descending')]
                    ).transform_filter(
                        alt.datum.value> 245000    
        ).properties(title = ['Teenager population 15-17 by sex', 
                            'by US states 2019'])

    final = alt.hconcat(right)
    
    return final

teens_by_sex_bar(census)

In [10]:
def state_groupby(data, state_list, levels, region, debug=False):
    data = data[(data['state'].isin(state_list))&(data['total_pop']>1000)].copy()
    datagrouped = data.groupby(['year', 'state', 'city'])['total_pop', 'teen_pop'].sum()
    if debug:
        print(datagrouped.head(1))
    datagrouped['teen_pop_change']= datagrouped.sort_values('year').groupby(levels)['teen_pop'].pct_change()*100
    datagrouped['overall_pop_change']= datagrouped.sort_values('year').groupby(levels)['total_pop'].pct_change()*100
    
    data_change = pd.pivot_table(datagrouped, index= levels, values = ['total_pop','overall_pop_change', 'teen_pop', 'teen_pop_change'], aggfunc=np.mean)
    data_change.reset_index(inplace= True)
    if debug:
        print(data_change.head(4))

    alt.data_transformers.disable_max_rows()
    base = alt.Chart(data_change).mark_point(
        size = 30, 
        filled=True, 
        opacity=0.4, 
        clip=True
        ).encode(
        x= alt.X('overall_pop_change:Q', scale=alt.Scale(zero=False, domain = [-30,40]), title='Average Overall Population Change'),
        y = alt.Y('teen_pop_change', scale=alt.Scale(zero=False, domain=(-100,200)), title='Average Teen Population Change'),
        color = alt.Color('state:N', title='State', scale = alt.Scale(scheme = 'rainbow')),
        tooltip = ['state',  'city', 'total_pop','teen_pop']
    ).properties(title = {'text': ['5-year average change of Overall vs Teenager Population',
                            'by city with populations > 1000 in {} {} states'.format(len(state_list), region)]},
                width = 150,
                height = 150
    ).interactive()
    return base


midwest = ['MI','OH','IL', 'IN','MN','WI', 'IA','MO']
south = ['TX','LA','GA','FL','SC','MS','SC','TN', 'AL']
west = ['CA','OR','WA','NV','AZ','ID','UT','MT']
east = ['NY','NJ','MA','RI','DE','VA','PA','CT']

midwest_1000ch = state_groupby(census, midwest, ['state', 'city'], 'Midwest')
south_1000ch = state_groupby(census, south, ['state', 'city'],'Southern')
west_1000ch = state_groupby(census, west, ['state', 'city'],'Western')
east_1000ch = state_groupby(census, east, ['state', 'city'],'Eastern')
upper = alt.hconcat(midwest_1000ch, west_1000ch).resolve_scale(color='independent')
lower = alt.hconcat(east_1000ch, south_1000ch).resolve_scale(color='independent')

upper & lower

  datagrouped = data.groupby(['year', 'state', 'city'])['total_pop', 'teen_pop'].sum()
