In [1]:
import pandas as pd
import numpy as np

class display(object):
    """Display HTML representation of multiple objects"""
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""
    def __init__(self, *args):
        self.args = args

    def _repr_html_(self):
        return '\n'.join(self.template.format(a, eval(a)._repr_html_())
                         for a in self.args)

    def __repr__(self):
        return '\n\n'.join(a + '\n' + repr(eval(a))
                           for a in self.args)

In [2]:
import pandas as pd
areas = pd.read_csv('state-areas.csv')
abbrevs = pd.read_csv('state-abbrevs.csv')
pop = pd.read_csv('state-population.csv')


display('pop.head()', 'areas.head()', 'abbrevs.head()')


Unnamed: 0,state/region,ages,year,population
0,AL,under18,2012,1117489
1,AL,total,2012,4817528
2,AL,under18,2010,1130966
3,AL,total,2010,4785570
4,AL,under18,2011,1125763

Unnamed: 0,state,area (sq. mi)
0,Alabama,52423
1,Alaska,656425
2,Arizona,114006
3,Arkansas,53182
4,California,163707

Unnamed: 0,state,abbreviation
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


## Task1: Rank US states & territories by their 2010 population density

We’ll start with a many-to-one merge which will give us the full state name within the population dataframe. We want to merge based on the "state/region" column of pop, and the "abbreviation" column of abbrevs. We’ll use how='outer' to make sure no data is thrown away due to mis-matched labels.

In [3]:
merged = pd.merge(pop, abbrevs, how='outer',
                  left_on='state/region', right_on='abbreviation')
    

In [5]:
merged.head()

Unnamed: 0,state/region,ages,year,population,state,abbreviation
0,AL,under18,2012,1117489,Alabama,AL
1,AL,total,2012,4817528,Alabama,AL
2,AL,under18,2010,1130966,Alabama,AL
3,AL,total,2010,4785570,Alabama,AL
4,AL,under18,2011,1125763,Alabama,AL


In [6]:
merged.isnull().any()

state/region    False
ages            False
year            False
population       True
state            True
abbreviation     True
dtype: bool

In [7]:
merged[merged['population'].isnull()]

Unnamed: 0,state/region,ages,year,population,state,abbreviation
2448,PR,under18,1990,,,
2449,PR,total,1990,,,
2450,PR,total,1991,,,
2451,PR,under18,1991,,,
2452,PR,total,1993,,,
2453,PR,under18,1993,,,
2454,PR,under18,1992,,,
2455,PR,total,1992,,,
2456,PR,under18,1994,,,
2457,PR,total,1994,,,


We can quickly infer the issue: our population data includes entries for Puerto Rico (PR) and the United States as a whole (USA), while these entries do not appear in the state abbreviation key. We can fix these quickly by filling-in appropriate entries:

In [8]:
merged['state/region'][merged['state'].isnull()].unique()

array(['PR', 'USA'], dtype=object)

In [9]:
merged.loc[merged['state/region'] == 'PR', 'state'] = 'Puerto Rico'
merged.loc[merged['state/region'] == 'USA', 'state'] = 'United States'
merged.isnull().any()


state/region    False
ages            False
year            False
population       True
state           False
abbreviation     True
dtype: bool

In [10]:
final = pd.merge(merged, areas, on='state', how='left')
final.head()


Unnamed: 0,state/region,ages,year,population,state,abbreviation,area (sq. mi)
0,AL,under18,2012,1117489,Alabama,AL,52423
1,AL,total,2012,4817528,Alabama,AL,52423
2,AL,under18,2010,1130966,Alabama,AL,52423
3,AL,total,2010,4785570,Alabama,AL,52423
4,AL,under18,2011,1125763,Alabama,AL,52423


In [11]:
final.isnull().any()

state/region     False
ages             False
year             False
population        True
state            False
abbreviation      True
area (sq. mi)     True
dtype: bool

There are NULLs in the area column; we can take a look to see which regions were ignored here:

In [12]:
final['state'][final['area (sq. mi)'].isnull()].unique()

array(['United States'], dtype=object)

We see that our areas DataFrame does not contain the area of the United States as a whole. We could insert the appropriate value (using, e.g. the sum of all state areas), but in this case we’ll just drop the null values because the population density of the entire US is not relevant to our current discussion:

In [13]:
final.dropna(inplace=True)
final.head()

Unnamed: 0,state/region,ages,year,population,state,abbreviation,area (sq. mi)
0,AL,under18,2012,1117489,Alabama,AL,52423
1,AL,total,2012,4817528,Alabama,AL,52423
2,AL,under18,2010,1130966,Alabama,AL,52423
3,AL,total,2010,4785570,Alabama,AL,52423
4,AL,under18,2011,1125763,Alabama,AL,52423


Now we have all the data we need. To answer the question of interest, let’s first select the portion of the data corresponding with the year 2010, and the total population. 

In [14]:
data2010 = final.query("year == 2010 & ages == 'total'")
data2010.head()

Unnamed: 0,state/region,ages,year,population,state,abbreviation,area (sq. mi)
3,AL,total,2010,4785570,Alabama,AL,52423
91,AK,total,2010,713868,Alaska,AK,656425
101,AZ,total,2010,6408790,Arizona,AZ,114006
189,AR,total,2010,2922280,Arkansas,AR,53182
197,CA,total,2010,37333601,California,CA,163707


Now let’s compute the population density and display it in order. We’ll start by re-indexing our data on the state, and then compute the result.

In [15]:
data2010.set_index('state', inplace=True)
density = data2010['population'] / data2010['area (sq. mi)']

In [16]:
density.sort(ascending=False)
density.head()

  if __name__ == '__main__':


state
District of Columbia    8898.897059
New Jersey              1009.253268
Rhode Island             681.339159
Connecticut              645.600649
Massachusetts            621.815538
dtype: float64

In [17]:
density.tail()

state
South Dakota    10.583512
North Dakota     9.537565
Montana          6.736171
Wyoming          5.768079
Alaska           1.087509
dtype: float64