# US States Data

## Rank US states and territories by their 2010 population density

In [None]:
import pandas as pd

In [None]:
# Download the three CSV files read by the next cell (state populations,
# state areas and state abbreviations).
# The leading "!" is IPython/Jupyter shell-escape syntax, so these lines only
# work inside a notebook or IPython session, not as plain Python.
# curl's -O flag saves each file under its remote file name in the current
# working directory.
!curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-population.csv
!curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-areas.csv
!curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-abbrevs.csv


In [46]:
# Load the three downloaded CSV files, one DataFrame per file:
# populations, areas and the abbreviation lookup, in that order.
pop, areas, abbrevs = map(
    pd.read_csv,
    ('state-population.csv', 'state-areas.csv', 'state-abbrevs.csv'),
)

# Attach the full state name to every population row.
# An outer join is used so that no rows are thrown away when the labels in
# the two tables do not match; mismatches show up as nulls instead.
merged = pd.merge(pop, abbrevs, how='outer',
                  left_on='state/region', right_on='abbreviation')

# 'abbreviation' duplicates 'state/region' after the join, so drop it.
# The axis is named by keyword (columns=) because the positional form
# drop(label, 1) is deprecated in pandas 1.3 and rejected by pandas 2.0.
merged = merged.drop(columns='abbreviation')

# Exploratory checks. A notebook only echoes the last expression of a cell,
# so run these in a cell of their own to see their output.

# Which columns contain nulls?
merged.isnull().any()

# Which rows have no population?
merged[merged['population'].isnull()]
# all the null population values are from Puerto Rico prior to the year 2000

# Some of the new 'state' entries are also null, which means there was no
# corresponding entry in the abbrevs key. Find the regions lacking a match:
merged.loc[merged['state'].isnull(), 'state/region'].unique()

# Fix these by filling in the appropriate full names
merged.loc[merged['state/region'] == 'PR', 'state'] = 'Puerto Rico'
merged.loc[merged['state/region'] == 'USA', 'state'] = 'United States'

# Join the areas table on the full state name. A left join keeps every
# population row even when no area is known for it.
final = pd.merge(merged, areas, on='state', how='left')

# Exploratory checks (only echoed when run as the last expression of a cell).

# Which columns contain nulls?
final.isnull().any()

# Which regions have no area?
final['state'][final['area (sq. mi)'].isnull()].unique()
# areas DataFrame does not contain the area of the United States as a whole.

# Approximate the missing total with the sum of the known areas, counting
# each region exactly once. De-duplicating on 'state' (rather than on the
# area value itself) keeps two regions with the same area from being
# collapsed into one. sum() skips the null entries.
# NOTE(review): the total covers every region that matched the areas table,
# so non-state regions such as Puerto Rico are included if listed there.
usarea = final.drop_duplicates(subset='state')['area (sq. mi)'].sum()

# Fill in the total area. The result is assigned back to the column:
# calling fillna(inplace=True) on a chained selection only changes a
# temporary copy and leaves `final` untouched.
final['area (sq. mi)'] = final['area (sq. mi)'].fillna(usarea)

# Select the rows for the year 2010 and the total population, indexed by
# state name so that the density Series below is labelled by state.
data2010 = final.query("year == 2010 & ages == 'total'").set_index('state')

# Population density: population divided by area in square miles
density = data2010['population'] / data2010['area (sq. mi)']

# Rank from most to least dense (null densities sort last) and leave the
# result as the cell's last expression so the notebook actually displays it;
# an in-place sort returns None and would show nothing.
density = density.sort_values(ascending=False)
density