In [1]:
import html

import numpy as np
import pandas as pd

In [2]:
# Helper class that shows the results of several expressions side by side

class display(object):
    """Render several expressions next to each other, each under its source text.

    Every positional argument is a string holding a Python expression, e.g.
    ``display('population.head()', 'stateabb.head()')``.  Each expression is
    evaluated when the object is rendered, and the result is shown beneath
    the expression text.

    Results that provide ``_repr_html_`` (such as DataFrames) are embedded
    with their own HTML; anything else is shown as its escaped ``repr``
    inside a ``<pre>`` element.
    """
    # One left-floated panel per expression: {0} is the label, {1} the body.
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""

    def __init__(self, *args):
        """Store the expression strings; nothing is evaluated yet."""
        self.args = args

    @staticmethod
    def _evaluate(expression):
        """Evaluate one expression string against this module's globals."""
        # NOTE(review): eval() runs arbitrary code. Only pass expression
        # strings written in the notebook itself, never external input.
        return eval(expression, globals())

    def _repr_html_(self):
        """Return the HTML panels for all expressions, joined by newlines."""
        panels = []
        for expression in self.args:
            value = self._evaluate(expression)
            render = getattr(value, '_repr_html_', None)
            body = render() if callable(render) else None
            if body is None:
                # No rich representation available: show the plain repr.
                body = '<pre>{0}</pre>'.format(
                    html.escape(repr(value), quote=False))
            # Escape the label so characters such as '<' or '&' in the
            # expression text cannot corrupt the surrounding markup.
            label = html.escape(expression, quote=False)
            panels.append(self.template.format(label, body))
        return '\n'.join(panels)

    def __repr__(self):
        """Return the plain-text form: each expression followed by its repr."""
        return '\n\n'.join(
            expression + '\n' + repr(self._evaluate(expression))
            for expression in self.args)

**Data Set Details**
* File Name: state-population.csv - Population details of US States
* File Name: state-abbrevs.csv    - State abbreviations and state names
* File Name: state-areas.csv      - Area details of the state

#### Question: Rank US States and territories by their 2010 population density

#### Import the DataSet

In [3]:
# Read the three CSV files from the sibling DataSets directory.
# Forward slashes resolve on Windows, macOS and Linux alike and keep
# backslash escape sequences out of the string literals.
population = pd.read_csv("../DataSets/state-population.csv")
stateabb = pd.read_csv("../DataSets/state-abbrevs.csv")
statearea = pd.read_csv("../DataSets/state-areas.csv")
display('population.head()', 'stateabb.head()','statearea.head()')

Unnamed: 0,state/region,ages,year,population
0,AL,under18,2012,1117489.0
1,AL,total,2012,4817528.0
2,AL,under18,2010,1130966.0
3,AL,total,2010,4785570.0
4,AL,under18,2011,1125763.0

Unnamed: 0,state,abbreviation
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA

Unnamed: 0,state,area (sq. mi)
0,Alabama,52423
1,Alaska,656425
2,Arizona,114006
3,Arkansas,53182
4,California,163707


----

#### Data Cleaning

Inspecting the `population` dataset shows that it contains many records for years that are not relevant to the question asked. It also contains the `under18` population figures, which are not required.

The `population` dataset is therefore reduced to the required records only; all other rows are removed.

In [4]:
# Keep only the rows where year is 2010 and ages is 'total',
# then discard the 'ages' column.

is_2010_total = (population['ages'] == 'total') & (population['year'] == 2010)
population = population.loc[is_2010_total].drop('ages', axis='columns')
population.head()

Unnamed: 0,state/region,year,population
3,AL,2010,4785570.0
91,AK,2010,713868.0
101,AZ,2010,6408790.0
189,AR,2010,2922280.0
197,CA,2010,37333601.0


#### Checking for any NaN and Null Values in the dataset

In [5]:
# Per column: is at least one value missing?
population.isna().any()

state/region    False
year            False
population      False
dtype: bool

In [6]:
# Per column: is at least one value missing?
stateabb.isna().any()

state           False
abbreviation    False
dtype: bool

In [7]:
# Per column: is at least one value missing?
statearea.isna().any()

state            False
area (sq. mi)    False
dtype: bool

#### Merge the datasets

* Datasets `population` and `stateabb` are merged by matching `state/region` against `abbreviation`
* The `abbreviation` column, which duplicates `state/region` after the merge, is dropped
* The resulting dataset is saved in a new DataFrame, `df_data`

In [8]:
# Attach the full state name to every population row via its code.
# The inner join keeps only codes that are present in `stateabb`, so report
# any codes that will be excluded instead of losing them silently.
known_codes = population['state/region'].isin(stateabb['abbreviation'])
dropped_codes = population.loc[~known_codes, 'state/region'].unique().tolist()
if dropped_codes:
    print("Regions without an entry in stateabb (excluded from df_data):",
          dropped_codes)

df_data = (
    population
    .merge(stateabb, how='inner',
           left_on='state/region', right_on='abbreviation')
    .drop('abbreviation', axis='columns')  # same values as 'state/region'
)
df_data.head()

Unnamed: 0,state/region,year,population,state
0,AL,2010,4785570.0,Alabama
1,AK,2010,713868.0,Alaska
2,AZ,2010,6408790.0,Arizona
3,AR,2010,2922280.0,Arkansas
4,CA,2010,37333601.0,California


* Datasets `df_data` and `statearea` are merged on their shared `state` column

In [9]:
# Both frames name the join column 'state', so a single `on=` key is enough.
df_data = df_data.merge(statearea, on='state')
df_data.head()

Unnamed: 0,state/region,year,population,state,area (sq. mi)
0,AL,2010,4785570.0,Alabama,52423
1,AK,2010,713868.0,Alaska,656425
2,AZ,2010,6408790.0,Arizona,114006
3,AR,2010,2922280.0,Arkansas,53182
4,CA,2010,37333601.0,California,163707


In [10]:
# Per column: did the merges leave any missing values?
df_data.isna().any()

state/region     False
year             False
population       False
state            False
area (sq. mi)    False
dtype: bool

In [11]:
# Population density: population divided by area in square miles

df_data['density'] = df_data['population'].div(df_data['area (sq. mi)'])

In [12]:
# Order the rows from highest to lowest density

df_data = df_data.sort_values(by='density', ascending=False)

In [13]:
# Show the five highest-density and five lowest-density rows

ranking_views = (
    "df_data[['state','density']].head(5)",
    "df_data[['state','density']].tail(5)",
)
display(*ranking_views)

Unnamed: 0,state,density
8,District of Columbia,8898.897059
30,New Jersey,1009.253268
39,Rhode Island,681.339159
6,Connecticut,645.600649
21,Massachusetts,621.815538

Unnamed: 0,state,density
41,South Dakota,10.583512
34,North Dakota,9.537565
26,Montana,6.736171
50,Wyoming,5.768079
1,Alaska,1.087509


*****