In [1]:
import pandas as pd
import io

def disp(*df):
    for d in df:
        print(d.head())
        print('-----------------------')

pop = pd.read_csv('https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-population.csv')
areas = pd.read_csv('https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-areas.csv')
abbrevs = pd.read_csv('https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-abbrevs.csv')

In [2]:
disp(pop, areas, abbrevs)

  state/region     ages  year  population
0           AL  under18  2012   1117489.0
1           AL    total  2012   4817528.0
2           AL  under18  2010   1130966.0
3           AL    total  2010   4785570.0
4           AL  under18  2011   1125763.0
-----------------------
        state  area (sq. mi)
0     Alabama          52423
1      Alaska         656425
2     Arizona         114006
3    Arkansas          53182
4  California         163707
-----------------------
        state abbreviation
0     Alabama           AL
1      Alaska           AK
2     Arizona           AZ
3    Arkansas           AR
4  California           CA
-----------------------


In [3]:
pop.shape

(2544, 4)

### 1. 정답

In [4]:
merged = pd.merge(pop, abbrevs, how='outer',
                  left_on='state/region', right_on='abbreviation')
merged = merged.drop('abbreviation', 1) # drop duplicate info
merged.head()

Unnamed: 0,state/region,ages,year,population,state
0,AL,under18,2012,1117489.0,Alabama
1,AL,total,2012,4817528.0,Alabama
2,AL,under18,2010,1130966.0,Alabama
3,AL,total,2010,4785570.0,Alabama
4,AL,under18,2011,1125763.0,Alabama


### 2. 정답

In [5]:
merged.loc[merged['state/region'] == 'PR', 'state'] = 'Puerto Rico'
merged.loc[merged['state/region'] == 'USA', 'state'] = 'United States'
merged.loc[merged['state/region'] == 'PR', 'state'].head()

2448    Puerto Rico
2449    Puerto Rico
2450    Puerto Rico
2451    Puerto Rico
2452    Puerto Rico
Name: state, dtype: object

### 3. 정답

In [6]:
final = pd.merge(merged, areas, on='state', how='left')
final.head()

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
0,AL,under18,2012,1117489.0,Alabama,52423.0
1,AL,total,2012,4817528.0,Alabama,52423.0
2,AL,under18,2010,1130966.0,Alabama,52423.0
3,AL,total,2010,4785570.0,Alabama,52423.0
4,AL,under18,2011,1125763.0,Alabama,52423.0


### 4. 정답

In [7]:
# query
final.dropna(inplace=True)
data2010 = final.query("year == 2010 & ages == 'total'")
data2010.set_index('state', inplace=True)
density = data2010['population'] / data2010['area (sq. mi)']
density.sort_values(ascending=False, inplace=True)
density.head()

state
District of Columbia    8898.897059
Puerto Rico             1058.665149
New Jersey              1009.253268
Rhode Island             681.339159
Connecticut              645.600649
dtype: float64

In [9]:
# subset
data2010 = final[(final.year == 2010) & (final.ages == 'total')]
data2010['density'] = data2010['population'] / data2010['area (sq. mi)']
data2010[['state', 'density']].sort_values(by=['density'], ascending=False).head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,state,density
389,District of Columbia,8898.897059
2490,Puerto Rico,1058.665149
1445,New Jersey,1009.253268
1914,Rhode Island,681.339159
293,Connecticut,645.600649


In [30]:
# groupby 
group = final.groupby(['year', 'ages'])
data2010 = group.get_group((2010, 'total'))
data2010['density'] = data2010['population'] / data2010['area (sq. mi)']
data2010[['state', 'density']].sort_values(by=['density'], ascending=False).head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,state,density
389,District of Columbia,8898.897059
2490,Puerto Rico,1058.665149
1445,New Jersey,1009.253268
1914,Rhode Island,681.339159
293,Connecticut,645.600649
