In [2]:
import pandas as pd
import numpy as np
from pandas import DataFrame,Series

- 需求：
    - 导入文件，查看原始数据
    - 将人口数据和各州简称数据进行合并
    - 将合并的数据中重复的abbreviation列进行删除
    - 查看存在缺失数据的列
    - 找到有哪些state/region使得state的值为NaN，进行去重操作
    - 为找到的这些state/region的state项补上正确的值，从而去除掉state这一列的所有NaN
    - 合并各州面积数据areas
    - 我们会发现area(sq.mi)这一列有缺失数据，找出是哪些行
    - 去除含有缺失数据的行
    - 找出2010年的全民人口数据
    - 计算各州的人口密度
    - 排序，并找出人口密度最高的州

In [6]:
# Load the state-name / abbreviation lookup table and preview its first three rows.
abbrev_csv = './data/state-abbrevs.csv'
abb = pd.read_csv(abbrev_csv)
abb.head(n=3)

Unnamed: 0,state,abbreviation
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ


In [7]:
# Load the population table (one row per state/region, age group and year)
# and preview its first three rows.
population_csv = './data/state-population.csv'
pop = pd.read_csv(population_csv)
pop.head(n=3)

Unnamed: 0,state/region,ages,year,population
0,AL,under18,2012,1117489.0
1,AL,total,2012,4817528.0
2,AL,under18,2010,1130966.0


In [8]:
# Load the per-state area table and preview its first three rows.
areas_csv = './data/state-areas.csv'
area = pd.read_csv(areas_csv)
area.head(n=3)

Unnamed: 0,state,area (sq. mi)
0,Alabama,52423
1,Alaska,656425
2,Arizona,114006


In [9]:
# Merge the abbreviation table with the population table.
# The join key is 'abbreviation' on the left and 'state/region' on the right;
# an outer join keeps rows from both sides even when no partner row exists.
abb_pop = pd.merge(
    abb,
    pop,
    left_on='abbreviation',
    right_on='state/region',
    how='outer',
)
abb_pop.head(n=3)

Unnamed: 0,state,abbreviation,state/region,ages,year,population
0,Alabama,AL,AL,under18,2012,1117489.0
1,Alabama,AL,AL,total,2012,4817528.0
2,Alabama,AL,AL,under18,2010,1130966.0


In [10]:
# Remove the 'abbreviation' column: it was only the left-hand join key, and
# 'state/region' already carries the code for every row.
# The frame is reassigned (no inplace=True) and errors='ignore' is passed so the
# cell is safe to re-run: on a second run the column is already gone and the
# frame is simply left unchanged instead of raising KeyError.
abb_pop = abb_pop.drop(columns='abbreviation', errors='ignore')

In [11]:
# Preview the merged frame after the 'abbreviation' column has been removed.
abb_pop.head(n=3)

Unnamed: 0,state,state/region,ages,year,population
0,Alabama,AL,under18,2012,1117489.0
1,Alabama,AL,total,2012,4817528.0
2,Alabama,AL,under18,2010,1130966.0


In [12]:
# Inspect which columns contain missing data: a column whose non-null count is
# lower than the total number of entries has missing values.
abb_pop.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2544 entries, 0 to 2543
Data columns (total 5 columns):
state           2448 non-null object
state/region    2544 non-null object
ages            2544 non-null object
year            2544 non-null int64
population      2524 non-null float64
dtypes: float64(1), int64(1), object(3)
memory usage: 119.2+ KB


In [14]:
# Per-column flag: True when the column holds at least one missing value.
# (any() reduces over rows by default, i.e. axis=0.)
abb_pop.isnull().any()

state            True
state/region    False
ages            False
year            False
population       True
dtype: bool

In [15]:
# Goal of the next cells: find which 'state/region' codes have a missing 'state'
# (full name), i.e. collect the abbreviations whose full name is NaN and
# de-duplicate them.
abb_pop.head(n=5)

Unnamed: 0,state,state/region,ages,year,population
0,Alabama,AL,under18,2012,1117489.0
1,Alabama,AL,total,2012,4817528.0
2,Alabama,AL,under18,2010,1130966.0
3,Alabama,AL,total,2010,4785570.0
4,Alabama,AL,under18,2011,1125763.0


In [16]:
# Step 1: boolean mask marking the rows whose 'state' value is missing.
abb_pop.loc[:, 'state'].isnull()

0       False
1       False
2       False
3       False
4       False
5       False
6       False
7       False
8       False
9       False
10      False
11      False
12      False
13      False
14      False
15      False
16      False
17      False
18      False
19      False
20      False
21      False
22      False
23      False
24      False
25      False
26      False
27      False
28      False
29      False
        ...  
2514     True
2515     True
2516     True
2517     True
2518     True
2519     True
2520     True
2521     True
2522     True
2523     True
2524     True
2525     True
2526     True
2527     True
2528     True
2529     True
2530     True
2531     True
2532     True
2533     True
2534     True
2535     True
2536     True
2537     True
2538     True
2539     True
2540     True
2541     True
2542     True
2543     True
Name: state, Length: 2544, dtype: bool

In [17]:
# Step 2: use the mask to select the rows whose 'state' value is missing.
abb_pop[abb_pop['state'].isnull()]

Unnamed: 0,state,state/region,ages,year,population
2448,,PR,under18,1990,
2449,,PR,total,1990,
2450,,PR,total,1991,
2451,,PR,under18,1991,
2452,,PR,total,1993,
2453,,PR,under18,1993,
2454,,PR,under18,1992,
2455,,PR,total,1992,
2456,,PR,under18,1994,
2457,,PR,total,1994,


In [33]:
# Step 3: take the 'state/region' codes of those rows and de-duplicate them.
# Row mask and column label are passed to a single .loc call instead of
# chaining two indexing operations.
# Note: this has to run before the fill cells below; once 'state' has been
# filled there are no null rows left and the result is an empty array.
abb_pop.loc[abb_pop['state'].isnull(), 'state/region'].unique()

array([], dtype=object)

In [23]:
# Fill in the correct 'state' value for the codes found above, so that the
# 'state' column no longer contains NaN.
# Start with the rows belonging to PR.
# Step 1: select the rows whose 'state/region' code is 'PR'.
abb_pop[abb_pop['state/region'] == 'PR']

Unnamed: 0,state,state/region,ages,year,population
2448,,PR,under18,1990,
2449,,PR,total,1990,
2450,,PR,total,1991,
2451,,PR,under18,1991,
2452,,PR,total,1993,
2453,,PR,under18,1993,
2454,,PR,under18,1992,
2455,,PR,total,1992,
2456,,PR,under18,1994,
2457,,PR,total,1994,


In [24]:
# Step 2: collect the row labels of the PR rows selected in step 1.
indexs = abb_pop.index[abb_pop['state/region'] == 'PR']
indexs

Int64Index([2448, 2449, 2450, 2451, 2452, 2453, 2454, 2455, 2456, 2457, 2458,
            2459, 2460, 2461, 2462, 2463, 2464, 2465, 2466, 2467, 2468, 2469,
            2470, 2471, 2472, 2473, 2474, 2475, 2476, 2477, 2478, 2479, 2480,
            2481, 2482, 2483, 2484, 2485, 2486, 2487, 2488, 2489, 2490, 2491,
            2492, 2493, 2494, 2495],
           dtype='int64')

In [27]:
# Step 3: fill the missing 'state' of the rows in `indexs` with PR's full name.
# The value must be the real name, not a placeholder: 'state' is the join key
# of the later merge with the area table, and a made-up string such as
# 'PPPRRR' can never match a state name there.
abb_pop.loc[indexs, 'state'] = 'Puerto Rico'

In [30]:
# Fill the missing 'state' values that belong to USA.
# Step 1: select the rows whose 'state/region' code is 'USA'.
abb_pop[abb_pop['state/region'] == 'USA']

Unnamed: 0,state,state/region,ages,year,population
2496,,USA,under18,1990,64218512.0
2497,,USA,total,1990,249622814.0
2498,,USA,total,1991,252980942.0
2499,,USA,under18,1991,65313018.0
2500,,USA,under18,1992,66509177.0
2501,,USA,total,1992,256514231.0
2502,,USA,total,1993,259918595.0
2503,,USA,under18,1993,67594938.0
2504,,USA,under18,1994,68640936.0
2505,,USA,total,1994,263125826.0


In [31]:
# Step 2: collect the row labels of the USA rows (overwrites the PR labels
# previously stored in `indexs`).
indexs = abb_pop.index[abb_pop['state/region'] == 'USA']

In [32]:
# Step 3: fill the 'state' value of the USA rows (labels held in `indexs`)
# with the full name.
abb_pop.loc[indexs,'state'] = 'United States'

In [35]:
# Attach each state's area by joining on the shared 'state' column.
# An outer join keeps rows from both tables; a row without a partner on the
# other side gets NaN in the columns coming from that side.
abb_pop_area = pd.merge(abb_pop, area, how='outer', on='state')

In [36]:
# Preview the first five rows of the population + area frame.
abb_pop_area.head(n=5)

Unnamed: 0,state,state/region,ages,year,population,area (sq. mi)
0,Alabama,AL,under18,2012.0,1117489.0,52423.0
1,Alabama,AL,total,2012.0,4817528.0,52423.0
2,Alabama,AL,under18,2010.0,1130966.0,52423.0
3,Alabama,AL,total,2010.0,4785570.0,52423.0
4,Alabama,AL,under18,2011.0,1125763.0,52423.0


In [41]:
# The 'area (sq. mi)' column contains missing values; collect the labels of
# the rows where it is null.
drop_indexs = abb_pop_area.index[abb_pop_area['area (sq. mi)'].isnull()]

In [40]:
# Remove every row whose area is missing.
# dropna(subset=...) replaces drop(labels=drop_indexs, inplace=True): it does
# not rely on a previously computed (and possibly stale) `drop_indexs`, and
# re-running the cell is a no-op instead of a KeyError for labels that were
# already removed.
abb_pop_area = abb_pop_area.dropna(subset=['area (sq. mi)'])

In [45]:
# Select the total-population rows (ages == "total") for the year 2010.
# `and` is spelled out instead of `&`: in plain Python `&` binds tighter than
# `==`, and the unparenthesised `&` form only works because pandas' query
# parser re-prioritises it. The explicit form reads the same under either rule.
abb_pop_area.query('ages == "total" and year == 2010')

Unnamed: 0,state,state/region,ages,year,population,area (sq. mi)
3,Alabama,AL,total,2010.0,4785570.0,52423.0
91,Alaska,AK,total,2010.0,713868.0,656425.0
101,Arizona,AZ,total,2010.0,6408790.0,114006.0
189,Arkansas,AR,total,2010.0,2922280.0,53182.0
197,California,CA,total,2010.0,37333601.0,163707.0
283,Colorado,CO,total,2010.0,5048196.0,104100.0
293,Connecticut,CT,total,2010.0,3579210.0,5544.0
379,Delaware,DE,total,2010.0,899711.0,1954.0
389,District of Columbia,DC,total,2010.0,605125.0,68.0
475,Florida,FL,total,2010.0,18846054.0,65758.0


In [48]:
# Compute the population density ('midu') as population divided by area.
# No filter is applied, so the value is computed for every row of the frame
# (every year and every age group), not only for the 2010 totals.
abb_pop_area['midu'] = (
    abb_pop_area['population']
    / abb_pop_area['area (sq. mi)']
)
abb_pop_area.head(n=5)

Unnamed: 0,state,state/region,ages,year,population,area (sq. mi),midu
0,Alabama,AL,under18,2012.0,1117489.0,52423.0,21.316769
1,Alabama,AL,total,2012.0,4817528.0,52423.0,91.897221
2,Alabama,AL,under18,2010.0,1130966.0,52423.0,21.573851
3,Alabama,AL,total,2010.0,4785570.0,52423.0,91.287603
4,Alabama,AL,under18,2011.0,1125763.0,52423.0,21.474601


In [51]:
# Find the state with the highest density: sort the rows by 'midu' in
# descending order so the densest entries come first.
abb_pop_area.sort_values('midu', ascending=False)

Unnamed: 0,state,state/region,ages,year,population,area (sq. mi),midu
391,District of Columbia,DC,total,2013.0,646449.0,68.0,9506.602941
385,District of Columbia,DC,total,2012.0,633427.0,68.0,9315.102941
387,District of Columbia,DC,total,2011.0,619624.0,68.0,9112.117647
431,District of Columbia,DC,total,1990.0,605321.0,68.0,8901.779412
389,District of Columbia,DC,total,2010.0,605125.0,68.0,8898.897059
426,District of Columbia,DC,total,1991.0,600870.0,68.0,8836.323529
429,District of Columbia,DC,total,1992.0,597567.0,68.0,8787.750000
422,District of Columbia,DC,total,1993.0,595302.0,68.0,8754.441176
392,District of Columbia,DC,total,2009.0,592228.0,68.0,8709.235294
425,District of Columbia,DC,total,1994.0,589240.0,68.0,8665.294118
