## 美国各州人口数据分析

In [74]:
# Imports for the names used by the cells in this notebook (Series, DataFrame, np, pd).
# Without them the notebook fails with NameError on a fresh kernel.
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

# unique() is the de-duplication method: build a Series with repeated names to try it on.
s = Series(['Tom','Lucy','Tom','dancer','Lucy'])

In [75]:
# Distinct values of s, returned as an array with the duplicates removed.
s.unique()

array(['Tom', 'Lucy', 'dancer'], dtype=object)

In [77]:
# NumPy arrays have a de-duplication function as well: np.unique.
n = np.asarray([1, 2, 1, 3, 2, 4, 5])
np.unique(n)

array([1, 2, 3, 4, 5])

In [78]:
# Small demo frame with repeated names; column order follows the dict: name, age.
n = DataFrame(
    data={
        'name': ['Tom', 'Lucy', 'Tom', 'dancer', 'Lucy'],
        'age': [12, 13, 12, 11, 15],
    }
)
n

Unnamed: 0,name,age
0,Tom,12
1,Lucy,13
2,Tom,12
3,dancer,11
4,Lucy,15


In [79]:
# Column dtypes of the demo frame (name is object, age is int64).
n.dtypes

name    object
age      int64
dtype: object

In [81]:
# query(): select rows with an expression string over column names (here: Lucy rows with age above 14).
n.query("name=='Lucy' & age > 14")

Unnamed: 0,name,age
4,Lucy,15


In [1]:
# query(): conditional row-selection method.
# NOTE(review): this cell needs `n` from the DataFrame cell above; run on a fresh kernel
# (execution count 1) it raises NameError, as the recorded output shows.
n.query("name == 'Lucy' & age>14")  

NameError: name 'n' is not defined

首先读取数据文件，并查看数据样本

In [105]:
# Load the state-name / abbreviation lookup table and preview its first rows.
abb = pd.read_csv('./state-abbrevs.csv')
abb.head()

Unnamed: 0,state,abbreviation
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


In [106]:
# Load the population table (columns: state/region code, age group, year, population) and preview it.
pop = pd.read_csv('./state-population.csv')
pop.head()

Unnamed: 0,state/region,ages,year,population
0,AL,under18,2012,1117489.0
1,AL,total,2012,4817528.0
2,AL,under18,2010,1130966.0
3,AL,total,2010,4785570.0
4,AL,under18,2011,1125763.0


In [107]:
# Load the per-state area table (area in square miles) and preview it.
areas = pd.read_csv('./state-areas.csv')
areas.head()

Unnamed: 0,state,area (sq. mi)
0,Alabama,52423
1,Alaska,656425
2,Arizona,114006
3,Arkansas,53182
4,California,163707


合并pop与abbrevs两个DataFrame，分别依据state/region列和abbreviation列来合并。

为了保留所有信息，使用外合并。

In [109]:
# Outer-join the abbreviation table with the population table:
# abb's 'abbreviation' is matched against pop's 'state/region', keeping unmatched rows from both sides.
abb_pop = pd.merge(
    abb,
    pop,
    how='outer',
    left_on='abbreviation',
    right_on='state/region',
)

去除abbreviation的那一列（axis=1）

In [None]:
# Equivalent alternative spelling: abb_pop.drop(labels='abbreviation', axis=1, inplace=True)

In [110]:
# Remove the 'abbreviation' column; the join key 'state/region' is kept.
# Reassigning with errors='ignore' (instead of inplace=True) makes this cell safe to re-run:
# a second run finds no 'abbreviation' column and leaves the frame unchanged rather than raising KeyError.
abb_pop = abb_pop.drop(columns='abbreviation', errors='ignore')

In [111]:
# Preview the merged frame after the 'abbreviation' column has been dropped.
abb_pop.head()

Unnamed: 0,state,state/region,ages,year,population
0,Alabama,AL,under18,2012,1117489.0
1,Alabama,AL,total,2012,4817528.0
2,Alabama,AL,under18,2010,1130966.0
3,Alabama,AL,total,2010,4785570.0
4,Alabama,AL,under18,2011,1125763.0


查看存在缺失数据的列。

使用.isnull().any()，只要某一列存在一个缺失数据，该列就会显示True。

In [114]:
# Per column: True if the column contains at least one missing value (axis=0 reduces over the rows).
abb_pop.isnull().any(axis=0)

state            True
state/region    False
ages            False
year            False
population       True
dtype: bool

In [112]:
# Same per-column check without an explicit axis; the output matches the axis=0 call.
abb_pop.isnull().any()

state            True
state/region    False
ages            False
year            False
population       True
dtype: bool

In [115]:
# info() also reveals missing data: compare each column's non-null count with the total number of entries.
abb_pop.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2544 entries, 0 to 2543
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   state         2448 non-null   object 
 1   state/region  2544 non-null   object 
 2   ages          2544 non-null   object 
 3   year          2544 non-null   int64  
 4   population    2524 non-null   float64
dtypes: float64(1), int64(1), object(3)
memory usage: 119.2+ KB


查看缺失数据

In [133]:
# Key technique: a boolean condition can be used to filter a DataFrame.
# Here the condition marks every row that has a missing value in at least one column.
cond = abb_pop.isna().any(axis='columns')
cond

0       False
1       False
2       False
3       False
4       False
        ...  
2539     True
2540     True
2541     True
2542     True
2543     True
Length: 2544, dtype: bool

In [134]:
# Use the condition to pull out the rows with missing data.
# Work on an independent copy so that edits to temp do not touch abb_pop.
temp = abb_pop[cond].copy()
temp

Unnamed: 0,state,state/region,ages,year,population
2448,,PR,under18,1990,
2449,,PR,total,1990,
2450,,PR,total,1991,
2451,,PR,under18,1991,
2452,,PR,total,1993,
...,...,...,...,...,...
2539,,USA,total,2010,309326295.0
2540,,USA,under18,2011,73902222.0
2541,,USA,total,2011,311582564.0
2542,,USA,under18,2012,73708179.0


根据数据是否缺失来筛选显示：只有条件为True（即存在缺失数据）的行才会显示。

找到有哪些state/region使得state的值为NaN，使用unique()查看非重复值

In [122]:
# Which state/region codes occur among the rows with missing data?
temp['state/region'].unique()

array(['PR', 'USA'], dtype=object)

为找到的这些state/region的state项补上正确的值，从而去除掉state这一列的所有NaN！

记住这样清除缺失数据NaN的方法！

In [135]:
# Basic approach to missing data: 1. drop it when you can, otherwise fill it. 2. fill with sensible values.
# First locate all rows whose state/region is 'PR', so their state can be filled with Puerto Rico's full name.
cond = temp['state/region'].eq('PR')
cond

2448     True
2449     True
2450     True
2451     True
2452     True
        ...  
2539    False
2540    False
2541    False
2542    False
2543    False
Name: state/region, Length: 96, dtype: bool

In [128]:
# Chained indexing (anti-pattern, kept on purpose as a demonstration):
# the value is set on a copy produced by temp.loc[cond], so pandas emits SettingWithCopyWarning
# and temp itself is left unchanged.
temp.loc[cond]['state'] = 'Puerto Rico'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [129]:
# With chained indexing the assignment did not take effect.
# Chained indexing is an inherent pitfall that can produce unpredictable results.
temp.loc[cond]

Unnamed: 0,state,state/region,ages,year,population
2448,,PR,under18,1990,
2449,,PR,total,1990,
2450,,PR,total,1991,
2451,,PR,under18,1991,
2452,,PR,total,1993,
2453,,PR,under18,1993,
2454,,PR,under18,1992,
2455,,PR,total,1992,
2456,,PR,under18,1994,
2457,,PR,total,1994,


In [136]:
# Never use chained indexing; use the officially recommended form instead:
# a single .loc call with the row selector first and the column second.
temp.loc[cond, 'state'] = 'Puerto Rico'

In [137]:
# The assignment succeeded this time.
temp.loc[cond]

Unnamed: 0,state,state/region,ages,year,population
2448,Puerto Rico,PR,under18,1990,
2449,Puerto Rico,PR,total,1990,
2450,Puerto Rico,PR,total,1991,
2451,Puerto Rico,PR,under18,1991,
2452,Puerto Rico,PR,total,1993,
2453,Puerto Rico,PR,under18,1993,
2454,Puerto Rico,PR,under18,1992,
2455,Puerto Rico,PR,total,1992,
2456,Puerto Rico,PR,under18,1994,
2457,Puerto Rico,PR,total,1994,


In [138]:
# Now select the rows for the nationwide 'USA' code, whose state name is still missing.
cond = temp['state/region'].eq('USA')
temp[cond]

Unnamed: 0,state,state/region,ages,year,population
2496,,USA,under18,1990,64218512.0
2497,,USA,total,1990,249622814.0
2498,,USA,total,1991,252980942.0
2499,,USA,under18,1991,65313018.0
2500,,USA,under18,1992,66509177.0
2501,,USA,total,1992,256514231.0
2502,,USA,total,1993,259918595.0
2503,,USA,under18,1993,67594938.0
2504,,USA,under18,1994,68640936.0
2505,,USA,total,1994,263125826.0


In [139]:
# Fill in the state name for the 'USA' rows, again through a single .loc call.
# NOTE(review): 'United State of America' is missing an "s" ("United States"); a later cell
# filters on this exact literal, so both places have to be corrected together.
temp.loc[cond, 'state'] = 'United State of America'
temp.loc[cond]

Unnamed: 0,state,state/region,ages,year,population
2496,United State of America,USA,under18,1990,64218512.0
2497,United State of America,USA,total,1990,249622814.0
2498,United State of America,USA,total,1991,252980942.0
2499,United State of America,USA,under18,1991,65313018.0
2500,United State of America,USA,under18,1992,66509177.0
2501,United State of America,USA,total,1992,256514231.0
2502,United State of America,USA,total,1993,259918595.0
2503,United State of America,USA,under18,1993,67594938.0
2504,United State of America,USA,under18,1994,68640936.0
2505,United State of America,USA,total,1994,263125826.0


In [144]:
# Recompute the mask of rows with missing data on abb_pop itself (cond was reused for temp above),
# then write the repaired rows from temp back into those positions.
cond = abb_pop.isnull().any(axis=1)
abb_pop.loc[cond] = temp

In [145]:
# Check the write-back: the previously incomplete rows now carry a state name.
abb_pop.loc[cond]

Unnamed: 0,state,state/region,ages,year,population
2448,Puerto Rico,PR,under18,1990,
2449,Puerto Rico,PR,total,1990,
2450,Puerto Rico,PR,total,1991,
2451,Puerto Rico,PR,under18,1991,
2452,Puerto Rico,PR,total,1993,
...,...,...,...,...,...
2539,United State of America,USA,total,2010,309326295.0
2540,United State of America,USA,under18,2011,73902222.0
2541,United State of America,USA,total,2011,311582564.0
2542,United State of America,USA,under18,2012,73708179.0


合并各州面积数据areas，使用外合并。

思考一下为什么使用外合并？



In [146]:
# Left-hand table of the upcoming merge.
abb_pop.head()

Unnamed: 0,state,state/region,ages,year,population
0,Alabama,AL,under18,2012,1117489.0
1,Alabama,AL,total,2012,4817528.0
2,Alabama,AL,under18,2010,1130966.0
3,Alabama,AL,total,2010,4785570.0
4,Alabama,AL,under18,2011,1125763.0


In [140]:
# Right-hand table of the upcoming merge; 'state' is the column it shares with abb_pop.
areas.head()

Unnamed: 0,state,area (sq. mi)
0,Alabama,52423
1,Alaska,656425
2,Arizona,114006
3,Arkansas,53182
4,California,163707


In [147]:
# Outer-join the area table onto the population data.
# 'state' is the only column the two frames share, so it is named explicitly as the join key.
abb_pop_area = pd.merge(abb_pop, areas, on='state', how='outer')
abb_pop_area.head()

Unnamed: 0,state,state/region,ages,year,population,area (sq. mi)
0,Alabama,AL,under18,2012,1117489.0,52423.0
1,Alabama,AL,total,2012,4817528.0,52423.0
2,Alabama,AL,under18,2010,1130966.0,52423.0
3,Alabama,AL,total,2010,4785570.0,52423.0
4,Alabama,AL,under18,2011,1125763.0,52423.0


继续寻找存在缺失数据的列

In [149]:
# Per column: does the merged frame still contain any missing value?
abb_pop_area.isnull().any()

state            False
state/region     False
ages             False
year             False
population        True
area (sq. mi)     True
dtype: bool

我们会发现population和area (sq. mi)这两列有缺失数据，为了找出是哪些行，我们需要找出是哪些state缺少数据

In [150]:
# Mark the rows of the merged frame that have a missing value in any column, then show them.
cond = abb_pop_area.isna().any(axis='columns')
abb_pop_area[cond]

Unnamed: 0,state,state/region,ages,year,population,area (sq. mi)
2448,Puerto Rico,PR,under18,1990,,3515.0
2449,Puerto Rico,PR,total,1990,,3515.0
2450,Puerto Rico,PR,total,1991,,3515.0
2451,Puerto Rico,PR,under18,1991,,3515.0
2452,Puerto Rico,PR,total,1993,,3515.0
...,...,...,...,...,...,...
2539,United State of America,USA,total,2010,309326295.0,
2540,United State of America,USA,under18,2011,73902222.0,
2541,United State of America,USA,total,2011,311582564.0,
2542,United State of America,USA,under18,2012,73708179.0,


In [152]:
# Which states do the incomplete rows belong to?
abb_pop_area.loc[cond, 'state'].unique()

array(['Puerto Rico', 'United State of America'], dtype=object)

In [154]:
# Independent copy of the incomplete rows, for inspection.
temp = abb_pop_area[cond].copy()
temp

Unnamed: 0,state,state/region,ages,year,population,area (sq. mi)
2448,Puerto Rico,PR,under18,1990,,3515.0
2449,Puerto Rico,PR,total,1990,,3515.0
2450,Puerto Rico,PR,total,1991,,3515.0
2451,Puerto Rico,PR,under18,1991,,3515.0
2452,Puerto Rico,PR,total,1993,,3515.0
...,...,...,...,...,...,...
2539,United State of America,USA,total,2010,309326295.0,
2540,United State of America,USA,under18,2011,73902222.0,
2541,United State of America,USA,total,2011,311582564.0,
2542,United State of America,USA,under18,2012,73708179.0,


In [156]:
# Puerto Rico's population figures for 1990 to 1999 are missing, but the analysis targets 2010,
# so these incomplete rows can be dropped.
temp.loc[temp['state'] == 'Puerto Rico']

Unnamed: 0,state,state/region,ages,year,population,area (sq. mi)
2448,Puerto Rico,PR,under18,1990,,3515.0
2449,Puerto Rico,PR,total,1990,,3515.0
2450,Puerto Rico,PR,total,1991,,3515.0
2451,Puerto Rico,PR,under18,1991,,3515.0
2452,Puerto Rico,PR,total,1993,,3515.0
2453,Puerto Rico,PR,under18,1993,,3515.0
2454,Puerto Rico,PR,under18,1992,,3515.0
2455,Puerto Rico,PR,total,1992,,3515.0
2456,Puerto Rico,PR,under18,1994,,3515.0
2457,Puerto Rico,PR,total,1994,,3515.0


In [157]:
# The nationwide rows have no area value, but the analysis is about per-state population density,
# so these rows can be dropped as well.
temp.loc[temp['state'] == 'United State of America']

Unnamed: 0,state,state/region,ages,year,population,area (sq. mi)
2496,United State of America,USA,under18,1990,64218512.0,
2497,United State of America,USA,total,1990,249622814.0,
2498,United State of America,USA,total,1991,252980942.0,
2499,United State of America,USA,under18,1991,65313018.0,
2500,United State of America,USA,under18,1992,66509177.0,
2501,United State of America,USA,total,1992,256514231.0,
2502,United State of America,USA,total,1993,259918595.0,
2503,United State of America,USA,under18,1993,67594938.0,
2504,United State of America,USA,under18,1994,68640936.0,
2505,United State of America,USA,total,1994,263125826.0,


去除含有缺失数据的行

In [160]:
# Discard every row that still contains a missing value.
abb_pop_area = abb_pop_area.dropna(axis='index')

查看数据是否缺失

In [161]:
# Verify the clean-up: no column should report missing values any more.
abb_pop_area.isnull().any()

state            False
state/region     False
ages             False
year             False
population       False
area (sq. mi)    False
dtype: bool

In [162]:
# Preview the cleaned frame.
abb_pop_area.head()

Unnamed: 0,state,state/region,ages,year,population,area (sq. mi)
0,Alabama,AL,under18,2012,1117489.0,52423.0
1,Alabama,AL,total,2012,4817528.0,52423.0
2,Alabama,AL,under18,2010,1130966.0,52423.0
3,Alabama,AL,total,2010,4785570.0,52423.0
4,Alabama,AL,under18,2011,1125763.0,52423.0


In [164]:
# Column dtypes; 'year' is int64, so the query in the next step compares it with the number 2010.
abb_pop_area.dtypes

state             object
state/region      object
ages              object
year               int64
population       float64
area (sq. mi)    float64
dtype: object

找出2010年的全民人口数据,df.query(查询语句)

In [166]:
# Keep only the rows for the whole population ("total" age group) in the year 2010.
result = abb_pop_area.query('ages == "total" & year==2010')

对查询结果进行处理，以state列作为新的行索引:set_index

In [167]:
# Preview the query result; the rows keep their index labels from abb_pop_area.
result.head()

Unnamed: 0,state,state/region,ages,year,population,area (sq. mi)
3,Alabama,AL,total,2010,4785570.0,52423.0
91,Alaska,AK,total,2010,713868.0,656425.0
101,Arizona,AZ,total,2010,6408790.0,114006.0
189,Arkansas,AR,total,2010,2922280.0,53182.0
197,California,CA,total,2010,37333601.0,163707.0


In [168]:
# Use the state name as the row index, so later per-state results are labelled by state.
result = result.set_index('state')
result.head()

Unnamed: 0_level_0,state/region,ages,year,population,area (sq. mi)
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama,AL,total,2010,4785570.0,52423.0
Alaska,AK,total,2010,713868.0,656425.0
Arizona,AZ,total,2010,6408790.0,114006.0
Arkansas,AR,total,2010,2922280.0,53182.0
California,CA,total,2010,37333601.0,163707.0


计算人口密度。注意是Series/Series，其结果还是一个Series。

In [170]:
# Population density = population / area (people per square mile), computed element-wise per state.
density = result['population'].div(result['area (sq. mi)'])

排序，并找出人口密度最高的五个州sort_values()

In [173]:
# The default sort order is ascending (smallest density first).
density = density.sort_values()

In [None]:
# Highest population density: District of Columbia, about 8898 people per square mile.

In [174]:
# The series is sorted ascending, so the last five entries are the five densest states.
density.tail()

state
Connecticut              645.600649
Rhode Island             681.339159
New Jersey              1009.253268
Puerto Rico             1058.665149
District of Columbia    8898.897059
dtype: float64

找出人口密度最低的五个州

In [175]:
# Lowest population density: Alaska, about 1.09 people per square mile.
# The series is sorted ascending, so the first five entries are the five least dense states.
density.head()

state
Alaska           1.087509
Wyoming          5.768079
Montana          6.736171
North Dakota     9.537565
South Dakota    10.583512
dtype: float64