In [1]:
import numpy as np
import pandas as pd

In [2]:
rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(0, 10, 4))
ser

0    6
1    3
2    7
3    4
dtype: int64

In [4]:
np.exp(ser)

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [5]:
df = pd.DataFrame(rng.randint(0, 10, (3, 4)), columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
0,6,9,2,6
1,7,4,3,7
2,7,2,5,4


In [6]:
np.exp(df)

Unnamed: 0,A,B,C,D
0,403.428793,8103.083928,7.389056,403.428793
1,1096.633158,54.59815,20.085537,1096.633158
2,1096.633158,7.389056,148.413159,54.59815


# 索引对齐

In [7]:
area = pd.Series({'Alaska': 1723337, 'Texas': 695662,
                  'California': 423967}, name='area')
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
                 'New York': 19651127}, name='population')

In [9]:
pop / area

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [21]:
type(area.index)

pandas.core.indexes.base.Index

In [26]:
a = pd.Series([2, 4, 6], index=[5, 2, 4])
b = pd.Series([1, 3, 5], index=[1, 2, 0])
a.index.intersection(b.index)

Index([2], dtype='int64')

In [28]:
a + b

0    NaN
1    NaN
2    7.0
4    NaN
5    NaN
dtype: float64

In [29]:
a.add(b, fill_value=0)

0    5.0
1    1.0
2    7.0
4    6.0
5    2.0
dtype: float64

In [25]:
ind1 = pd.Index([2, 4, 6])
ind2 = pd.Index([1, 2, 3, 5, 8])
# ind1 & ind2
ind1.union(ind2)

Index([1, 2, 3, 4, 5, 6, 8], dtype='int64')

In [27]:
pop.index.intersection(area.index)

Index(['California', 'Texas'], dtype='object')

In [35]:
A = pd.DataFrame(rng.randint(0, 20, (2, 2)), columns=list('AB'))
B = pd.DataFrame(rng.randint(0, 10, (3, 3)), columns=list('BAC'))
A

Unnamed: 0,A,B
0,5,9
1,3,17


In [30]:
A + B

Unnamed: 0,A,B,C
0,1.0,15.0,
1,13.0,6.0,
2,,,


In [31]:
A.add(B, fill_value=0)

Unnamed: 0,A,B,C
0,1.0,15.0,9.0
1,13.0,6.0,0.0
2,2.0,9.0,6.0


In [39]:
A.stack()

0  A     5
   B     9
1  A     3
   B    17
dtype: int64

In [40]:
A.stack().mean(), A.mean()

(8.5,
 A     4.0
 B    13.0
 dtype: float64)

In [41]:
A.add(B, fill_value=A.stack().mean())


Unnamed: 0,A,B,C
0,6.0,18.0,17.5
1,10.0,20.0,14.5
2,15.5,16.5,12.5


In [42]:
A = pd.DataFrame(rng.randint(0, 10, (3, 3)))
A

Unnamed: 0,0,1,2
0,1,4,7
1,9,8,8
2,0,8,6


In [43]:
A - A[0]

Unnamed: 0,0,1,2
0,0,-5,7
1,8,-1,8
2,-1,-1,6


In [47]:
A = pd.DataFrame(rng.randint(0, 10, (3, 3)), columns=list('ACD'))
A - A.iloc[0]

Unnamed: 0,A,C,D
0,0,0,0
1,-3,2,2
2,-5,2,9


缺失值

In [54]:
series = np.array([1, None, 3.5, 7])
series

array([1, None, 3.5, 7], dtype=object)

In [55]:
series.sum()

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [56]:
series = np.array([1, np.nan, 3.5, 7])
series

array([1. , nan, 3.5, 7. ])

In [59]:
series.sum(), np.nansum(series)


(nan, 11.5)

处理缺失值

In [60]:
data = pd.Series([1, np.nan, 'hello', None])

In [61]:
data.isnull

<bound method Series.isnull of 0        1
1      NaN
2    hello
3     None
dtype: object>

In [62]:
data[data.notnull()]

0        1
2    hello
dtype: object

In [65]:
df = pd.DataFrame([[1, np.nan, 2],
                   [2, 3, [np.nan, 4,
                           5], 6]])
df

Unnamed: 0,0,1,2,3
0,1,,2,
1,2,3.0,"[nan, 4, 5]",6.0


In [64]:
df.dropna()

Unnamed: 0,0,1,2,3
1,2,3.0,"[nan, 4, 5]",6.0


In [66]:
df.dropna(axis=1)

Unnamed: 0,0,2
0,1,2
1,2,"[nan, 4, 5]"


In [69]:
df.dropna(axis='columns')

Unnamed: 0,0,2
0,1,2
1,2,"[nan, 4, 5]"


In [74]:
df.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2,3
0,1,,2,
1,2,3.0,"[nan, 4, 5]",6.0


In [75]:
df.dropna(axis='columns', how='any')

Unnamed: 0,0,2
0,1,2
1,2,"[nan, 4, 5]"


In [82]:
df.dropna(thresh=3)

Unnamed: 0,0,1,2,3
1,2,3.0,"[nan, 4, 5]",6.0


填充缺失值

In [94]:
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [95]:
data.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [96]:
data.fillna(method='ffill')

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [97]:
data.fillna(method='bfill')

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

　多级索引Series

In [199]:
index = [('California', 2000), ('California', 2010), ('New York', 2000), ('New York', 2010), ('Texas', 2000),
         ('Texas', 2010)]

populations = [33871648, 37253956, 18976457, 19378102, 20851820, 25145561]
pop = pd.Series(populations, index=index, name='population')
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
Name: population, dtype: int64

In [200]:
pop['California', 2010]

37253956

In [201]:
pop[('California', 2010):('New York', 2010)]

(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
Name: population, dtype: int64

In [202]:
pop[[i for i in pop.index if i[1] == 2010]]

(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
Name: population, dtype: int64

In [203]:
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

In [204]:
pop = pop.reindex(index)
pop.index.names = ['state', 'year']
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
Name: population, dtype: int64

In [191]:
pop[:, 2010]

KeyError: 2010

In [110]:
pop.unstack()

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [111]:
pop_df = pd.DataFrame({'total': pop, 'under18': [9267089, 9284094, 4687374, 4318033, 5906301, 6879014]})
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2000,33871648,9267089
California,2010,37253956,9284094
New York,2000,18976457,4687374
New York,2010,19378102,4318033
Texas,2000,20851820,5906301
Texas,2010,25145561,6879014


In [113]:
f_u18 = pop_df['under18'] / pop_df['total']
f_u18

California  2000    0.273594
            2010    0.249211
New York    2000    0.247010
            2010    0.222831
Texas       2000    0.283251
            2010    0.273568
dtype: float64

In [114]:
f_u18.unstack()

Unnamed: 0,2000,2010
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273568


In [115]:
f_u18.unstack().unstack()

2000  California    0.273594
      New York      0.247010
      Texas         0.283251
2010  California    0.249211
      New York      0.222831
      Texas         0.273568
dtype: float64

In [116]:
data = {('California', 2000): 33871648, ('California', 2010): 37253956, ('Texas', 2000): 20851820,
        ('Texas', 2010): 25145561, ('New York', 2000): 18976457, ('New York', 2010): 19378102}
pd.Series(data)

California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
New York    2000    18976457
            2010    19378102
dtype: int64

In [118]:
df = pd.DataFrame(np.random.rand(4, 2),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=['data1', 'data2'])

df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.031018,0.259159
a,2,0.6069,0.82633
b,1,0.974579,0.378657
b,2,0.730747,0.365442


In [119]:
pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [120]:
pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [128]:
pd.MultiIndex.from_product([['a', 'b'], [1, 2]], names=['first', 'second'])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           names=['first', 'second'])

In [126]:
Index = pd.MultiIndex(levels=[['a', 'b'], [1, 2]],
                      labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

TypeError: MultiIndex.__new__() got an unexpected keyword argument 'labels'

In [127]:
pd.__version__

'2.0.3'

In [129]:
# 多级行列索引 
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]], names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']], names=['subject', 'type'])

In [134]:
# 模拟数据 
data = np.round(np.random.randn(4, 6), 2)
data[:, ::2] *= 10
data += 37
data

array([[38.  , 36.78, 35.3 , 38.01, 26.3 , 38.75],
       [49.3 , 38.25, 31.1 , 37.21, 37.6 , 35.97],
       [40.3 , 37.13, 43.1 , 36.33, 39.3 , 36.72],
       [30.  , 36.44, 34.2 , 36.49, 38.8 , 36.68]])

In [135]:
# 创建DataFrame 
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,38.0,36.78,35.3,38.01,26.3,38.75
2013,2,49.3,38.25,31.1,37.21,37.6,35.97
2014,1,40.3,37.13,43.1,36.33,39.3,36.72
2014,2,30.0,36.44,34.2,36.49,38.8,36.68


In [136]:
health_data['Guido']

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,35.3,38.01
2013,2,31.1,37.21
2014,1,43.1,36.33
2014,2,34.2,36.49


In [140]:
health_data.iloc[1]['Bob']

type
HR      49.30
Temp    38.25
Name: (2013, 2), dtype: float64

In [141]:
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [144]:
pop[['California', 'New York']]

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
dtype: int64

In [147]:
pop[pop > 20000000]

California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
dtype: int64

In [148]:
pop['California']

2000    33871648
2010    37253956
dtype: int64

In [155]:
pop.iloc[:3]

California  2000    33871648
            2010    37253956
New York    2000    18976457
dtype: int64

In [156]:
health_data.iloc[:2, :2]

Unnamed: 0_level_0,subject,Bob,Bob
Unnamed: 0_level_1,type,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,38.0,36.78
2013,2,49.3,38.25


In [159]:
health_data.loc[:, ('Bob', 'HR')]

year  visit
2013  1        38.0
      2        49.3
2014  1        40.3
      2        30.0
Name: (Bob, HR), dtype: float64

In [166]:
health_data.loc[(:, 1), (:, 'HR')]

SyntaxError: invalid syntax (3061056539.py, line 1)

In [169]:
idx = pd.IndexSlice
health_data.loc[idx[:, 1], idx[:, 'HR']]

Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,38.0,35.3,26.3
2014,1,40.3,43.1,39.3


In [170]:
health_data.loc[idx[:, 1], ('Bob', 'HR')]

year  visit
2013  1        38.0
2014  1        40.3
Name: (Bob, HR), dtype: float64

如果 MultiIndex 不是有序的索引 ， 那么大多数切片操作都会失败

In [184]:
index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]])
data = pd.Series(np.random.rand(6), index=index, name='data')
data.index.names = ['char', 'int']
data

char  int
a     1      0.023693
      2      0.983483
c     1      0.882176
      2      0.102431
b     1      0.905059
      2      0.982095
Name: data, dtype: float64

In [177]:
data['a':'b']

UnsortedIndexError: 'Key length (1) was greater than MultiIndex lexsort depth (0)'

In [178]:
data = data.sort_index()

In [179]:
data['a':'b']

char  int
a     1      0.091739
      2      0.376578
b     1      0.455924
      2      0.714985
Name: data, dtype: float64

In [205]:
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
Name: population, dtype: int64

In [207]:
pop.unstack(level=0)

state,California,New York,Texas
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,33871648,18976457,20851820
2010,37253956,19378102,25145561


In [208]:
pop.unstack(level=1)

year,2000,2010
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [210]:
pop_flat = pop.reset_index(name='population')
pop_flat

Unnamed: 0,state,year,population
0,California,2000,33871648
1,California,2010,37253956
2,New York,2000,18976457
3,New York,2010,19378102
4,Texas,2000,20851820
5,Texas,2010,25145561


In [211]:
pop_flat.set_index(['state', 'year'])

Unnamed: 0_level_0,Unnamed: 1_level_0,population
state,year,Unnamed: 2_level_1
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


多级索引的数据累计方法

In [212]:
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,38.0,36.78,35.3,38.01,26.3,38.75
2013,2,49.3,38.25,31.1,37.21,37.6,35.97
2014,1,40.3,37.13,43.1,36.33,39.3,36.72
2014,2,30.0,36.44,34.2,36.49,38.8,36.68


In [218]:
data_mean = health_data.mean(axis='year')

ValueError: No axis named year for object type DataFrame

In [220]:
health_data.groupby('year').mean()

subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,43.65,37.515,33.2,37.61,31.95,37.36
2014,35.15,36.785,38.65,36.41,39.05,36.7


In [221]:
health_data.groupby('visit').mean()

subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
visit,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,39.15,36.955,39.2,37.17,32.8,37.735
2,39.65,37.345,32.65,36.85,38.2,36.325


In [226]:
health_data.groupby(level=0, axis=1).mean()

Unnamed: 0_level_0,subject,Bob,Guido,Sue
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013,1,37.39,36.655,32.525
2013,2,43.775,34.155,36.785
2014,1,38.715,39.715,38.01
2014,2,33.22,35.345,37.74


In [231]:
[[str(c) + str(i) for c in 'ABC'] for i in range(3)]

[['A0', 'B0', 'C0'], ['A1', 'B1', 'C1'], ['A2', 'B2', 'C2']]

In [245]:
def make_df(cols, ind):
    data = {c: [str(c) + str(i) for i in ind] for c in cols}
    return pd.DataFrame(data, ind)

In [247]:
# DataFrame示例 
make_df('ABC', range(3))

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


In [252]:
ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])

ser2 = pd.Series(['D', 'E', 'F'], index=[4, 5, 6])

pd.concat([ser1, ser2])

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [250]:
pd.concat([ser1, ser2])

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [253]:
df1 = make_df('AB', [1, 2])
df2 = make_df('AB', [3, 4])
print(df1, df2)

    A   B
1  A1  B1
2  A2  B2     A   B
3  A3  B3
4  A4  B4


In [254]:
pd.concat([df1, df2], axis=1)

Unnamed: 0,A,B,A.1,B.1
1,A1,B1,,
2,A2,B2,,
3,,,A3,B3
4,,,A4,B4


In [255]:
pd.concat([df1, df2])

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4


In [259]:
df3 = make_df('AB', [0, 1])
df4 = make_df('CD', [0, 1])

In [260]:
pd.concat([df3, df4], axis=1)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1


In [262]:
x = make_df('AB', [0, 1])
y = make_df('AB', [2, 3])
y.index = x.index  # 复制索引 print(x); 
pd.concat([x, y])

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
0,A2,B2
1,A3,B3


In [263]:
pd.concat([x, y], verify_integrity=True)

ValueError: Indexes have overlapping values: Index([0, 1], dtype='int64')

In [264]:
pd.concat([x, y], ignore_index=True)

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3


In [265]:
pd.concat([x, y], keys=['A', 'B'])

Unnamed: 0,Unnamed: 1,A,B
A,0,A0,B0
A,1,A1,B1
B,0,A2,B2
B,1,A3,B3


In [266]:
df5 = make_df('ABC', [1, 2])

df6 = make_df('BCD', [3, 4])
pd.concat([df5, df6])

Unnamed: 0,A,B,C,D
1,A1,B1,C1,
2,A2,B2,C2,
3,,B3,C3,D3
4,,B4,C4,D4


In [268]:
pd.concat([df5, df6], join='inner')

Unnamed: 0,B,C
1,B1,C1
2,B2,C2
3,B3,C3
4,B4,C4


In [284]:
pd.merge(df5, df6, how='left')

Unnamed: 0,A,B,C,D
0,A1,B1,C1,
1,A2,B2,C2,


In [286]:
df1.append(df2)

AttributeError: 'DataFrame' object has no attribute 'append'

# merge

In [288]:
df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})
df2 = pd.DataFrame({'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
                    'hire_date': [2004, 2008, 2012, 2014]})
print(df1);
print(df2)

  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR
  employee  hire_date
0     Lisa       2004
1      Bob       2008
2     Jake       2012
3      Sue       2014


In [293]:
df3 = pd.merge(df1, df2)

In [294]:
pd.merge(df1, df2)

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


In [295]:
df4 = pd.DataFrame({'group': ['Accounting', 'Engineering', 'HR'],
                    'supervisor': ['Carly', 'Guido', 'Steve']})
print(df3);
print(df4)

  employee        group  hire_date
0      Bob   Accounting       2008
1     Jake  Engineering       2012
2     Lisa  Engineering       2004
3      Sue           HR       2014
         group supervisor
0   Accounting      Carly
1  Engineering      Guido
2           HR      Steve


In [296]:
pd.merge(df3, df4)

Unnamed: 0,employee,group,hire_date,supervisor
0,Bob,Accounting,2008,Carly
1,Jake,Engineering,2012,Guido
2,Lisa,Engineering,2004,Guido
3,Sue,HR,2014,Steve


In [297]:
df5 = pd.DataFrame({'group': ['Accounting', 'Accounting', 'Engineering', 'Engineering', 'HR', 'HR'],
                    'skills': ['math', 'spreadsheets', 'coding', 'linux', 'spreadsheets', 'organization']})
print(df1);
print(df5);
print(pd.merge(df1, df5))

  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR
         group        skills
0   Accounting          math
1   Accounting  spreadsheets
2  Engineering        coding
3  Engineering         linux
4           HR  spreadsheets
5           HR  organization
  employee        group        skills
0      Bob   Accounting          math
1      Bob   Accounting  spreadsheets
2     Jake  Engineering        coding
3     Jake  Engineering         linux
4     Lisa  Engineering        coding
5     Lisa  Engineering         linux
6      Sue           HR  spreadsheets
7      Sue           HR  organization


In [298]:
pd.merge(df1, df2, on='employee')

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


In [299]:
df3 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'salary': [70000, 80000, 120000, 90000]})

In [300]:
pd.merge(df1, df3, on='employee')

KeyError: 'employee'

In [301]:
pd.merge(df1, df3, left_on='employee', right_on='name')

Unnamed: 0,employee,group,name,salary
0,Bob,Accounting,Bob,70000
1,Jake,Engineering,Jake,80000
2,Lisa,Engineering,Lisa,120000
3,Sue,HR,Sue,90000


In [303]:
pd.merge(df1, df3, left_on='employee', right_on='name').drop('name', axis=1)

Unnamed: 0,employee,group,salary
0,Bob,Accounting,70000
1,Jake,Engineering,80000
2,Lisa,Engineering,120000
3,Sue,HR,90000


In [304]:
df1a = df1.set_index('employee')

df2a = df2.set_index('employee')

print(df1a);
print(df2a)

                group
employee             
Bob        Accounting
Jake      Engineering
Lisa      Engineering
Sue                HR
          hire_date
employee           
Lisa           2004
Bob            2008
Jake           2012
Sue            2014


In [305]:
pd.merge(df1a, df2a, left_index=True, right_index=True)

Unnamed: 0_level_0,group,hire_date
employee,Unnamed: 1_level_1,Unnamed: 2_level_1
Bob,Accounting,2008
Jake,Engineering,2012
Lisa,Engineering,2004
Sue,HR,2014


In [306]:
pd.merge(df1a, df2a)

MergeError: No common columns to perform merge on. Merge options: left_on=None, right_on=None, left_index=False, right_index=False

In [307]:
pd.merge(df1a, df2a, left_index=True)

MergeError: Must pass right_on or right_index=True

In [308]:
pd.merge(df1a, df2, left_index=True, right_on='employee')


Unnamed: 0,group,employee,hire_date
1,Accounting,Bob,2008
2,Engineering,Jake,2012
0,Engineering,Lisa,2004
3,HR,Sue,2014


In [316]:
df6 = pd.DataFrame({'name': ['Peter', 'Paul', 'Mary'],

                    'food': ['fish', 'beans', 'bread']},

                   columns=['name', 'food'])

df7 = pd.DataFrame({'name': ['Mary', 'Joseph'],

                    'drink': ['wine', 'beer']},

                   columns=['name', 'drink'])

print(df6);
print(df7);
pd.merge(df6, df7)

    name   food
0  Peter   fish
1   Paul  beans
2   Mary  bread
     name drink
0    Mary  wine
1  Joseph  beer


Unnamed: 0,name,food,drink
0,Mary,bread,wine


In [317]:
pd.merge(df6, df7, how='outer')

Unnamed: 0,name,food,drink
0,Peter,fish,
1,Paul,beans,
2,Mary,bread,wine
3,Joseph,,beer


In [318]:
pd.merge(df6, df7, how='left')

Unnamed: 0,name,food,drink
0,Peter,fish,
1,Paul,beans,
2,Mary,bread,wine


In [319]:
pd.merge(df6, df7, how='right')

Unnamed: 0,name,food,drink
0,Mary,bread,wine
1,Joseph,,beer


In [320]:
df8 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],

                    'rank': [1, 2, 3, 4]})

df9 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],

                    'rank': [3, 1, 4, 2]})
print(df8);
print(df9);
print(pd.merge(df8, df9, on="name"))

   name  rank
0   Bob     1
1  Jake     2
2  Lisa     3
3   Sue     4
   name  rank
0   Bob     3
1  Jake     1
2  Lisa     4
3   Sue     2
   name  rank_x  rank_y
0   Bob       1       3
1  Jake       2       1
2  Lisa       3       4
3   Sue       4       2


In [321]:
pd.merge(df8, df9, on="name", suffixes=['_L', '_R'])

Unnamed: 0,name,rank_L,rank_R
0,Bob,1,3
1,Jake,2,1
2,Lisa,3,4
3,Sue,4,2


In [324]:
%pwd

'/Users/kearney/CODE/Hands-On-Data-Preprocessing-in-Python/Python Data Science Handbook'

In [327]:
# 请使用下面的shell下载数据 
!curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-population.csv
!curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-areas.csv 
!curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-abbrevs.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 57935  100 57935    0     0  81918      0 --:--:-- --:--:-- --:--:-- 81829
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   835  100   835    0     0   1624      0 --:--:-- --:--:-- --:--:--  1627
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   872  100   872    0     0   1586      0 --:--:-- --:--:-- --:--:--  1585


In [328]:
pop = pd.read_csv('state-population.csv')

areas = pd.read_csv('state-areas.csv')

abbrevs = pd.read_csv('state-abbrevs.csv')

In [329]:
pop.head()

Unnamed: 0,state/region,ages,year,population
0,AL,under18,2012,1117489.0
1,AL,total,2012,4817528.0
2,AL,under18,2010,1130966.0
3,AL,total,2010,4785570.0
4,AL,under18,2011,1125763.0


In [330]:
areas.head()

Unnamed: 0,state,area (sq. mi)
0,Alabama,52423
1,Alaska,656425
2,Arizona,114006
3,Arkansas,53182
4,California,163707


In [331]:
abbrevs.head()

Unnamed: 0,state,abbreviation
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


In [336]:
merged = pd.merge(pop, abbrevs, how='outer', left_on='state/region', right_on='abbreviation')
merged.head()

Unnamed: 0,state/region,ages,year,population,state,abbreviation
0,AL,under18,2012,1117489.0,Alabama,AL
1,AL,total,2012,4817528.0,Alabama,AL
2,AL,under18,2010,1130966.0,Alabama,AL
3,AL,total,2010,4785570.0,Alabama,AL
4,AL,under18,2011,1125763.0,Alabama,AL


In [337]:
merged = merged.drop('abbreviation', axis=1)
merged.head()

Unnamed: 0,state/region,ages,year,population,state
0,AL,under18,2012,1117489.0,Alabama
1,AL,total,2012,4817528.0,Alabama
2,AL,under18,2010,1130966.0,Alabama
3,AL,total,2010,4785570.0,Alabama
4,AL,under18,2011,1125763.0,Alabama


In [338]:
merged.isnull().any()

state/region    False
ages            False
year            False
population       True
state            True
dtype: bool

In [341]:
merged[merged['population'].isnull()].head()

Unnamed: 0,state/region,ages,year,population,state
2448,PR,under18,1990,,
2449,PR,total,1990,,
2450,PR,total,1991,,
2451,PR,under18,1991,,
2452,PR,total,1993,,


In [363]:
merged[merged['state'].isnull()]['state/region'].unique()

array(['PR', 'USA'], dtype=object)

In [365]:
merged.loc[merged['state/region']=='PR', 'state'] = 'Puerto Rico'
merged.loc[merged['state/region']=='USA', 'state'] = 'United States'
merged.isnull().any()

state/region    False
ages            False
year            False
population       True
state           False
dtype: bool

In [367]:
final = pd.merge(merged, areas, on='state', how='left')
final.head()

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
0,AL,under18,2012,1117489.0,Alabama,52423.0
1,AL,total,2012,4817528.0,Alabama,52423.0
2,AL,under18,2010,1130966.0,Alabama,52423.0
3,AL,total,2010,4785570.0,Alabama,52423.0
4,AL,under18,2011,1125763.0,Alabama,52423.0


In [368]:
final.isnull().any()

state/region     False
ages             False
year             False
population        True
state            False
area (sq. mi)     True
dtype: bool

In [370]:
final['state'][final['area (sq. mi)'].isnull()].unique()

array(['United States'], dtype=object)

In [371]:
final.dropna(inplace=True)
final.head()

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
0,AL,under18,2012,1117489.0,Alabama,52423.0
1,AL,total,2012,4817528.0,Alabama,52423.0
2,AL,under18,2010,1130966.0,Alabama,52423.0
3,AL,total,2010,4785570.0,Alabama,52423.0
4,AL,under18,2011,1125763.0,Alabama,52423.0


In [372]:
final.isnull().any()

state/region     False
ages             False
year             False
population       False
state            False
area (sq. mi)    False
dtype: bool

先选择 2000 年的各州人口以及总人 口数据。让我们用 query() 函数进行快速计算

In [373]:
data2010 = final.query("year == 2010 & ages == 'total'") 
data2010.head()

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
3,AL,total,2010,4785570.0,Alabama,52423.0
91,AK,total,2010,713868.0,Alaska,656425.0
101,AZ,total,2010,6408790.0,Arizona,114006.0
189,AR,total,2010,2922280.0,Arkansas,53182.0
197,CA,total,2010,37333601.0,California,163707.0


In [374]:
data2010.set_index('state', inplace=True)
density = data2010['population'] / data2010['area (sq. mi)']

In [375]:
density.sort_values(ascending=False, inplace=True)
density.head()

state
District of Columbia    8898.897059
Puerto Rico             1058.665149
New Jersey              1009.253268
Rhode Island             681.339159
Connecticut              645.600649
dtype: float64

In [376]:
density.tail()

state
South Dakota    10.583512
North Dakota     9.537565
Montana          6.736171
Wyoming          5.768079
Alaska           1.087509
dtype: float64