In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 创建对象

### 通过传入值列表来创建 Series, pandas 会自动创建默认的整数索引

In [2]:
# Build a Series from a plain Python list that contains one missing value;
# no index is passed, so pandas assigns the default integer index.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])

In [3]:
# Display the Series (index, values and dtype).
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

### 使用传递的numpy数组创建数据帧,并使用日期索引和标记列:

In [4]:
# Six consecutive calendar days starting at 2013-01-01, used below as a row index.
dates = pd.date_range(start='20130101', periods=6)

In [5]:
# Display the generated DatetimeIndex.
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [6]:
# 6x4 frame of standard-normal samples: one row per entry of `dates`,
# columns labelled A-D. The global NumPy RNG is not seeded here, so the
# concrete numbers differ on every fresh run.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)

In [7]:
# Display the full frame (six dated rows, columns A-D).
df

Unnamed: 0,A,B,C,D
2013-01-01,0.425798,2.441623,0.408869,0.20219
2013-01-02,-0.192784,-1.28585,-1.362157,0.838338
2013-01-03,0.270262,3.147888,2.126483,-0.466999
2013-01-04,0.215878,-0.369923,-0.237782,0.424608
2013-01-05,-1.554558,0.004914,0.33531,0.524749
2013-01-06,1.619542,0.269932,0.755817,-1.137117


### 使用传递的可转换序列的字典对象创建数据帧:

In [8]:
# Build a frame from a dict mapping column name -> column content.
# Scalar entries ('A', 'B', 'F') end up repeated on each of the four rows;
# 'C', 'D' and 'E' are given as length-4 containers with explicit dtypes.
df2 = pd.DataFrame(data={
    'A': 1.0,
    'B': pd.Timestamp('20130101'),
    'C': pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
    'D': np.array([3, 3, 3, 3], dtype='int32'),
    'E': pd.Categorical(['test', 'train'] * 2),
    'F': 'foo',
})

In [9]:
# Display the frame built from the dict.
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-01,1.0,3,test,foo
1,1.0,2013-01-01,1.0,3,train,foo
2,1.0,2013-01-01,1.0,3,test,foo
3,1.0,2013-01-01,1.0,3,train,foo


### 各列具有各自明确的数据类型 (dtypes):

In [10]:
# Per-column dtypes of df2.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

### 输出某个子集:

In [11]:
# Attribute-style access to column 'E'; the column comes back as a Series.
df2.E

0     test
1    train
2     test
3    train
Name: E, dtype: category
Categories (2, object): [test, train]

# 查看数据

### 查看帧顶部和底部的行:

In [12]:
# First rows of df (no count is passed; five rows are shown).
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,0.425798,2.441623,0.408869,0.20219
2013-01-02,-0.192784,-1.28585,-1.362157,0.838338
2013-01-03,0.270262,3.147888,2.126483,-0.466999
2013-01-04,0.215878,-0.369923,-0.237782,0.424608
2013-01-05,-1.554558,0.004914,0.33531,0.524749


In [13]:
# Last three rows of df.
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,0.215878,-0.369923,-0.237782,0.424608
2013-01-05,-1.554558,0.004914,0.33531,0.524749
2013-01-06,1.619542,0.269932,0.755817,-1.137117


### 显示索引, 列, 和底层的 numpy 数据:

In [14]:
# Row index of df (the DatetimeIndex built earlier).
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [15]:
# Column labels of df.
df.columns

Index([u'A', u'B', u'C', u'D'], dtype='object')

In [16]:
# Underlying data of df as a NumPy array (labels are not included).
df.values

array([[ 0.42579831,  2.4416229 ,  0.40886855,  0.20218995],
       [-0.19278402, -1.28585042, -1.3621569 ,  0.83833834],
       [ 0.27026165,  3.14788808,  2.1264831 , -0.46699882],
       [ 0.21587769, -0.36992343, -0.23778167,  0.42460784],
       [-1.55455815,  0.00491395,  0.33531015,  0.5247493 ],
       [ 1.61954158,  0.26993203,  0.75581745, -1.13711731]])

### describe() 显示数据的快速统计摘要:

In [17]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.13069,0.701431,0.337757,0.064295
std,1.027152,1.719409,1.148941,0.732894
min,-1.554558,-1.28585,-1.362157,-1.137117
25%,-0.090619,-0.276214,-0.094509,-0.299702
50%,0.24307,0.137423,0.372089,0.313399
75%,0.386914,1.8987,0.66908,0.499714
max,1.619542,3.147888,2.126483,0.838338


### 转置数据:

In [18]:
# Transpose: columns A-D become rows and the dates become column labels.
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,0.425798,-0.192784,0.270262,0.215878,-1.554558,1.619542
B,2.441623,-1.28585,3.147888,-0.369923,0.004914,0.269932
C,0.408869,-1.362157,2.126483,-0.237782,0.33531,0.755817
D,0.20219,0.838338,-0.466999,0.424608,0.524749,-1.137117


### 按轴排序:

In [19]:
# Sort along axis 1 (the column labels) in descending order: D, C, B, A.
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,0.20219,0.408869,2.441623,0.425798
2013-01-02,0.838338,-1.362157,-1.28585,-0.192784
2013-01-03,-0.466999,2.126483,3.147888,0.270262
2013-01-04,0.424608,-0.237782,-0.369923,0.215878
2013-01-05,0.524749,0.33531,0.004914,-1.554558
2013-01-06,-1.137117,0.755817,0.269932,1.619542


### 按值排序:

In [20]:
# Sort rows by the values in column 'B' (smallest first).
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-02,-0.192784,-1.28585,-1.362157,0.838338
2013-01-04,0.215878,-0.369923,-0.237782,0.424608
2013-01-05,-1.554558,0.004914,0.33531,0.524749
2013-01-06,1.619542,0.269932,0.755817,-1.137117
2013-01-01,0.425798,2.441623,0.408869,0.20219
2013-01-03,0.270262,3.147888,2.126483,-0.466999


In [22]:
# Sort rows by column 'C' first, with column 'B' as the secondary key.
df.sort_values(by=['C', 'B'])

Unnamed: 0,A,B,C,D
2013-01-02,-0.192784,-1.28585,-1.362157,0.838338
2013-01-04,0.215878,-0.369923,-0.237782,0.424608
2013-01-05,-1.554558,0.004914,0.33531,0.524749
2013-01-01,0.425798,2.441623,0.408869,0.20219
2013-01-06,1.619542,0.269932,0.755817,-1.137117
2013-01-03,0.270262,3.147888,2.126483,-0.466999


# 选择器

## 读取

### 选择单列, 这会产生一个序列, 等价于 df.A:

In [28]:
# Select column 'A' by label; the result is a Series indexed by date.
df['A']

2013-01-01    0.425798
2013-01-02   -0.192784
2013-01-03    0.270262
2013-01-04    0.215878
2013-01-05   -1.554558
2013-01-06    1.619542
Freq: D, Name: A, dtype: float64

### 使用 [] 对行进行切片:

In [29]:
# Positional row slice: rows 0, 1 and 2 (the end position is excluded).
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,0.425798,2.441623,0.408869,0.20219
2013-01-02,-0.192784,-1.28585,-1.362157,0.838338
2013-01-03,0.270262,3.147888,2.126483,-0.466999


In [30]:
# Row slice by date strings; both end labels are included in the result.
df['20130102': '20130104']

Unnamed: 0,A,B,C,D
2013-01-02,-0.192784,-1.28585,-1.362157,0.838338
2013-01-03,0.270262,3.147888,2.126483,-0.466999
2013-01-04,0.215878,-0.369923,-0.237782,0.424608


## 使用标签选择

### 使用标签获取横截面:

In [40]:
# Cross-section for the first date label, returned as a Series over columns.
df.loc[dates[0]]

A    0.425798
B    2.441623
C    0.408869
D    0.202190
Name: 2013-01-01 00:00:00, dtype: float64

### 使用标签选择多轴:

In [44]:
# All rows, with columns 'A' and 'B' selected by label.
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,0.425798,2.441623
2013-01-02,-0.192784,-1.28585
2013-01-03,0.270262,3.147888
2013-01-04,0.215878,-0.369923
2013-01-05,-1.554558,0.004914
2013-01-06,1.619542,0.269932


### 显示标签切片, 包含两个端点:

In [52]:
# Label slice on rows (both endpoints included), restricted to columns 'A' and 'B'.
df.loc['20130102': '20130104', ['A', 'B']]

Unnamed: 0,A,B
2013-01-02,-0.192784,-1.28585
2013-01-03,0.270262,3.147888
2013-01-04,0.215878,-0.369923


### 降低返回对象维度:

In [53]:
# A single row label plus a column list yields a Series (one dimension lower).
df.loc['20130102', ['A', 'B']]

A   -0.192784
B   -1.285850
Name: 2013-01-02 00:00:00, dtype: float64

### 获取标量值:

In [60]:
# Scalar lookup by row label and column label.
df.loc[dates[0], 'A']

0.42579830642996708

### 快速访问并获取标量数据(等价上面的方法):

In [61]:
# Scalar lookup by row label and column label through the .at accessor.
df.at[dates[0], 'A']

0.42579830642996708

## 按位置选择

### 传递整数选择位置

In [62]:
# Row at integer position 3 (the fourth row, 2013-01-04), returned as a Series.
df.iloc[3]

A    0.215878
B   -0.369923
C   -0.237782
D    0.424608
Name: 2013-01-04 00:00:00, dtype: float64

### 使用整数切片, 效果类似 numpy/python:

In [63]:
# Rows at positions 3-4 and columns at positions 0-1 (end positions excluded).
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,0.215878,-0.369923
2013-01-05,-1.554558,0.004914


### 使用整数偏移定位列表, 效果类似 numpy/python 样式:

In [64]:
# Rows at positions 1, 2 and 4; columns at positions 0 and 2 ('A' and 'C').
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2013-01-02,-0.192784,-1.362157
2013-01-03,0.270262,2.126483
2013-01-05,-1.554558,0.33531


### 显式行切片:

In [74]:
# Rows at positions 1-2, all columns.
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,-0.192784,-1.28585,-1.362157,0.838338
2013-01-03,0.270262,3.147888,2.126483,-0.466999


### 显式列切片:

In [75]:
# All rows, columns at positions 1-2 ('B' and 'C').
df.iloc[:, 1:3]

Unnamed: 0,B,C
2013-01-01,2.441623,0.408869
2013-01-02,-1.28585,-1.362157
2013-01-03,3.147888,2.126483
2013-01-04,-0.369923,-0.237782
2013-01-05,0.004914,0.33531
2013-01-06,0.269932,0.755817


### 显式获取一个值:

In [76]:
# Scalar at row position 1, column position 1.
df.iloc[1, 1]

-1.2858504228420575

### 快速访问一个标量(等同上个方法):

In [77]:
# Scalar at row position 1, column position 1 through the .iat accessor.
df.iat[1, 1]

-1.2858504228420575

# 布尔索引

### 使用单个列的值选择数据:

In [78]:
# Keep only the rows where column 'A' is positive.
df[df.A > 0]

Unnamed: 0,A,B,C,D
2013-01-01,0.425798,2.441623,0.408869,0.20219
2013-01-03,0.270262,3.147888,2.126483,-0.466999
2013-01-04,0.215878,-0.369923,-0.237782,0.424608
2013-01-06,1.619542,0.269932,0.755817,-1.137117


### where 操作:

In [80]:
# Element-wise mask: cells > 0 keep their value, all other cells come back missing.
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,0.425798,2.441623,0.408869,0.20219
2013-01-02,,,,0.838338
2013-01-03,0.270262,3.147888,2.126483,
2013-01-04,0.215878,,,0.424608
2013-01-05,,0.004914,0.33531,0.524749
2013-01-06,1.619542,0.269932,0.755817,


### 使用 isin() 筛选:

In [83]:
# Derive a new frame from df with an extra string column 'E'; df itself is
# left unchanged. Note: this rebinds the name df2 used for an earlier frame.
df2 = df.assign(E=['one', 'one', 'two', 'three', 'four', 'three'])
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,0.425798,2.441623,0.408869,0.20219,one
2013-01-02,-0.192784,-1.28585,-1.362157,0.838338,one
2013-01-03,0.270262,3.147888,2.126483,-0.466999,two
2013-01-04,0.215878,-0.369923,-0.237782,0.424608,three
2013-01-05,-1.554558,0.004914,0.33531,0.524749,four
2013-01-06,1.619542,0.269932,0.755817,-1.137117,three


In [86]:
# Keep the rows whose 'E' value is either 'two' or 'four'.
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,0.270262,3.147888,2.126483,-0.466999,two
2013-01-05,-1.554558,0.004914,0.33531,0.524749,four


# 赋值

### 赋值一个新列, 通过索引自动对齐数据:

In [91]:
# Integers 1..6 indexed by six consecutive days starting 2013-01-02, i.e.
# shifted one day relative to a 2013-01-01 start.
s1 = pd.Series(
    list(range(1, 7)),
    index=pd.date_range(start='20130102', periods=6),
)
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [92]:
# Add column 'F' from s1. Values are matched by index label: 2013-01-01 has
# no entry in s1 and is left missing, and s1's 2013-01-07 entry is not used.
# This mutates df in place, so earlier displays of df are now stale.
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.425798,2.441623,0.408869,0.20219,
2013-01-02,-0.192784,-1.28585,-1.362157,0.838338,1.0
2013-01-03,0.270262,3.147888,2.126483,-0.466999,2.0
2013-01-04,0.215878,-0.369923,-0.237782,0.424608,3.0
2013-01-05,-1.554558,0.004914,0.33531,0.524749,4.0
2013-01-06,1.619542,0.269932,0.755817,-1.137117,5.0


### 按标签赋值:

In [93]:
# Set a single cell by row label and column label (first date, column 'A').
df.at[dates[0], 'A'] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,2.441623,0.408869,0.20219,
2013-01-02,-0.192784,-1.28585,-1.362157,0.838338,1.0
2013-01-03,0.270262,3.147888,2.126483,-0.466999,2.0
2013-01-04,0.215878,-0.369923,-0.237782,0.424608,3.0
2013-01-05,-1.554558,0.004914,0.33531,0.524749,4.0
2013-01-06,1.619542,0.269932,0.755817,-1.137117,5.0


### 按位置赋值:

In [94]:
# Set a single cell by integer position (row 0, column 1, i.e. column 'B').
df.iat[0, 1] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.408869,0.20219,
2013-01-02,-0.192784,-1.28585,-1.362157,0.838338,1.0
2013-01-03,0.270262,3.147888,2.126483,-0.466999,2.0
2013-01-04,0.215878,-0.369923,-0.237782,0.424608,3.0
2013-01-05,-1.554558,0.004914,0.33531,0.524749,4.0
2013-01-06,1.619542,0.269932,0.755817,-1.137117,5.0


### 通过 numpy 数组分配赋值:

In [97]:
# Replace column 'D' with a NumPy integer array holding one 5 per row.
# Plain column assignment is used rather than `df.loc[:, 'D'] = ...`: since
# pandas 2.0 the `.loc[:, col]` form writes into the existing float64 column
# in place, which would display 5.0 instead of the integer 5 recorded in
# this cell's output. `df['D'] = ...` swaps in the new integer column on
# every pandas version.
df['D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.408869,5,
2013-01-02,-0.192784,-1.28585,-1.362157,5,1.0
2013-01-03,0.270262,3.147888,2.126483,5,2.0
2013-01-04,0.215878,-0.369923,-0.237782,5,3.0
2013-01-05,-1.554558,0.004914,0.33531,5,4.0
2013-01-06,1.619542,0.269932,0.755817,5,5.0


### where 操作赋值:

In [100]:
# Work on a copy of df, then overwrite every positive cell with its negation
# so that no positive values remain; missing cells stay missing.
# Note: this rebinds the name df2 used by earlier cells.
df2 = df.copy()
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.408869,-5,
2013-01-02,-0.192784,-1.28585,-1.362157,-5,-1.0
2013-01-03,-0.270262,-3.147888,-2.126483,-5,-2.0
2013-01-04,-0.215878,-0.369923,-0.237782,-5,-3.0
2013-01-05,-1.554558,-0.004914,-0.33531,-5,-4.0
2013-01-06,-1.619542,-0.269932,-0.755817,-5,-5.0
