In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
s = pd.Series([1,3,5,np.nan,6,8,10,12])

In [4]:
s

0     1.0
1     3.0
2     5.0
3     NaN
4     6.0
5     8.0
6    10.0
7    12.0
dtype: float64

In [5]:
dates = pd.date_range("20011230", periods=8)

In [6]:
dates

DatetimeIndex(['2001-12-30', '2001-12-31', '2002-01-01', '2002-01-02',
               '2002-01-03', '2002-01-04', '2002-01-05', '2002-01-06'],
              dtype='datetime64[ns]', freq='D')

In [7]:
df = pd.DataFrame(np.random.randn(len(dates), 4), index=dates, columns=list("ABCD"))

In [8]:
df

Unnamed: 0,A,B,C,D
2001-12-30,1.447905,-1.585072,-0.596625,-0.120175
2001-12-31,2.153064,-0.125871,1.267238,-1.302097
2002-01-01,-0.449006,0.319571,-0.092478,0.57812
2002-01-02,-1.292372,-0.206607,0.454786,0.72567
2002-01-03,-0.471602,-0.376749,0.442159,-0.7672
2002-01-04,0.058514,-1.566337,-0.704556,-0.002059
2002-01-05,0.204571,-0.853162,1.638049,0.901371
2002-01-06,-0.711878,0.923243,0.404657,-0.719991


In [9]:
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)

In [10]:
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [11]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## 以下是查看框架顶行和底行的方法：

### 选择单列，产生a.series

In [12]:
df.head()

Unnamed: 0,A,B,C,D
2001-12-30,1.447905,-1.585072,-0.596625,-0.120175
2001-12-31,2.153064,-0.125871,1.267238,-1.302097
2002-01-01,-0.449006,0.319571,-0.092478,0.57812
2002-01-02,-1.292372,-0.206607,0.454786,0.72567
2002-01-03,-0.471602,-0.376749,0.442159,-0.7672


In [13]:
df.tail(3)

Unnamed: 0,A,B,C,D
2002-01-04,0.058514,-1.566337,-0.704556,-0.002059
2002-01-05,0.204571,-0.853162,1.638049,0.901371
2002-01-06,-0.711878,0.923243,0.404657,-0.719991


### 显示索引、列：

In [14]:
df.index

DatetimeIndex(['2001-12-30', '2001-12-31', '2002-01-01', '2002-01-02',
               '2002-01-03', '2002-01-04', '2002-01-05', '2002-01-06'],
              dtype='datetime64[ns]', freq='D')

In [15]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

DataFrame.to_numpy() gives a NumPy representation of the underlying data. Note that this can be an expensive operation when your DataFrame has columns with different data types, which comes down to a fundamental difference between pandas and NumPy: NumPy arrays have one dtype for the entire array, while pandas DataFrames have one dtype per column. When you call DataFrame.to_numpy(), pandas will find the NumPy dtype that can hold all of the dtypes in the DataFrame. This may end up being object, which requires casting every value to a Python object.

In [16]:
df.to_numpy()

array([[ 1.44790503e+00, -1.58507184e+00, -5.96624826e-01,
        -1.20175125e-01],
       [ 2.15306365e+00, -1.25871026e-01,  1.26723751e+00,
        -1.30209713e+00],
       [-4.49006384e-01,  3.19571144e-01, -9.24780075e-02,
         5.78119515e-01],
       [-1.29237183e+00, -2.06606927e-01,  4.54785721e-01,
         7.25670369e-01],
       [-4.71602436e-01, -3.76749147e-01,  4.42158572e-01,
        -7.67199515e-01],
       [ 5.85144072e-02, -1.56633722e+00, -7.04555663e-01,
        -2.05861071e-03],
       [ 2.04570760e-01, -8.53162165e-01,  1.63804858e+00,
         9.01371063e-01],
       [-7.11878063e-01,  9.23243025e-01,  4.04657039e-01,
        -7.19990723e-01]])

For df2, the DataFrame with multiple dtypes, DataFrame.to_numpy() is relatively expensive.

In [17]:
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

DataFrame.to_numpy() does not include the index or column labels in the output.

### describe() 显示数据的快速统计摘要：

In [18]:
df.describe()

Unnamed: 0,A,B,C,D
count,8.0,8.0,8.0,8.0
mean,0.117399,-0.433873,0.351654,-0.088295
std,1.15066,0.873642,0.821741,0.795326
min,-1.292372,-1.585072,-0.704556,-1.302097
25%,-0.531671,-1.031456,-0.218515,-0.731793
50%,-0.195246,-0.291678,0.423408,-0.061117
75%,0.515404,-0.01451,0.657899,0.615007
max,2.153064,0.923243,1.638049,0.901371


### 转置

In [19]:
df.T

Unnamed: 0,2001-12-30,2001-12-31,2002-01-01,2002-01-02,2002-01-03,2002-01-04,2002-01-05,2002-01-06
A,1.447905,2.153064,-0.449006,-1.292372,-0.471602,0.058514,0.204571,-0.711878
B,-1.585072,-0.125871,0.319571,-0.206607,-0.376749,-1.566337,-0.853162,0.923243
C,-0.596625,1.267238,-0.092478,0.454786,0.442159,-0.704556,1.638049,0.404657
D,-0.120175,-1.302097,0.57812,0.72567,-0.7672,-0.002059,0.901371,-0.719991


### 按轴排序

In [20]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2001-12-30,-0.120175,-0.596625,-1.585072,1.447905
2001-12-31,-1.302097,1.267238,-0.125871,2.153064
2002-01-01,0.57812,-0.092478,0.319571,-0.449006
2002-01-02,0.72567,0.454786,-0.206607,-1.292372
2002-01-03,-0.7672,0.442159,-0.376749,-0.471602
2002-01-04,-0.002059,-0.704556,-1.566337,0.058514
2002-01-05,0.901371,1.638049,-0.853162,0.204571
2002-01-06,-0.719991,0.404657,0.923243,-0.711878


### 按值排序

In [21]:
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2001-12-30,1.447905,-1.585072,-0.596625,-0.120175
2002-01-04,0.058514,-1.566337,-0.704556,-0.002059
2002-01-05,0.204571,-0.853162,1.638049,0.901371
2002-01-03,-0.471602,-0.376749,0.442159,-0.7672
2002-01-02,-1.292372,-0.206607,0.454786,0.72567
2001-12-31,2.153064,-0.125871,1.267238,-1.302097
2002-01-01,-0.449006,0.319571,-0.092478,0.57812
2002-01-06,-0.711878,0.923243,0.404657,-0.719991


## 获取
### 选择单列，产生a Series,相当于df.A

In [22]:
df["A"]

2001-12-30    1.447905
2001-12-31    2.153064
2002-01-01   -0.449006
2002-01-02   -1.292372
2002-01-03   -0.471602
2002-01-04    0.058514
2002-01-05    0.204571
2002-01-06   -0.711878
Freq: D, Name: A, dtype: float64

### 选择via[],进行切片

In [23]:
df[0:3]

Unnamed: 0,A,B,C,D
2001-12-30,1.447905,-1.585072,-0.596625,-0.120175
2001-12-31,2.153064,-0.125871,1.267238,-1.302097
2002-01-01,-0.449006,0.319571,-0.092478,0.57812


In [24]:
df["20011230":"20020102"]

Unnamed: 0,A,B,C,D
2001-12-30,1.447905,-1.585072,-0.596625,-0.120175
2001-12-31,2.153064,-0.125871,1.267238,-1.302097
2002-01-01,-0.449006,0.319571,-0.092478,0.57812
2002-01-02,-1.292372,-0.206607,0.454786,0.72567


## 按标签选择

### 使用标签获取横截面 

In [25]:
df.loc[dates[0]]

A    1.447905
B   -1.585072
C   -0.596625
D   -0.120175
Name: 2001-12-30 00:00:00, dtype: float64

### 按标签在多轴上选择

In [26]:
df.loc[:,['A','B']]

Unnamed: 0,A,B
2001-12-30,1.447905,-1.585072
2001-12-31,2.153064,-0.125871
2002-01-01,-0.449006,0.319571
2002-01-02,-1.292372,-0.206607
2002-01-03,-0.471602,-0.376749
2002-01-04,0.058514,-1.566337
2002-01-05,0.204571,-0.853162
2002-01-06,-0.711878,0.923243


### 包括两个端点，显示标签切片

In [27]:
df.loc['20011230':'20020103',['A','B']]

Unnamed: 0,A,B
2001-12-30,1.447905,-1.585072
2001-12-31,2.153064,-0.125871
2002-01-01,-0.449006,0.319571
2002-01-02,-1.292372,-0.206607
2002-01-03,-0.471602,-0.376749


### Reduction in the dimensions of the returned object:

In [28]:
df.loc["20011231",['A','B']]

A    2.153064
B   -0.125871
Name: 2001-12-31 00:00:00, dtype: float64

### 获取标量值

In [29]:
df.loc[dates[0],'A']

1.4479050322455906

In [30]:
df.at[dates[0],'A']

1.4479050322455906

## 按位置选择

### 通过传递的整数的位置选择

In [31]:
df.iloc[3]

A   -1.292372
B   -0.206607
C    0.454786
D    0.725670
Name: 2002-01-02 00:00:00, dtype: float64

## 通过整数切片

In [32]:
df.iloc[1:4,0:2]

Unnamed: 0,A,B
2001-12-31,2.153064,-0.125871
2002-01-01,-0.449006,0.319571
2002-01-02,-1.292372,-0.206607


### 通过列表打印输出，类似于 NumPy/Python 风格：

In [33]:
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2001-12-31,2.153064,1.267238
2002-01-01,-0.449006,-0.092478
2002-01-03,-0.471602,0.442159


### 显式切片

In [34]:
df.iloc[:,1:3]

Unnamed: 0,B,C
2001-12-30,-1.585072,-0.596625
2001-12-31,-0.125871,1.267238
2002-01-01,0.319571,-0.092478
2002-01-02,-0.206607,0.454786
2002-01-03,-0.376749,0.442159
2002-01-04,-1.566337,-0.704556
2002-01-05,-0.853162,1.638049
2002-01-06,0.923243,0.404657


In [35]:
df.iloc[2:4,:]

Unnamed: 0,A,B,C,D
2002-01-01,-0.449006,0.319571,-0.092478,0.57812
2002-01-02,-1.292372,-0.206607,0.454786,0.72567


### 快速访问标量

In [36]:
df.iloc[2,3]

0.5781195148754029

In [37]:
df.iat[2,3]

0.5781195148754029

## 布尔索引

In [38]:
df

Unnamed: 0,A,B,C,D
2001-12-30,1.447905,-1.585072,-0.596625,-0.120175
2001-12-31,2.153064,-0.125871,1.267238,-1.302097
2002-01-01,-0.449006,0.319571,-0.092478,0.57812
2002-01-02,-1.292372,-0.206607,0.454786,0.72567
2002-01-03,-0.471602,-0.376749,0.442159,-0.7672
2002-01-04,0.058514,-1.566337,-0.704556,-0.002059
2002-01-05,0.204571,-0.853162,1.638049,0.901371
2002-01-06,-0.711878,0.923243,0.404657,-0.719991


### 使用单个列的值来选择数据

In [39]:
df[df['A']>0]

Unnamed: 0,A,B,C,D
2001-12-30,1.447905,-1.585072,-0.596625,-0.120175
2001-12-31,2.153064,-0.125871,1.267238,-1.302097
2002-01-04,0.058514,-1.566337,-0.704556,-0.002059
2002-01-05,0.204571,-0.853162,1.638049,0.901371


### 从满足布尔条件的 DataFrame 中选择值

In [40]:
df[df>0]

Unnamed: 0,A,B,C,D
2001-12-30,1.447905,,,
2001-12-31,2.153064,,1.267238,
2002-01-01,,0.319571,,0.57812
2002-01-02,,,0.454786,0.72567
2002-01-03,,,0.442159,
2002-01-04,0.058514,,,
2002-01-05,0.204571,,1.638049,0.901371
2002-01-06,,0.923243,0.404657,


### 使用isin()过滤方法

In [41]:
df2 = df.copy()

In [42]:
df2["E"]=["one", "one", "two", "three", "four", 
          "three","two","one"]

In [43]:
df2

Unnamed: 0,A,B,C,D,E
2001-12-30,1.447905,-1.585072,-0.596625,-0.120175,one
2001-12-31,2.153064,-0.125871,1.267238,-1.302097,one
2002-01-01,-0.449006,0.319571,-0.092478,0.57812,two
2002-01-02,-1.292372,-0.206607,0.454786,0.72567,three
2002-01-03,-0.471602,-0.376749,0.442159,-0.7672,four
2002-01-04,0.058514,-1.566337,-0.704556,-0.002059,three
2002-01-05,0.204571,-0.853162,1.638049,0.901371,two
2002-01-06,-0.711878,0.923243,0.404657,-0.719991,one


In [44]:
df2[df2['E'].isin(['two','four'])]

Unnamed: 0,A,B,C,D,E
2002-01-01,-0.449006,0.319571,-0.092478,0.57812,two
2002-01-03,-0.471602,-0.376749,0.442159,-0.7672,four
2002-01-05,0.204571,-0.853162,1.638049,0.901371,two


## 设置

设置新列会自动按索引对齐数据

In [45]:
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range("20130102", periods=6))

In [46]:
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

### 按标签设置值

In [47]:
df.at[dates[0],'A']=1.98765

In [48]:
df

Unnamed: 0,A,B,C,D
2001-12-30,1.98765,-1.585072,-0.596625,-0.120175
2001-12-31,2.153064,-0.125871,1.267238,-1.302097
2002-01-01,-0.449006,0.319571,-0.092478,0.57812
2002-01-02,-1.292372,-0.206607,0.454786,0.72567
2002-01-03,-0.471602,-0.376749,0.442159,-0.7672
2002-01-04,0.058514,-1.566337,-0.704556,-0.002059
2002-01-05,0.204571,-0.853162,1.638049,0.901371
2002-01-06,-0.711878,0.923243,0.404657,-0.719991


### 按位置设置值

In [49]:
df.iat[1,3]=1.23456

In [50]:
df

Unnamed: 0,A,B,C,D
2001-12-30,1.98765,-1.585072,-0.596625,-0.120175
2001-12-31,2.153064,-0.125871,1.267238,1.23456
2002-01-01,-0.449006,0.319571,-0.092478,0.57812
2002-01-02,-1.292372,-0.206607,0.454786,0.72567
2002-01-03,-0.471602,-0.376749,0.442159,-0.7672
2002-01-04,0.058514,-1.566337,-0.704556,-0.002059
2002-01-05,0.204571,-0.853162,1.638049,0.901371
2002-01-06,-0.711878,0.923243,0.404657,-0.719991


### 通过使用numpy数组赋值来设置

In [51]:
df.loc[:,'D']=np.array([5]*len(df))

In [52]:
df

Unnamed: 0,A,B,C,D
2001-12-30,1.98765,-1.585072,-0.596625,5
2001-12-31,2.153064,-0.125871,1.267238,5
2002-01-01,-0.449006,0.319571,-0.092478,5
2002-01-02,-1.292372,-0.206607,0.454786,5
2002-01-03,-0.471602,-0.376749,0.442159,5
2002-01-04,0.058514,-1.566337,-0.704556,5
2002-01-05,0.204571,-0.853162,1.638049,5
2002-01-06,-0.711878,0.923243,0.404657,5


### 把表中的正数变成相反数

In [53]:
df2 = df.copy()

In [54]:
df2[df2>0]=-df2

In [55]:
df2

Unnamed: 0,A,B,C,D
2001-12-30,-1.98765,-1.585072,-0.596625,-5
2001-12-31,-2.153064,-0.125871,-1.267238,-5
2002-01-01,-0.449006,-0.319571,-0.092478,-5
2002-01-02,-1.292372,-0.206607,-0.454786,-5
2002-01-03,-0.471602,-0.376749,-0.442159,-5
2002-01-04,-0.058514,-1.566337,-0.704556,-5
2002-01-05,-0.204571,-0.853162,-1.638049,-5
2002-01-06,-0.711878,-0.923243,-0.404657,-5


## 统计

### 执行描述性统计

In [56]:
df.mean()

A    0.184868
B   -0.433873
C    0.351654
D    5.000000
dtype: float64

另一个轴上的相同操作

In [57]:
df.mean(1)

2001-12-30    1.201488
2001-12-31    2.073608
2002-01-01    1.194522
2002-01-02    0.988952
2002-01-03    1.148452
2002-01-04    0.696905
2002-01-05    1.497364
2002-01-06    1.404006
Freq: D, dtype: float64

操作具有不同维度且需要对齐的对象。此外，pandas 会自动沿指定维度进行广播

In [58]:
s1 = pd.Series([1, 3, 5, np.nan, 6, 8,10,12], index=dates).shift(2)

In [59]:
s

0     1.0
1     3.0
2     5.0
3     NaN
4     6.0
5     8.0
6    10.0
7    12.0
dtype: float64

In [60]:
df.sub(s, axis="index")

Unnamed: 0,A,B,C,D
2001-12-30 00:00:00,,,,
2001-12-31 00:00:00,,,,
2002-01-01 00:00:00,,,,
2002-01-02 00:00:00,,,,
2002-01-03 00:00:00,,,,
2002-01-04 00:00:00,,,,
2002-01-05 00:00:00,,,,
2002-01-06 00:00:00,,,,
0,,,,
1,,,,
