In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
# Build a Series from a plain list; one entry is missing (np.nan).
s = pd.Series([1, 3, 5, np.nan, 6, 8])

In [4]:
# Display the Series: the integers are shown as floats (dtype float64) next to the NaN entry.
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [5]:
# Six consecutive calendar days starting 2001-12-30, used as the row index below.
dates = pd.date_range(start="20011230", periods=6, freq="D")

In [6]:
# Display the index: six daily timestamps starting 2001-12-30 (freq='D').
dates

DatetimeIndex(['2001-12-30', '2001-12-31', '2002-01-01', '2002-01-02',
               '2002-01-03', '2002-01-04'],
              dtype='datetime64[ns]', freq='D')

In [7]:
# Draw the 6x4 standard-normal sample from a seeded local generator so that
# Restart & Run All reproduces the same frame; the previous unseeded
# np.random.randn call produced different values on every run.
# NOTE: outputs saved before this change predate the seed; re-run all cells
# to refresh them.
rng = np.random.default_rng(seed=0)
df = pd.DataFrame(rng.standard_normal((6, 4)), index=dates, columns=list("ABCD"))

In [8]:
# Display the whole frame (6 rows x 4 columns).
df

Unnamed: 0,A,B,C,D
2001-12-30,-0.753951,0.168511,0.533187,1.370839
2001-12-31,0.749287,-0.124941,-1.40281,1.516351
2002-01-01,0.304845,-0.567577,-0.600095,-2.338366
2002-01-02,0.082359,-0.107569,0.138668,0.31124
2002-01-03,-0.507269,1.1041,0.941182,-0.22368
2002-01-04,1.482971,0.059103,1.555629,-0.412982


In [9]:
# Mixed-dtype example frame: every column is given as a different kind of
# object (scalar, Timestamp, Series, ndarray, Categorical, string).
df2_columns = {
    "A": 1.0,
    "B": pd.Timestamp("20130102"),
    "C": pd.Series(1, index=list(range(4)), dtype="float32"),
    "D": np.full(4, 3, dtype="int32"),
    "E": pd.Categorical(["test", "train", "test", "train"]),
    "F": "foo",
}
df2 = pd.DataFrame(df2_columns)

In [10]:
# Display df2: the scalar values given for A, B and F appear on all four rows.
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [11]:
# One dtype per column; here every column reports a different dtype.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## 查看数据：以下是查看 DataFrame 顶部和底部若干行的方法

### `head()` 默认显示前 5 行，`tail(3)` 显示最后 3 行

In [12]:
# head() with no argument returns the first five rows here.
df.head()

Unnamed: 0,A,B,C,D
2001-12-30,-0.753951,0.168511,0.533187,1.370839
2001-12-31,0.749287,-0.124941,-1.40281,1.516351
2002-01-01,0.304845,-0.567577,-0.600095,-2.338366
2002-01-02,0.082359,-0.107569,0.138668,0.31124
2002-01-03,-0.507269,1.1041,0.941182,-0.22368


In [13]:
# tail(3) returns the last three rows.
df.tail(3)

Unnamed: 0,A,B,C,D
2002-01-02,0.082359,-0.107569,0.138668,0.31124
2002-01-03,-0.507269,1.1041,0.941182,-0.22368
2002-01-04,1.482971,0.059103,1.555629,-0.412982


### 显示索引、列：

In [14]:
# Row labels: the DatetimeIndex that was passed as index= when df was built.
df.index

DatetimeIndex(['2001-12-30', '2001-12-31', '2002-01-01', '2002-01-02',
               '2002-01-03', '2002-01-04'],
              dtype='datetime64[ns]', freq='D')

In [15]:
# Column labels: 'A' through 'D'.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

DataFrame.to_numpy() gives a NumPy representation of the underlying data. Note that this can be an expensive operation when your DataFrame has columns with different data types, which comes down to a fundamental difference between pandas and NumPy: NumPy arrays have one dtype for the entire array, while pandas DataFrames have one dtype per column. When you call DataFrame.to_numpy(), pandas will find the NumPy dtype that can hold all of the dtypes in the DataFrame. This may end up being object, which requires casting every value to a Python object.

In [16]:
# Returns a 6x4 array of floats; row and column labels are not part of the result.
df.to_numpy()

array([[-0.75395082,  0.16851127,  0.53318689,  1.37083919],
       [ 0.74928713, -0.12494096, -1.40280962,  1.51635081],
       [ 0.30484475, -0.56757675, -0.60009452, -2.33836572],
       [ 0.08235914, -0.10756859,  0.13866838,  0.31123961],
       [-0.50726873,  1.10410043,  0.94118248, -0.22367999],
       [ 1.48297087,  0.05910321,  1.55562889, -0.41298161]])

For df2, the DataFrame with multiple dtypes, DataFrame.to_numpy() is relatively expensive.

In [17]:
# With mixed column dtypes the resulting array has dtype=object (see output).
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

DataFrame.to_numpy() does not include the index or column labels in the output.

### describe() 显示数据的快速统计摘要：

In [18]:
# Per-column count, mean, std, min, quartiles (25%/50%/75%) and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.226374,0.088605,0.194294,0.037234
std,0.822027,0.557341,1.06872,1.411071
min,-0.753951,-0.567577,-1.40281,-2.338366
25%,-0.359862,-0.120598,-0.415404,-0.365656
50%,0.193602,-0.024233,0.335928,0.04378
75%,0.638177,0.141159,0.839184,1.105939
max,1.482971,1.1041,1.555629,1.516351


### 转置

In [19]:
# Transpose: column labels become the row index and the dates become the columns.
df.T

Unnamed: 0,2001-12-30,2001-12-31,2002-01-01,2002-01-02,2002-01-03,2002-01-04
A,-0.753951,0.749287,0.304845,0.082359,-0.507269,1.482971
B,0.168511,-0.124941,-0.567577,-0.107569,1.1041,0.059103
C,0.533187,-1.40281,-0.600095,0.138668,0.941182,1.555629
D,1.370839,1.516351,-2.338366,0.31124,-0.22368,-0.412982


### 按轴排序

In [20]:
# axis=1 sorts by column label; ascending=False yields the order D, C, B, A.
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2001-12-30,1.370839,0.533187,0.168511,-0.753951
2001-12-31,1.516351,-1.40281,-0.124941,0.749287
2002-01-01,-2.338366,-0.600095,-0.567577,0.304845
2002-01-02,0.31124,0.138668,-0.107569,0.082359
2002-01-03,-0.22368,0.941182,1.1041,-0.507269
2002-01-04,-0.412982,1.555629,0.059103,1.482971


### 按值排序

In [21]:
# Reorder the rows by the values in column B, smallest first.
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2002-01-01,0.304845,-0.567577,-0.600095,-2.338366
2001-12-31,0.749287,-0.124941,-1.40281,1.516351
2002-01-02,0.082359,-0.107569,0.138668,0.31124
2002-01-04,1.482971,0.059103,1.555629,-0.412982
2001-12-30,-0.753951,0.168511,0.533187,1.370839
2002-01-03,-0.507269,1.1041,0.941182,-0.22368


## 获取
### 选择单列，得到一个 Series，相当于 `df.A`

In [22]:
# Selecting a single column label returns a Series named after that column.
df["A"]

2001-12-30   -0.753951
2001-12-31    0.749287
2002-01-01    0.304845
2002-01-02    0.082359
2002-01-03   -0.507269
2002-01-04    1.482971
Freq: D, Name: A, dtype: float64

### 通过 `[]` 对行进行切片

In [23]:
# An integer slice inside [] selects rows by position; the end (3) is excluded.
df[0:3]

Unnamed: 0,A,B,C,D
2001-12-30,-0.753951,0.168511,0.533187,1.370839
2001-12-31,0.749287,-0.124941,-1.40281,1.516351
2002-01-01,0.304845,-0.567577,-0.600095,-2.338366


In [24]:
# A string slice inside [] selects rows by date label; both endpoints are included.
df["20011230":"20020102"]

Unnamed: 0,A,B,C,D
2001-12-30,-0.753951,0.168511,0.533187,1.370839
2001-12-31,0.749287,-0.124941,-1.40281,1.516351
2002-01-01,0.304845,-0.567577,-0.600095,-2.338366
2002-01-02,0.082359,-0.107569,0.138668,0.31124


## 按标签选择

### 使用标签获取一行（横截面）

In [25]:
# A single row label returns that row as a Series indexed by column name.
df.loc[dates[0]]

A   -0.753951
B    0.168511
C    0.533187
D    1.370839
Name: 2001-12-30 00:00:00, dtype: float64