# Pandas中DataFrame的介绍与基本操作
* Pandas是开源的数据分析与数据处理库，也常用于数据挖掘

* Pandas是基于Numpy开发的，并可以借助Matplotlib进行绘图

* DataFrame是Pandas中一种数据结构，它是带标签且大小可变的二维数据表格

In [2]:
# 导入pandas包
import pandas as pd
import numpy as np

In [3]:
# Draw a 10-row by 5-column array of samples from a normal distribution
# with mean 0 and standard deviation 1.
stock_change = np.random.normal(loc=0, scale=1, size=(10, 5))
stock_change

array([[-0.31704255, -1.15932952, -1.16826258, -2.57790439, -0.19786761],
       [ 0.7387701 ,  0.47090612, -0.06418984, -0.26979601, -2.01103445],
       [-1.30043779, -0.19111437,  1.23576029,  0.03277106, -1.32453329],
       [ 0.14574098,  1.02510756, -0.11192138, -0.64987771, -0.15524685],
       [ 0.89060777, -0.01753602, -1.13971543,  2.44601046,  1.64775541],
       [ 0.06734635, -0.30785511,  1.15923699,  0.16755081,  0.32608525],
       [ 0.40363543, -0.61311466,  1.27577844, -0.65187168,  0.28155264],
       [-0.41908279,  0.65215782,  0.69739269,  0.88647404,  1.18860653],
       [ 1.06257958, -0.26638774,  0.74607132, -0.12967171, -1.27022959],
       [-0.06295048,  0.47178126,  0.15648106,  1.14574984, -0.14065407]])

In [4]:
# 创建DataFrame类型的数据；data参数可以是ndarray、字典、列表或DataFrame等。注意：如果传入已有的DataFrame并同时指定了不同的index/columns，pandas会按标签对齐，匹配不上的位置会变成NaN
stock_rise = pd.DataFrame(stock_change)
stock_rise

Unnamed: 0,0,1,2,3,4
0,-0.317043,-1.15933,-1.168263,-2.577904,-0.197868
1,0.73877,0.470906,-0.06419,-0.269796,-2.011034
2,-1.300438,-0.191114,1.23576,0.032771,-1.324533
3,0.145741,1.025108,-0.111921,-0.649878,-0.155247
4,0.890608,-0.017536,-1.139715,2.44601,1.647755
5,0.067346,-0.307855,1.159237,0.167551,0.326085
6,0.403635,-0.613115,1.275778,-0.651872,0.281553
7,-0.419083,0.652158,0.697393,0.886474,1.188607
8,1.06258,-0.266388,0.746071,-0.129672,-1.27023
9,-0.06295,0.471781,0.156481,1.14575,-0.140654


In [5]:
# 查看stock_rise的形状
stock_rise.shape

(10, 5)

## 为DataFrame数据添加行索引和列索引

In [6]:
# Build the row labels: one "股票<k>" string per row of stock_rise,
# numbered starting from 1.
stock_code = list(map("股票{}".format, range(1, stock_rise.shape[0] + 1)))
stock_code

['股票1', '股票2', '股票3', '股票4', '股票5', '股票6', '股票7', '股票8', '股票9', '股票10']

In [7]:
# 创建DataFrame时通过index参数指定行索引；这里传入的是ndarray（stock_change），如果传入已有的DataFrame（如stock_rise），pandas会按标签对齐，匹配不上的位置会变成NaN
pd.DataFrame(stock_change, index=stock_code)

Unnamed: 0,0,1,2,3,4
股票1,-0.317043,-1.15933,-1.168263,-2.577904,-0.197868
股票2,0.73877,0.470906,-0.06419,-0.269796,-2.011034
股票3,-1.300438,-0.191114,1.23576,0.032771,-1.324533
股票4,0.145741,1.025108,-0.111921,-0.649878,-0.155247
股票5,0.890608,-0.017536,-1.139715,2.44601,1.647755
股票6,0.067346,-0.307855,1.159237,0.167551,0.326085
股票7,0.403635,-0.613115,1.275778,-0.651872,0.281553
股票8,-0.419083,0.652158,0.697393,0.886474,1.188607
股票9,1.06258,-0.266388,0.746071,-0.129672,-1.27023
股票10,-0.06295,0.471781,0.156481,1.14575,-0.140654


In [8]:
# 列索引（时间）的创建，可以利用pd中的date_range
# periods是要生成的日期个数，freq='B'表示按工作日生成（跳过周末）
date = pd.date_range(start='20211021', periods=stock_rise.shape[1], freq='B')
date

DatetimeIndex(['2021-10-21', '2021-10-22', '2021-10-25', '2021-10-26',
               '2021-10-27'],
              dtype='datetime64[ns]', freq='B')

In [9]:
# 创建DataFrame时通过columns参数指定列索引；这里传入的是ndarray（stock_change），如果传入已有的DataFrame，pandas会按标签对齐，匹配不上的位置会变成NaN
data = pd.DataFrame(stock_change, index=stock_code, columns=date)
data

Unnamed: 0,2021-10-21,2021-10-22,2021-10-25,2021-10-26,2021-10-27
股票1,-0.317043,-1.15933,-1.168263,-2.577904,-0.197868
股票2,0.73877,0.470906,-0.06419,-0.269796,-2.011034
股票3,-1.300438,-0.191114,1.23576,0.032771,-1.324533
股票4,0.145741,1.025108,-0.111921,-0.649878,-0.155247
股票5,0.890608,-0.017536,-1.139715,2.44601,1.647755
股票6,0.067346,-0.307855,1.159237,0.167551,0.326085
股票7,0.403635,-0.613115,1.275778,-0.651872,0.281553
股票8,-0.419083,0.652158,0.697393,0.886474,1.188607
股票9,1.06258,-0.266388,0.746071,-0.129672,-1.27023
股票10,-0.06295,0.471781,0.156481,1.14575,-0.140654


## DataFrame的属性

In [10]:
# 形状
data.shape

(10, 5)

In [11]:
# 行索引和列索引
data.index

Index(['股票1', '股票2', '股票3', '股票4', '股票5', '股票6', '股票7', '股票8', '股票9', '股票10'], dtype='object')

In [12]:
data.columns

DatetimeIndex(['2021-10-21', '2021-10-22', '2021-10-25', '2021-10-26',
               '2021-10-27'],
              dtype='datetime64[ns]', freq='B')

In [13]:
# 数据表中的值
data.values

array([[-0.31704255, -1.15932952, -1.16826258, -2.57790439, -0.19786761],
       [ 0.7387701 ,  0.47090612, -0.06418984, -0.26979601, -2.01103445],
       [-1.30043779, -0.19111437,  1.23576029,  0.03277106, -1.32453329],
       [ 0.14574098,  1.02510756, -0.11192138, -0.64987771, -0.15524685],
       [ 0.89060777, -0.01753602, -1.13971543,  2.44601046,  1.64775541],
       [ 0.06734635, -0.30785511,  1.15923699,  0.16755081,  0.32608525],
       [ 0.40363543, -0.61311466,  1.27577844, -0.65187168,  0.28155264],
       [-0.41908279,  0.65215782,  0.69739269,  0.88647404,  1.18860653],
       [ 1.06257958, -0.26638774,  0.74607132, -0.12967171, -1.27022959],
       [-0.06295048,  0.47178126,  0.15648106,  1.14574984, -0.14065407]])

In [14]:
# 转置
data.T

Unnamed: 0,股票1,股票2,股票3,股票4,股票5,股票6,股票7,股票8,股票9,股票10
2021-10-21,-0.317043,0.73877,-1.300438,0.145741,0.890608,0.067346,0.403635,-0.419083,1.06258,-0.06295
2021-10-22,-1.15933,0.470906,-0.191114,1.025108,-0.017536,-0.307855,-0.613115,0.652158,-0.266388,0.471781
2021-10-25,-1.168263,-0.06419,1.23576,-0.111921,-1.139715,1.159237,1.275778,0.697393,0.746071,0.156481
2021-10-26,-2.577904,-0.269796,0.032771,-0.649878,2.44601,0.167551,-0.651872,0.886474,-0.129672,1.14575
2021-10-27,-0.197868,-2.011034,-1.324533,-0.155247,1.647755,0.326085,0.281553,1.188607,-1.27023,-0.140654


In [15]:
# 只显示前5行
data.head()

Unnamed: 0,2021-10-21,2021-10-22,2021-10-25,2021-10-26,2021-10-27
股票1,-0.317043,-1.15933,-1.168263,-2.577904,-0.197868
股票2,0.73877,0.470906,-0.06419,-0.269796,-2.011034
股票3,-1.300438,-0.191114,1.23576,0.032771,-1.324533
股票4,0.145741,1.025108,-0.111921,-0.649878,-0.155247
股票5,0.890608,-0.017536,-1.139715,2.44601,1.647755


In [16]:
# 只显示后五行
data.tail()

Unnamed: 0,2021-10-21,2021-10-22,2021-10-25,2021-10-26,2021-10-27
股票6,0.067346,-0.307855,1.159237,0.167551,0.326085
股票7,0.403635,-0.613115,1.275778,-0.651872,0.281553
股票8,-0.419083,0.652158,0.697393,0.886474,1.188607
股票9,1.06258,-0.266388,0.746071,-0.129672,-1.27023
股票10,-0.06295,0.471781,0.156481,1.14575,-0.140654


## 设置与重新设置索引
* 设置时，可以在创建DataFrame时传入index和columns参数（同前面），也可以直接给.index与.columns属性整体赋值

* 设置新索引时，还可以使用set_index(keys, drop=True)

* 重新设置时，使用reset_index()

### 重新设置索引

In [22]:
data.reset_index()

Unnamed: 0,index,2021-10-21 00:00:00,2021-10-22 00:00:00,2021-10-25 00:00:00,2021-10-26 00:00:00,2021-10-27 00:00:00
0,股票1,-0.317043,-1.15933,-1.168263,-2.577904,-0.197868
1,股票2,0.73877,0.470906,-0.06419,-0.269796,-2.011034
2,股票3,-1.300438,-0.191114,1.23576,0.032771,-1.324533
3,股票4,0.145741,1.025108,-0.111921,-0.649878,-0.155247
4,股票5,0.890608,-0.017536,-1.139715,2.44601,1.647755
5,股票6,0.067346,-0.307855,1.159237,0.167551,0.326085
6,股票7,0.403635,-0.613115,1.275778,-0.651872,0.281553
7,股票8,-0.419083,0.652158,0.697393,0.886474,1.188607
8,股票9,1.06258,-0.266388,0.746071,-0.129672,-1.27023
9,股票10,-0.06295,0.471781,0.156481,1.14575,-0.140654


In [21]:
# 丢弃原标签
data.reset_index(drop=True)

Unnamed: 0,2021-10-21,2021-10-22,2021-10-25,2021-10-26,2021-10-27
0,-0.317043,-1.15933,-1.168263,-2.577904,-0.197868
1,0.73877,0.470906,-0.06419,-0.269796,-2.011034
2,-1.300438,-0.191114,1.23576,0.032771,-1.324533
3,0.145741,1.025108,-0.111921,-0.649878,-0.155247
4,0.890608,-0.017536,-1.139715,2.44601,1.647755
5,0.067346,-0.307855,1.159237,0.167551,0.326085
6,0.403635,-0.613115,1.275778,-0.651872,0.281553
7,-0.419083,0.652158,0.697393,0.886474,1.188607
8,1.06258,-0.266388,0.746071,-0.129672,-1.27023
9,-0.06295,0.471781,0.156481,1.14575,-0.140654


### 以某列设置新索引

In [26]:
# Create a small example DataFrame with three columns (month, year, sale)
# and four rows, using the default integer row index.
df = pd.DataFrame(
    data=dict(
        month=[1, 4, 7, 10],
        year=[2012, 2014, 2016, 2018],
        sale=[55, 40, 84, 31],
    )
)
df

Unnamed: 0,month,year,sale
0,1,2012,55
1,4,2014,40
2,7,2016,84
3,10,2018,31


In [27]:
# 将year这一列作为索引
df.set_index(keys=['year'])

Unnamed: 0_level_0,month,sale
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2012,1,55
2014,4,40
2016,7,84
2018,10,31


In [28]:
# 设置多个索引
# 这时候得到的是带有MultiIndex（多级索引）的DataFrame，它仍然是二维表格，但可以用来表示类似三维的数据：每个年份下可以有多个月份，每个月份对应各自的sale
df.set_index(keys=['year', 'month'])

Unnamed: 0_level_0,Unnamed: 1_level_0,sale
year,month,Unnamed: 2_level_1
2012,1,55
2014,4,40
2016,7,84
2018,10,31
