In [38]:
# Pandas中一共有三种数据结构，分别为：Series、DataFrame和MultiIndex（老版本中叫Panel ）。
# 其中Series是一维数据结构，DataFrame是二维的表格型数据结构，MultiIndex是三维的数据结构。
import numpy as np


In [39]:
# Series
# Series是一个类似于一维数组的数据结构，它能够保存任何类型的数据，比如整数、字符串、浮点数等，主要由一组数据和与之相关的索引两部分构成。

In [40]:
# Series的创建
# 导入pandas
import pandas as pd
# pd.Series(data=None, index=None, dtype=None)
# 参数：
# data：传入的数据，可以是ndarray、list等
# index：索引，必须是唯一的，且与数据的长度相等。如果没有传入索引参数，则默认会自动创建一个从0-N的整数索引。
# dtype：数据的类型


In [41]:

pd.Series(np.arange(9))


0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
dtype: int64

In [42]:
# 指定索引
pd.Series([6.7,5.6,3,10,2], index=[1,2,3,4,5])

1     6.7
2     5.6
3     3.0
4    10.0
5     2.0
dtype: float64

In [43]:
# 通过字典数据创建
color_count = pd.Series({'red':100, 'blue':200, 'green': 500, 'yellow':1000})
color_count

blue       200
green      500
red        100
yellow    1000
dtype: int64

In [44]:
# Series的属性
# 为了更方便地操作Series对象中的索引和数据，Series中提供了两个属性index和values
color_count.index

Index(['blue', 'green', 'red', 'yellow'], dtype='object')

In [45]:
color_count.values

array([ 200,  500,  100, 1000])

In [46]:
# 也可以使用索引来获取数据：
color_count[0]

200

In [47]:
color_count[1]

500

# DataFrame


In [48]:
# DataFrame创建
# DataFrame是一个类似于二维数组或表格(如excel)的对象，既有行索引，又有列索引
# 行索引，表明不同行，横向索引，叫index，0轴，axis=0
# 列索引，表名不同列，纵向索引，叫columns，1轴，axis=1
pd.DataFrame(np.random.randn(2,3))

Unnamed: 0,0,1,2
0,0.212926,-0.179991,0.630448
1,0.258385,0.200643,1.185083


In [49]:
score = np.random.randint(40, 100, (10, 5))
score

array([[72, 71, 75, 41, 94],
       [47, 49, 44, 86, 71],
       [52, 71, 98, 63, 65],
       [75, 66, 85, 75, 65],
       [99, 89, 53, 82, 46],
       [66, 66, 61, 73, 91],
       [57, 55, 45, 69, 72],
       [50, 46, 45, 97, 57],
       [73, 72, 78, 71, 49],
       [40, 67, 44, 99, 78]])

In [50]:
score_df = pd.DataFrame(score)
score_df

Unnamed: 0,0,1,2,3,4
0,72,71,75,41,94
1,47,49,44,86,71
2,52,71,98,63,65
3,75,66,85,75,65
4,99,89,53,82,46
5,66,66,61,73,91
6,57,55,45,69,72
7,50,46,45,97,57
8,73,72,78,71,49
9,40,67,44,99,78


In [51]:
subjects = ["语文", "数学", "英语", "政治", "体育"]

stu = ["同学"+ str(i) for i in range(score_df.shape[0])]
# 注意这里score是numpy
data = pd.DataFrame(score, columns=subjects, index=stu)
stu

['同学0', '同学1', '同学2', '同学3', '同学4', '同学5', '同学6', '同学7', '同学8', '同学9']

In [52]:
data

Unnamed: 0,语文,数学,英语,政治,体育
同学0,72,71,75,41,94
同学1,47,49,44,86,71
同学2,52,71,98,63,65
同学3,75,66,85,75,65
同学4,99,89,53,82,46
同学5,66,66,61,73,91
同学6,57,55,45,69,72
同学7,50,46,45,97,57
同学8,73,72,78,71,49
同学9,40,67,44,99,78


## DataFrame的属性

In [53]:
# shape：形状
data.shape

(10, 5)

In [54]:
# index
# DataFrame的行索引列表
data.index

Index(['同学0', '同学1', '同学2', '同学3', '同学4', '同学5', '同学6', '同学7', '同学8', '同学9'], dtype='object')

In [55]:
# columns
# DataFrame的列索引列表
data.columns

Index(['语文', '数学', '英语', '政治', '体育'], dtype='object')

In [56]:
# values
# 直接获取其中array的值
data.values

array([[72, 71, 75, 41, 94],
       [47, 49, 44, 86, 71],
       [52, 71, 98, 63, 65],
       [75, 66, 85, 75, 65],
       [99, 89, 53, 82, 46],
       [66, 66, 61, 73, 91],
       [57, 55, 45, 69, 72],
       [50, 46, 45, 97, 57],
       [73, 72, 78, 71, 49],
       [40, 67, 44, 99, 78]])

In [57]:
# T
# 转置

In [58]:
data.T

Unnamed: 0,同学0,同学1,同学2,同学3,同学4,同学5,同学6,同学7,同学8,同学9
语文,72,47,52,75,99,66,57,50,73,40
数学,71,49,71,66,89,66,55,46,72,67
英语,75,44,98,85,53,61,45,45,78,44
政治,41,86,63,75,82,73,69,97,71,99
体育,94,71,65,65,46,91,72,57,49,78


In [59]:
# head(5)：显示前5行内容
# 如果不补充参数，默认5行。填入参数N则显示前N行
data.head(5)

Unnamed: 0,语文,数学,英语,政治,体育
同学0,72,71,75,41,94
同学1,47,49,44,86,71
同学2,52,71,98,63,65
同学3,75,66,85,75,65
同学4,99,89,53,82,46


In [60]:
# tail(5):显示后5行内容
# 如果不补充参数，默认5行。填入参数N则显示后N行
data.tail(5)

Unnamed: 0,语文,数学,英语,政治,体育
同学5,66,66,61,73,91
同学6,57,55,45,69,72
同学7,50,46,45,97,57
同学8,73,72,78,71,49
同学9,40,67,44,99,78


## DataFrame索引值的设置

In [61]:
# 修改行列索引值
stu = ["同学_"+ str(i) for i in range(score_df.shape[0])]
# 必须全体修改，单个无效
data.index = stu
data

Unnamed: 0,语文,数学,英语,政治,体育
同学_0,72,71,75,41,94
同学_1,47,49,44,86,71
同学_2,52,71,98,63,65
同学_3,75,66,85,75,65
同学_4,99,89,53,82,46
同学_5,66,66,61,73,91
同学_6,57,55,45,69,72
同学_7,50,46,45,97,57
同学_8,73,72,78,71,49
同学_9,40,67,44,99,78


In [62]:
data.reset_index()

Unnamed: 0,index,语文,数学,英语,政治,体育
0,同学_0,72,71,75,41,94
1,同学_1,47,49,44,86,71
2,同学_2,52,71,98,63,65
3,同学_3,75,66,85,75,65
4,同学_4,99,89,53,82,46
5,同学_5,66,66,61,73,91
6,同学_6,57,55,45,69,72
7,同学_7,50,46,45,97,57
8,同学_8,73,72,78,71,49
9,同学_9,40,67,44,99,78


In [63]:
data.reset_index(drop=True)

Unnamed: 0,语文,数学,英语,政治,体育
0,72,71,75,41,94
1,47,49,44,86,71
2,52,71,98,63,65
3,75,66,85,75,65
4,99,89,53,82,46
5,66,66,61,73,91
6,57,55,45,69,72
7,50,46,45,97,57
8,73,72,78,71,49
9,40,67,44,99,78


In [64]:
# 以某列值设置为新的索引
# set_index(keys, drop=True)
# keys : 列索引名成或者列索引名称的列表
# drop : boolean, default True.当做新的索引，删除原来的列

In [65]:
df = pd.DataFrame({'month': [1, 4, 7, 10],
                    'year': [2012, 2014, 2013, 2014],
                    'sale':[55, 40, 84, 31]})
df

Unnamed: 0,month,sale,year
0,1,55,2012
1,4,40,2014
2,7,84,2013
3,10,31,2014


In [66]:
df.set_index("year")

Unnamed: 0_level_0,month,sale
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2012,1,55
2014,4,40
2013,7,84
2014,10,31


In [67]:
df = df.set_index(["year", "month"])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,sale
year,month,Unnamed: 2_level_1
2012,1,55
2014,4,40
2013,7,84
2014,10,31


# MultiIndex与Panel
## MultiIndex

In [68]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,sale
year,month,Unnamed: 2_level_1
2012,1,55
2014,4,40
2013,7,84
2014,10,31


In [70]:
df.index
# MultiIndex(levels=[[2012, 2013, 2014], [1, 4, 7, 10]], #标签值
        #    labels=[[0, 2, 1, 2], [0, 1, 2, 3]], #标签值排序
        #    names=['year', 'month']) #标签名

MultiIndex(levels=[[2012, 2013, 2014], [1, 4, 7, 10]],
           labels=[[0, 2, 1, 2], [0, 1, 2, 3]],
           names=['year', 'month'])

In [71]:
df.index.names

FrozenList(['year', 'month'])

In [72]:
df.index.levels

FrozenList([[2012, 2013, 2014], [1, 4, 7, 10]])

In [75]:
# 创建
arrays = [[1, 1, 2, 2], ["r", "b", "r","b"]]
a1 = pd.MultiIndex.from_arrays(arrays, names=("num", "col"))

In [76]:
a1

MultiIndex(levels=[[1, 2], ['b', 'r']],
           labels=[[0, 0, 1, 1], [1, 0, 1, 0]],
           names=['num', 'col'])

## panel

In [77]:
# panel的创建
# class pandas.Panel (data=None, items=None, major_axis=None, minor_axis=None)
# 作用：存储3维数组的Panel结构
# 参数：
# data : ndarray或者dataframe
# items : 索引或类似数组的对象，axis=0
# major_axis : 索引或类似数组的对象，axis=1
# minor_axis : 索引或类似数组的对象，axis=2
p = pd.Panel(data=np.arange(24).reshape(4,3,2),
                 items=list('ABCD'),
                 major_axis=pd.date_range('20130101', periods=3),
                 minor_axis=['first', 'second'])
p

<class 'pandas.core.panel.Panel'>
Dimensions: 4 (items) x 3 (major_axis) x 2 (minor_axis)
Items axis: A to D
Major_axis axis: 2013-01-01 00:00:00 to 2013-01-03 00:00:00
Minor_axis axis: first to second

In [78]:
# 查看panel数据
p[:, :, "second"] #：所有

Unnamed: 0,A,B,C,D
2013-01-01,1,7,13,19
2013-01-02,3,9,15,21
2013-01-03,5,11,17,23


In [79]:
p[:, :, "first"]

Unnamed: 0,A,B,C,D
2013-01-01,0,6,12,18
2013-01-02,2,8,14,20
2013-01-03,4,10,16,22


In [80]:
p["A", :, :]

Unnamed: 0,first,second
2013-01-01,0,1
2013-01-02,2,3
2013-01-03,4,5
