### 主要数据结构
1D:Series,2D:DataFrame

In [50]:
import pandas as pd
import numpy as np
# A Series is comparable to an np.array, except that it pairs the data with an index.
# Build one from a dict: the keys become the index labels.
sdata = dict(a=1, b=2, c=3)
obj = pd.Series(data=sdata)
obj

a    1
b    2
c    3
dtype: int64

In [12]:
# Arithmetic aligns the two Series on their index labels; a label that is
# missing from either operand ('c' here) yields NaN in the result.
obj2 = pd.Series(dict(b=3, a=6))
obj.add(obj2)

a    7.0
b    5.0
c    NaN
dtype: float64

In [14]:
# The index: the labels attached to the Series values.
obj.index

Index(['a', 'b', 'c'], dtype='object')

In [18]:
# Both the Series and its index carry a `name` attribute,
# and both names show up in the repr.
obj.index.name = 'Letter'
obj.name = 'test'
obj

Letter
a    1
b    2
c    3
Name: test, dtype: int64

### DataFrame
表格型数据结构，含有一组有序的列，每列的值类型可不相同。  
既有行索引也有列索引，  
可视为由Series组成的字典。

In [21]:
# Build a DataFrame from a dict of equal-length lists; each key becomes a column.
columns_by_name = {
    'state': ['ohin', 'ohin', 'ohin', 'Nevada'],
    'year': [2000, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4],
}
frame = pd.DataFrame(data=columns_by_name)
frame

Unnamed: 0,pop,state,year
0,1.5,ohin,2000
1,1.7,ohin,2001
2,3.6,ohin,2002
3,2.4,Nevada,2003


In [33]:
# Put the columns in an explicit order, then swap the default integer
# row labels for letters.
frame = frame.reindex(columns=['year', 'state', 'pop'])
frame.index = list('abcd')
frame

Unnamed: 0,year,state,pop
a,2000,,1.5
b,2001,,1.7
c,2002,,3.6
d,2003,,2.4


In [34]:
# A single DataFrame column is a Series.
frame['year']

a    2000
b    2001
c    2002
d    2003
Name: year, dtype: int64

In [35]:
# Assigning to a column that does not exist creates it; `del` removes a column.
# The deletion is guarded so that re-running this cell does not raise
# KeyError once 'state' has already been removed.
if 'state' in frame.columns:
    del frame['state']
frame['debt'] = [1, 2, 3, 4]
frame

Unnamed: 0,year,pop,debt
a,2000,1.5,1
b,2001,1.7,2
c,2002,3.6,3
d,2003,2.4,4


In [39]:
# Nested dicts: the outer keys become the column names and the inner keys
# become the row index.
nevada_by_year = {2001: 2.4, 2002: 2.9}
ohin_by_year = {2000: 1.5, 2001: 1.7, 2002: 3.6}
pop = {'Nevada': nevada_by_year, 'Ohin': ohin_by_year}
frame2 = pd.DataFrame(data=pop)
frame2

Unnamed: 0,Nevada,Ohin
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [40]:
# Transpose: rows become columns and columns become rows.
frame2.transpose()

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohin,1.5,1.7,3.6


### 索引对象
Index 对象是不可修改的（immutable）。  
显式设置的 index 可以包含重复的标签。

In [41]:
# drop: removes entries from the index along a given axis (not called in this cell).
# Indexing, selection and filtering: show the frame that the next cells slice.
frame

Unnamed: 0,year,pop,debt
a,2000,1.5,1
b,2001,1.7,2
c,2002,3.6,3
d,2003,2.4,4


In [43]:
# Pull the 'debt' column out as a Series for the slicing examples that follow.
# NOTE(review): the variable name `Series` reads like the pandas class name;
# it is kept because the following cells refer to it.
Series = frame['debt']
Series

a    1
b    2
c    3
d    4
Name: debt, dtype: int64

In [45]:
# Select the third and fourth elements by position. `.iloc` states the
# positional intent explicitly instead of relying on how `[]` interprets an
# integer slice on a label-indexed Series; the end position is excluded.
Series.iloc[2:4]

c    3
d    4
Name: debt, dtype: int64

In [48]:
# Slicing by index label includes the end label, so this selects 'b' through 'd'.
Series.loc['b':'d']

b    2
c    3
d    4
Name: debt, dtype: int64

### 算术运算和数据对齐

In [49]:
# Rebuild both Series from scratch for the alignment example.
sdata = dict(a=1, b=2, c=3)
obj = pd.Series(data=sdata)
obj2 = pd.Series(dict(b=3, a=6))
# Labels present in only one operand ('c' here) come out as NaN in the sum.
obj.add(obj2)

a    7.0
b    5.0
c    NaN
dtype: float64

In [82]:
# For DataFrames, alignment happens on the rows and the columns at the same time.
df1 = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=['Ohin', 'Texas', 'Colorado'],
    columns=['b', 'c', 'd'],
)
df1

Unnamed: 0,b,c,d
Ohin,0,1,2
Texas,3,4,5
Colorado,6,7,8


In [83]:
# Second frame: it shares only some row and column labels with df1.
df2 = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=['Utah', 'Ohin', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)
df2

Unnamed: 0,b,d,e
Utah,0,1,2
Ohin,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [88]:
# The sum is defined only where a row label and a column label exist in both
# frames; every other cell of the combined grid is NaN.
df1.add(df2)

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohin,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [102]:
# Arithmetic methods add/sub/div/mul correspond to the operators + - / *.
# Here the frames are added first, and the NaN cells left by non-overlapping
# labels are then replaced with 0.
summed = df1.add(df2)
summed.fillna(value=0)

Unnamed: 0,b,c,d,e
Colorado,0.0,0.0,0.0,0.0
Ohin,3.0,0.0,6.0,0.0
Oregon,0.0,0.0,0.0,0.0
Texas,9.0,0.0,12.0,0.0
Utah,0.0,0.0,0.0,0.0


### DataFrame与Series计算
广播

In [103]:
# Show df2 again as the starting point for the broadcasting example.
df2

Unnamed: 0,b,d,e
Utah,0,1,2
Ohin,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [107]:
# Take the first row by position as a Series whose labels are df2's column names.
# `.iloc` replaces the `.ix` indexer, which is deprecated and no longer
# available in current pandas releases.
sr = df2.iloc[0]
sr

b    0
d    1
e    2
Name: Utah, dtype: int64

In [109]:
# sr is matched against the column labels and broadcast down the rows,
# so it is subtracted from every row in turn.
df2.sub(sr)

Unnamed: 0,b,d,e
Utah,0,0,0
Ohin,3,3,3
Texas,6,6,6
Oregon,9,9,9


### 函数应用和映射
函数的元素级应用：NumPy 的元素级函数可直接作用于 DataFrame。  
apply()：将函数应用到各行或各列上。

In [114]:
# A NumPy element-wise function applied to a DataFrame: the result keeps
# df2's row and column labels.
np.sqrt(df2)

Unnamed: 0,b,d,e
Utah,0.0,1.0,1.414214
Ohin,1.732051,2.0,2.236068
Texas,2.44949,2.645751,2.828427
Oregon,3.0,3.162278,3.316625


In [116]:
def f(x):
    """Return the spread (maximum minus minimum) of the values in x."""
    return x.max() - x.min()


# With the default axis, apply() hands each column to f.
df2.apply(f)

b    9
d    9
e    9
dtype: int64

In [118]:
# With axis=1 ('columns'), f receives one row at a time, giving one result per row label.
df2.apply(f, axis='columns')

Utah      2
Ohin      2
Texas     2
Oregon    2
dtype: int64

In [119]:
# Element-wise functions: applymap for a DataFrame, map for a Series.
# NOTE(review): newer pandas releases may deprecate applymap in favour of
# DataFrame.map - confirm against the installed version.
def Format(x):
    """Render x as a string with two decimal places."""
    return '%.2f' % x


df2.applymap(Format)

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohin,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [121]:
# Series.map applies Format to every element; the result holds strings (dtype object).
obj.map(Format)

a    1.00
b    2.00
c    3.00
dtype: object

### 排序和排名

In [124]:
# Sorting is done with sort_index / sort_values.
# Start from a Series whose index labels are not in alphabetical order.
obj = pd.Series(range(4), index=['b', 'c', 'd', 'a'])
obj

b    0
c    1
d    2
a    3
dtype: int64

In [129]:
# Return the Series ordered by its index labels; obj itself is not reassigned here.
obj.sort_index()

a    3
b    0
c    1
d    2
dtype: int64

In [134]:
obj = pd.Series([4, 6, 1, 4], index=['a', 'b', 'c', 'd'])
# Order by value, comparable to ORDER BY in SQL.
obj.sort_values()

c    1
a    4
d    4
b    6
dtype: int64

In [140]:
# rank(): assign a rank to each value.
obj = pd.Series([7, 12, 7, 3, 6, 8])
# Only the last expression of a cell is displayed, so the variants are bound
# to names instead of being computed and silently thrown away.
# Default: tied values share the average of their ranks
# (other methods include 'max', 'min' and 'first').
rank_average = obj.rank()
# 'first': ties are ranked in the order in which they appear.
rank_first = obj.rank(method='first')
# Descending: the largest value receives rank 1.
rank_descending = obj.rank(ascending=False)
rank_descending

0    3.5
1    1.0
2    3.5
3    6.0
4    5.0
5    2.0
dtype: float64

### 汇总和描述统计
各类汇总和描述统计函数

In [141]:
# Summary statistics for each column: count, mean, std, min, quartiles and max.
df2.describe()

Unnamed: 0,b,d,e
count,4.0,4.0,4.0
mean,4.5,5.5,6.5
std,3.872983,3.872983,3.872983
min,0.0,1.0,2.0
25%,2.25,3.25,4.25
50%,4.5,5.5,6.5
75%,6.75,7.75,8.75
max,9.0,10.0,11.0


## Chap6 数据加载、存储与文件格式
重点掌握csv读写、数据库连接等，根据需要进行学习