In [10]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Cap how many columns/rows pandas renders when a frame or series is displayed.
# The fully qualified "display." keys are used because option names are matched
# as patterns: a bare 'max_columns' / 'max_rows' can match more than one option
# on newer pandas releases and is then rejected as ambiguous.
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 20)

# Pandas Data Structures
## Series
A **Series** is a single vector of data (like a NumPy array) with an *index* that labels each element in the vector.

Series：一维数组，与 NumPy 中的一维 array 类似。二者与 Python 基本的数据结构 list 也很相近，其区别是：list 中的元素可以是不同的数据类型，而 array 和 Series 中通常只存储相同的数据类型，这样可以更有效地使用内存，提高运算效率。

In [16]:
# Build a Series from a plain list; no index is given, so pandas supplies one.
# The trailing NaN is a missing value.
raw_counts = [14, 14, 41234, 234, 3456, 902, np.nan]
counts = pd.Series(raw_counts)
counts

0       14.0
1       14.0
2    41234.0
3      234.0
4     3456.0
5      902.0
6        NaN
dtype: float64

In [18]:
# The data held by the Series, exposed as a NumPy array (see the array(...) output).
counts.values

array([  1.40000000e+01,   1.40000000e+01,   4.12340000e+04,
         2.34000000e+02,   3.45600000e+03,   9.02000000e+02,
                    nan])

如果创建 Series 时未指定 index，则默认采用 integer 类型的 index

In [17]:
# The labels attached to the elements; no index was passed when `counts` was
# built, so this is the default integer range.
counts.index

RangeIndex(start=0, stop=7, step=1)

创建 Series 时 index 可以指定特定 meaningful labels 作为 index:

In [30]:
# Same constructor, but with explicit string labels instead of the default index.
phylum_labels = ['Firmicutes', 'Proteobacteria', 'Actinobacteria', 'Bacteroidetes']
phylum_counts = [632, 1638, 569, 115]
bacteria = pd.Series(data=phylum_counts, index=phylum_labels)
bacteria

Firmicutes         632
Proteobacteria    1638
Actinobacteria     569
Bacteroidetes      115
dtype: int64

可以根据 index 获取 Series 的 value

In [23]:
# Look up one element of each Series by its index label: 0 on the default
# integer index of `counts`, 'Firmicutes' on the string index of `bacteria`.
# A single pre-formatted argument keeps the printed text the same whether
# `print` is the Python 2 statement or the Python 3 function.
print('%s ,  %s' % (counts[0], bacteria['Firmicutes']))

14.0 ,  632


In [26]:
# One boolean per index label: does the label end in 'bacteria'?
list(map(lambda label: label.endswith('bacteria'), bacteria.index))

[False, True, True, False]

In [27]:
# Keep only the entries whose label ends in 'bacteria', using a boolean mask
# computed from the index.
bacteria[bacteria.index.str.endswith('bacteria')]

Proteobacteria    1638
Actinobacteria     569
dtype: int64

虽然指定了特定 label 的 index，但仍然可以通过 integer 类型的下标（即位置）访问

In [28]:
# The same element fetched two ways: by position and by label.
# .iloc states the positional intent explicitly instead of relying on an
# integer key being reinterpreted as a position on a string-labelled Series.
# The single pre-formatted argument prints the same text under the Python 2
# print statement and the Python 3 print function.
print('%s ,  %s' % (bacteria.iloc[0], bacteria['Firmicutes']))

632 ,  632


可以指定 Series 的 values 数组 和 index 所表示的含义

In [63]:
# Name the values and the index; both names appear in the displayed output.
bacteria.name = "细菌数量"
bacteria.index.name = '细菌种类'
bacteria
# Alternatively, the name can be given when the Series is created:
# s = pd.Series(np.random.randn(5), name='something')

细菌种类
Firmicutes         632
Proteobacteria    1638
Actinobacteria     569
Bacteroidetes      115
Name: 细菌数量, dtype: int64

可以对 Series 数据结构的 value 应用 numpy 的 math 函数，结果仍然为 Series

In [33]:
# Applying a NumPy math function to a Series yields a Series that keeps the
# labels and names (see the output below).
np.log(bacteria)

细菌种类
Firmicutes        6.448889
Proteobacteria    7.401231
Actinobacteria    6.343880
Bacteroidetes     4.744932
Name: 细菌数量, dtype: float64

可以对 Series 的 value 进行过滤

In [35]:
# Boolean filtering: keep the entries whose value exceeds 100.
bacteria[bacteria > 100]

细菌种类
Firmicutes         632
Proteobacteria    1638
Actinobacteria     569
Bacteroidetes      115
Name: 细菌数量, dtype: int64

Series 类似于 ndarray，可以进行分片等操作

In [61]:
# ndarray-style access on a Series: last element, first three elements, and
# the elements above the median.
# .iloc makes the positional lookups explicit (the index holds strings, so a
# bare integer key only works through a positional fallback), and every print
# takes one parenthesised argument so it runs under Python 2 and Python 3.
print(bacteria.iloc[-1])
print('---------------------------')
print(bacteria.iloc[:3])
print('---------------------------')
print(bacteria[bacteria > bacteria.median()])

115
---------------------------
细菌种类
Firmicutes         632
Proteobacteria    1638
Actinobacteria     569
Name: 细菌数量, dtype: int64
---------------------------
细菌种类
Firmicutes         632
Proteobacteria    1638
Name: 细菌数量, dtype: int64


Series 可以看作一个存储 key-value 的数据结构，我们可以从一个 python 的 dict 创建一个 Series。在本例所用的旧版 pandas 中，创建的 Series 会按照键排序（较新的 pandas 在 Python 3.6+ 下会保留 dict 的插入顺序）

In [40]:
# A dict maps naturally onto a Series: keys become labels, values become data.
bacteria_dict = dict(
    Firmicutes=632,
    Proteobacteria=1638,
    Actinobacteria=569,
    Bacteroidetes=115,
)
pd.Series(bacteria_dict)

Actinobacteria     569
Bacteroidetes      115
Firmicutes         632
Proteobacteria    1638
dtype: int64

可以对传入的 dict 作筛选，对于未包含的 key， 其值为 NaN

In [42]:
# Passing an index alongside the dict keeps only the listed keys, in this order.
selected_phyla = ['Firmicutes', 'Proteobacteria', 'Actinobacteria']
bacteria1 = pd.Series(bacteria_dict, index=selected_phyla)
bacteria1

Firmicutes         632
Proteobacteria    1638
Actinobacteria     569
dtype: int64

In [45]:
# 'otherkey' is not a key of bacteria_dict, so its entry comes out as NaN
# (and the values are shown as floats).
requested_labels = ['otherkey', 'Firmicutes', 'Proteobacteria', 'Actinobacteria']
bacteria2 = pd.Series(bacteria_dict, index=requested_labels)
bacteria2

otherkey             NaN
Firmicutes         632.0
Proteobacteria    1638.0
Actinobacteria     569.0
dtype: float64

In [46]:
# Boolean Series flagging the missing entries (True only for 'otherkey' here).
bacteria2.isnull()

otherkey           True
Firmicutes        False
Proteobacteria    False
Actinobacteria    False
dtype: bool

当 Series 与其他 Series 对象进行运算时，会按标签**对齐数据**，例如对相同标签的数据求和；只在其中一方出现的标签，其结果为 NaN。

In [53]:
# Addition pairs values by label; 'otherkey' has no usable value on either
# side, so its sum is NaN (see the output below).
bacteria1 + bacteria2

Actinobacteria    1138.0
Firmicutes        1264.0
Proteobacteria    3276.0
otherkey             NaN
dtype: float64

## DataFrame
DataFrame 是一个二维的带标签数据结构，可以包含不同类型的列，类似于一个表格、SQL 表或是一个由 Series 组成的 dict，是 pandas 中最常用的数据结构。

In [72]:
# Every column must be backed by an array of the same length
# ("arrays must all be same length").
frame_columns = {
    'value': [632, 1638, 569, 115, 433, 1130, 754, 555],
    'patient': [1, 1, 1, 1, 2, 2, 2, 2],
    'phylum': [
        'Firmicutes', 'Proteobacteria', 'Actinobacteria', 'Bacteroidetes',
        'Firmicutes', 'Proteobacteria', 'Actinobacteria', 'Bacteroidetes',
    ],
}
data = pd.DataFrame(frame_columns)
data

Unnamed: 0,patient,phylum,value
0,1,Firmicutes,632
1,1,Proteobacteria,1638
2,1,Actinobacteria,569
3,1,Bacteroidetes,115
4,2,Firmicutes,433
5,2,Proteobacteria,1130
6,2,Actinobacteria,754
7,2,Bacteroidetes,555


在本例所用的 pandas 版本中，由 dict 创建的 DataFrame 默认按列名排序；可以通过传入列名列表来指定列的显示顺序：

In [73]:
# Indexing with a list of column names returns those columns in the listed order.
data[['value','phylum','patient']]

Unnamed: 0,value,phylum,patient
0,632,Firmicutes,1
1,1638,Proteobacteria,1
2,569,Actinobacteria,1
3,115,Bacteroidetes,1
4,433,Firmicutes,2
5,1130,Proteobacteria,2
6,754,Actinobacteria,2
7,555,Bacteroidetes,2


In [74]:
# The column labels of the frame.
data.columns

Index([u'patient', u'phylum', u'value'], dtype='object')

In [75]:
# Indexing with the complete set of column labels gives back every column.
data[data.columns]

Unnamed: 0,patient,phylum,value
0,1,Firmicutes,632
1,1,Proteobacteria,1638
2,1,Actinobacteria,569
3,1,Bacteroidetes,115
4,2,Firmicutes,433
5,2,Proteobacteria,1130
6,2,Actinobacteria,754
7,2,Bacteroidetes,555


可以通过 `data.column` 或 `data['column']` 获取指定列的数据

In [80]:
# Attribute access and item access return the same column, and that column is
# a pandas Series (confirmed by the printed type).
# Each print takes a single parenthesised argument so the cell runs under both
# the Python 2 print statement and the Python 3 print function.
print(data.phylum)
print('-----------------------')
print(data['phylum'])
print(type(data.phylum))

0        Firmicutes
1    Proteobacteria
2    Actinobacteria
3     Bacteroidetes
4        Firmicutes
5    Proteobacteria
6    Actinobacteria
7     Bacteroidetes
Name: phylum, dtype: object
-----------------------
0        Firmicutes
1    Proteobacteria
2    Actinobacteria
3     Bacteroidetes
4        Firmicutes
5    Proteobacteria
6    Actinobacteria
7     Bacteroidetes
Name: phylum, dtype: object
<class 'pandas.core.series.Series'>


In [83]:
# Show the whole frame again for reference before indexing its rows.
data

Unnamed: 0,patient,phylum,value
0,1,Firmicutes,632
1,1,Proteobacteria,1638
2,1,Actinobacteria,569
3,1,Bacteroidetes,115
4,2,Firmicutes,433
5,2,Proteobacteria,1130
6,2,Actinobacteria,754
7,2,Bacteroidetes,555


可以通过 DataFrame 的 **ix** 属性获取每一行的数据，且支持分片操作（注意：`ix` 在 pandas 0.20 中被弃用、在 1.0 中被移除，新代码应使用 `loc` 或 `iloc`）：

In [93]:
# Row access by index label. `.loc` is used in place of the `.ix` indexer,
# which newer pandas releases no longer provide; on this integer-labelled
# frame it selects the same rows. The label slice 2:5 includes both end
# points, so four rows (2, 3, 4, 5) are printed.
# Single-argument print calls run under both Python 2 and Python 3.
print(data.loc[0])
print('---------------------------------')
print(data.loc[2:5])

patient             1
phylum     Firmicutes
value             632
Name: 0, dtype: object
---------------------------------
   patient          phylum  value
2        1  Actinobacteria    569
3        1   Bacteroidetes    115
4        2      Firmicutes    433
5        2  Proteobacteria   1130


可通过一个 dict of dicts 创建一个复杂结构的 DataFrame

In [169]:
# Nested-dict construction: outer keys become the column labels and inner keys
# become the row labels, so each observation starts out as one column.
observations = [
    (1, 'Firmicutes', 632),
    (1, 'Proteobacteria', 1638),
    (1, 'Actinobacteria', 569),
    (1, 'Bacteroidetes', 115),
    (2, 'Firmicutes', 433),
    (2, 'Proteobacteria', 1130),
    (2, 'Actinobacteria', 754),
    (2, 'Bacteroidetes', 555),
]
data = pd.DataFrame({
    str(position): {'patient': patient, 'phylum': phylum, 'counts': count}
    for position, (patient, phylum, count) in enumerate(observations)
})
data

Unnamed: 0,0,1,2,3,4,5,6,7
counts,632,1638,569,115,433,1130,754,555
patient,1,1,1,1,2,2,2,2
phylum,Firmicutes,Proteobacteria,Actinobacteria,Bacteroidetes,Firmicutes,Proteobacteria,Actinobacteria,Bacteroidetes


In [170]:
# Transpose so that each observation becomes a row and each field a column.
# NOTE: `data` is rebound to its own transpose, so running this cell a second
# time flips the frame back again.
data = data.T
data

Unnamed: 0,counts,patient,phylum
0,632,1,Firmicutes
1,1638,1,Proteobacteria
2,569,1,Actinobacteria
3,115,1,Bacteroidetes
4,433,2,Firmicutes
5,1130,2,Proteobacteria
6,754,2,Actinobacteria
7,555,2,Bacteroidetes


In [171]:
# Pull out the 'counts' column as a Series (dtype object in the output below).
vals = data['counts']
vals

0     632
1    1638
2     569
3     115
4     433
5    1130
6     754
7     555
Name: counts, dtype: object

In [172]:
# Write into the column selected above. In the run recorded below the change
# is also visible in the parent DataFrame, i.e. `vals` shared its memory.
# .iloc states the positional intent explicitly: the index labels here are the
# strings '0'..'7', so a bare integer key only works via a positional fallback.
# NOTE(review): with pandas Copy-on-Write enabled the parent frame is expected
# to stay unchanged - confirm against the pandas version in use.
vals.iloc[0] = 1000
# Single-argument print calls run under both Python 2 and Python 3.
print(vals)
print('------------------------------------------------------------------')
print(data)

0    1000
1    1638
2     569
3     115
4     433
5    1130
6     754
7     555
Name: counts, dtype: object
------------------------------------------------------------------
  counts patient          phylum
0   1000       1      Firmicutes
1   1638       1  Proteobacteria
2    569       1  Actinobacteria
3    115       1   Bacteroidetes
4    433       2      Firmicutes
5   1130       2  Proteobacteria
6    754       2  Actinobacteria
7    555       2   Bacteroidetes


In [173]:
# Modify an explicit copy instead: the copy lives in separate memory, so the
# DataFrame keeps its previous value (1000 in the first row of the output).
vals = data.counts.copy()
# .iloc makes the positional write explicit on the string-labelled index.
vals.iloc[0] = -1
# Single-argument print calls run under both Python 2 and Python 3.
print(vals)
print('------------------------------------------------------------------')
print(data)

0      -1
1    1638
2     569
3     115
4     433
5    1130
6     754
7     555
Name: counts, dtype: object
------------------------------------------------------------------
  counts patient          phylum
0   1000       1      Firmicutes
1   1638       1  Proteobacteria
2    569       1  Actinobacteria
3    115       1   Bacteroidetes
4    433       2      Firmicutes
5   1130       2  Proteobacteria
6    754       2  Actinobacteria
7    555       2   Bacteroidetes


可以通过赋值的方式新增或修改一列数据。
注意**新增一列只能通过 data['new_column'] 的方式**，而不能通过 data.new_column

In [174]:
# Item assignment with a scalar creates the 'year' column, filled with that value.
data['year'] = 2017
# Does not raise, but does not add a column either: the frame displayed below
# has no 'month' column.
data.month = 1
data

Unnamed: 0,counts,patient,phylum,year
0,1000,1,Firmicutes,2017
1,1638,1,Proteobacteria,2017
2,569,1,Actinobacteria,2017
3,115,1,Bacteroidetes,2017
4,433,2,Firmicutes,2017
5,1130,2,Proteobacteria,2017
6,754,2,Actinobacteria,2017
7,555,2,Bacteroidetes,2017


新增的一列可以是一个 Series；赋值时按 index 标签对齐，没有匹配标签的行会被填充为 NaN

In [175]:
# Six values only (four 0s, two 1s), labelled with the default integer index.
treatment = pd.Series([0, 0, 0, 0, 1, 1])
treatment

0    0
1    0
2    0
3    0
4    1
5    1
dtype: int64

In [176]:
# Assigning a Series lines values up by index label, not by position.
# In the frame displayed below every 'treatment' cell is empty, which is
# consistent with a label mismatch: `treatment` carries the default integer
# labels 0..5, while `data` is labelled by the string dict keys '0'..'7'.
data['treatment'] = treatment
data

Unnamed: 0,counts,patient,phylum,year,treatment
0,1000,1,Firmicutes,2017,
1,1638,1,Proteobacteria,2017,
2,569,1,Actinobacteria,2017,
3,115,1,Bacteroidetes,2017,
4,433,2,Firmicutes,2017,
5,1130,2,Proteobacteria,2017,
6,754,2,Actinobacteria,2017,
7,555,2,Bacteroidetes,2017,


对于 python 的其他数据类型，添加列时，其长度需要匹配否则会报错

In [177]:
# A plain list is assigned by position, so it needs one entry per row
# (eight rows -> the strings '1'..'8').
month = list(map(str, range(1, 9)))
data['month'] = month

In [178]:
# Display the frame with the newly added 'month' column.
data

Unnamed: 0,counts,patient,phylum,year,treatment,month
0,1000,1,Firmicutes,2017,,1
1,1638,1,Proteobacteria,2017,,2
2,569,1,Actinobacteria,2017,,3
3,115,1,Bacteroidetes,2017,,4
4,433,2,Firmicutes,2017,,5
5,1130,2,Proteobacteria,2017,,6
6,754,2,Actinobacteria,2017,,7
7,555,2,Bacteroidetes,2017,,8


使用 `del` 删除 DataFrame 的某一列

In [179]:
# `del` removes the column from `data` itself; the frame displayed below no
# longer contains 'treatment'.
del data['treatment']
data

Unnamed: 0,counts,patient,phylum,year,month
0,1000,1,Firmicutes,2017,1
1,1638,1,Proteobacteria,2017,2
2,569,1,Actinobacteria,2017,3
3,115,1,Bacteroidetes,2017,4
4,433,2,Firmicutes,2017,5
5,1130,2,Proteobacteria,2017,6
6,754,2,Actinobacteria,2017,7
7,555,2,Bacteroidetes,2017,8


可以通过 DataFrame.values 将一个 DataFrame 转化为一个 ndarray 数组:

In [180]:
# The frame's contents as a 2-D NumPy array; the columns hold mixed types, so
# the array shown below has dtype=object.
data.values

array([[1000, 1, 'Firmicutes', 2017, '1'],
       [1638, 1, 'Proteobacteria', 2017, '2'],
       [569, 1, 'Actinobacteria', 2017, '3'],
       [115, 1, 'Bacteroidetes', 2017, '4'],
       [433, 2, 'Firmicutes', 2017, '5'],
       [1130, 2, 'Proteobacteria', 2017, '6'],
       [754, 2, 'Actinobacteria', 2017, '7'],
       [555, 2, 'Bacteroidetes', 2017, '8']], dtype=object)