In [142]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 1.创建 Series 对象

In [143]:
# Build a Series from a plain list; no index is passed, so pandas supplies one.
s = pd.Series([2,5,7,1])
print(type(s))
# Bare last expression: the notebook displays the Series (index left, values right).
s

<class 'pandas.core.series.Series'>


0    2
1    5
2    7
3    1
dtype: int64

Series的交互式显示的字符串表示形式是索引在左边，值在右边。因为我们没有给数据指定索引，一个包含整数0到 N-1 （这里N是数据的长度）的默认索引被创建。 你可以分别通过它的 values 和 index 属性来获取Series的数组表示和索引对象：

In [144]:
# The index object pandas created automatically for `s`.
s.index

RangeIndex(start=0, stop=4, step=1)

In [145]:
# The data of `s` as a NumPy array (labels are not included).
s.values

array([2, 5, 7, 1])

自定义索引

In [146]:
# Same four values, this time labelled with the custom index d, b, a, c.
obj2 = pd.Series(
    data=[4, 7, -5, 3],
    index=list('dbac'),
)
obj2

d    4
b    7
a   -5
c    3
dtype: int64

另一种思考的方式是，Series是一个定长的，有序的字典，因为它把索引和值映射起来了。它可以适用于许多期望一个字典的函数

In [147]:
# `in` tests against the index labels (like dict keys), not against the values.
print('b' in obj2)
print('f' in obj2)

True
False


可以通过传递字典来从这些数据创建一个Series

In [148]:
# State name -> number; passing the dict to Series turns its keys into the index.
sdata = dict(Ohio=35000, Texas=71000, Oregon=16000, Utah=5000)
obj3 = pd.Series(data=sdata)
obj3

Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

只传递一个字典的时候，结果Series中的索引就是字典的键（上面的输出来自旧版 pandas，索引是排序后的字典的键；pandas 0.23 及以后版本在 Python 3.6+ 上会保持字典的插入顺序）

在下面的例子中， sdata 中的3个值被放在了合适的位置，但因为没有发现对应于 ‘California’ 的值，就出现了 NaN（缺失值）; 在pandas中用函数 isnull 和 notnull 来检测数据丢失

In [149]:
# Build from the same dict but with an explicit index: 'California' has no entry
# in sdata and shows up as NaN; 'Utah' is not listed in `states` and is left out.
states = ['California', 'Ohio', 'Oregon', 'Texas'] 
obj4 = pd.Series(sdata, index=states)
print(obj4)

# Boolean mask that is True where the value is missing.
pd.isnull(obj4) # obj4.isnull()

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64


California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

## 2. 操作 Series 对象

In [150]:
# Index object holding the custom labels of obj2.
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

与正规的NumPy数组相比，你可以使用索引里的值来选择一个单一值或一个值集

In [151]:
# Print the whole Series first, then pick a single value by its index label.
print(obj2, end='\n\n')
obj2['c']

d    4
b    7
a   -5
c    3
dtype: int64



3

In [152]:
# Select the second element by POSITION. obj2 is labelled with strings, so a
# plain obj2[1] relies on an integer-to-position fallback that recent pandas
# deprecates; .iloc states the positional intent explicitly.
obj2.iloc[1]

7

In [153]:
# A list of labels selects several values, returned in the order requested.
obj2[['c', 'b', 'd']]

c    3
b    7
d    4
dtype: int64

In [232]:
# `obj` is only created further down in this notebook, so build it here to keep
# the cell runnable when the notebook is executed top to bottom.
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
# Label slicing follows index order and includes BOTH endpoints ('b' through 'c').
obj['b':'c']

b    7.2
a   -5.3
c    3.6
dtype: float64

通过一个布尔数组过滤，标量乘法，或使用数学函数，将会保持索引和值间的关联

In [154]:
# Display obj2 again as the starting point for the operations that follow.
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [155]:
# Boolean-mask filtering: keeps the positive entries together with their labels.
obj2[obj2>0]

d    4
b    7
c    3
dtype: int64

In [156]:
# Element-wise square; the index labels are carried over unchanged.
obj2**2

d    16
b    49
a    25
c     9
dtype: int64

In [157]:
# A NumPy function applied to a Series works element-wise and returns a Series
# with the same index.
np.exp(obj2)

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

在许多应用中Series的一个重要功能是在算术运算中它会自动对齐不同索引的数据

In [158]:
# Show both operands: their indexes only partly overlap
# (obj3 has Utah, obj4 has California).
print(obj3, end='\n\n')
print(obj4)

Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64


In [159]:
# Addition aligns on index labels; a label that is missing (or NaN) on either
# side yields NaN in the result.
obj3+obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

### Series的索引的赋值

In [160]:
# Assigning to .index relabels the Series in place; the values stay the same.
print(obj2, end='\n\n')
obj2.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
print(obj2)

d    4
b    7
a   -5
c    3
dtype: int64

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64


### values

In [176]:
# The values as a NumPy array; the relabelling above did not touch them.
obj2.values

array([ 4,  7, -5,  3])

## 3. 创建 DataFrame

一个DataFrame表示一个表格，类似电子表格的数据结构，包含一个经过排序的列的集合，它们每一个都可以有不同的类型值（数字，字符串，布尔等等）。DataFrame有行和列的索引；它可以被看作是一个Series的字典（每个Series共享一个索引）。
因为DataFrame在内部把数据存储为一个二维数组的格式，因此你可以采用分层索引以表格格式来表示高维的数据。分层索引是后面章节的一个主题，并且是pandas中许多更先进的数据处理功能的关键因素。

用一个相等长度列表的字典或NumPy数组, 来构建一个DataFrame

由此产生的DataFrame和Series一样，它的索引会自动分配；下面的输出来自旧版 pandas，其中对列进行了排序（pandas 0.23 及以后版本在 Python 3.6+ 上会保持字典键的插入顺序）：

In [161]:
# Column-oriented input: each key is a column name, each list holds that
# column's five values.
data = {
    'state': ['Ohio'] * 3 + ['Nevada'] * 2,
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}
frame = pd.DataFrame(data=data)
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


如果你设定了一个列的顺序，DataFrame的列将会精确的按照你所传递的顺序排列：

In [162]:
# columns= fixes the column order explicitly.
pd.DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [163]:
# As with Series, a requested column that is not present in data ('debt' here)
# appears in the result filled with NA values.
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt']) # arguments: data, columns=[...], index=[...]
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,


In [164]:
# Replace the default integer row labels with string labels (in place).
frame2.index=['one','two','three','four','five']
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


### DataFrame 检索

### 列检索
和Series一样，在DataFrame中的一列可以通过字典记法或属性来检索：

注意，返回的Series包含和DataFrame相同的索引，并且它们的 name 属性也被正确地设置了。

In [165]:
# A column can be read as an attribute or with dict-style brackets;
# either way the result is a Series named after the column.
print(frame.year,end='\n\n')
print(frame['state'], end='\n\n')
print(type(frame['state']))


0    2000
1    2001
2    2002
3    2001
4    2002
Name: year, dtype: int64

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
Name: state, dtype: object

<class 'pandas.core.series.Series'>


### 行检索

行也可以使用一些方法通过位置或名字来检索，例如 loc, iloc

索引DataFrame时返回的列是底层数据的一个视窗，而不是一个拷贝。因此，任何在Series上的就地修改都会影响DataFrame。列可以使用Series的 copy 函数来显式的拷贝。

In [175]:
# iloc selects by integer position: row 0 comes back as a Series whose index
# is the column names and whose name is the row label.
row = frame2.iloc[0]
print(type(row))
row

<class 'pandas.core.series.Series'>


year     2000
state    Ohio
pop       1.5
debt      NaN
Name: one, dtype: object

In [167]:
# loc selects by row label.
frame2.loc['four']

year       2001
state    Nevada
pop         2.4
debt        NaN
Name: four, dtype: object

loc 和 iloc 都可以获取多行数据

In [168]:
# Positional slice with step -1: every row, in reverse order.
frame2.iloc[::-1]

Unnamed: 0,year,state,pop,debt
five,2002,Nevada,2.9,
four,2001,Nevada,2.4,
three,2002,Ohio,3.6,
two,2001,Ohio,1.7,
one,2000,Ohio,1.5,


In [169]:
# Label slice: from row 'four' through the last row.
frame2.loc['four':]

Unnamed: 0,year,state,pop,debt
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


loc扩展——索引某行某列

In [170]:
# Row label plus a list of column labels: selected cells of a single row.
frame2.loc['four',['year','pop']]

year    2001
pop      2.4
Name: four, dtype: object

 loc扩展——索引某列

In [173]:
# ':' keeps every row; the list picks the columns.
frame2.loc[:,['year','pop']]

Unnamed: 0,year,pop
one,2000,1.5
two,2001,1.7
three,2002,3.6
four,2001,2.4
five,2002,2.9


给一个不存在的列赋值，将会创建一个新的列。

In [171]:
# Assigning to a column name that does not exist creates it; here it holds a
# boolean that is True where state equals 'Ohio'.
frame2['eastern'] = frame2.state == 'Ohio'
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,,False
five,2002,Nevada,2.9,,False


像字典一样 del 关键字将会删除列

In [172]:
# del removes the column from frame2 in place, as with a dict key.
del frame2['eastern']
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


### values

In [178]:
# The data as a 2-D NumPy array; the columns have mixed types, so the array
# dtype is object.
frame2.values

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, nan],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, nan],
       [2002, 'Nevada', 2.9, nan]], dtype=object)

## 4. 索引对象

pandas的索引对象用来保存坐标轴标签和其它元数据（如坐标轴名或名称）。构建一个Series或DataFrame时任何数组或其它序列标签在内部转化为索引：

In [188]:
# The list of labels passed as index= is converted to an Index object internally.
obj = pd.Series(range(3), index=['a', 'b', 'c'])
obj

a    0
b    1
c    2
dtype: int64

In [192]:
# Slicing an Index returns another Index.
print(obj.index)
print(obj.index[1:])

Index(['a', 'b', 'c'], dtype='object')
Index(['b', 'c'], dtype='object')


索引对象的不可变性非常重要，这样它可以在数据结构中安全的共享

In [194]:
# Index objects are immutable, so users cannot change them: item assignment
# raises TypeError. The error is the point of this cell, so catch it and print
# it; an uncaught exception would stop a "Run All" of the notebook here.
try:
    obj.index[1] = 'aa'
except TypeError as err:
    print(f'{type(err).__name__}: {err}')

TypeError: Index does not support mutable operations

## 5. 重要的功能
### 重新索引

意味着使数据符合一个新的索引来构造一个新的对象

In [200]:
# Rebind `obj` to a float Series whose labels are deliberately out of
# alphabetical order, as input for the reindex examples.
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [201]:
# reindex returns a new Series arranged by the given labels; 'e' does not
# exist in obj, so it is added with NaN. (This rebinds the name obj2.)
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [202]:
# fill_value replaces the NaN that would otherwise be used for new labels.
obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

### 重新索引时的 内插或填充

对于 Series 来说

In [206]:
# Sparse integer labels (0, 2, 4) leave gaps for the fill example below.
# (This rebinds the name obj3.)
obj3 = pd.Series(['blue','purple','yellow'], index=[0,2,4])
print(obj3, end='\n\n')

0      blue
2    purple
4    yellow
dtype: object



In [211]:
# method='ffill' fills each new label forward from the previous existing one;
# 'bfill' fills backward instead.
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

对于 DataFrame 来说

In [216]:
# 3x3 grid holding 0..8, with row labels a/c/d ('b' is left out on purpose)
# and one column per state.
frame = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    index=list('acd'),
    columns=['Ohio', 'Texas', 'California'],
)
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [231]:
# Reindexing the rows adds 'b' as an all-NaN row, which is why the values are
# shown as floats. The trailing comment is the forward-fill variant.
frame.reindex(['a', 'b', 'c', 'd']) # frame.reindex(['a', 'b', 'c', 'd'], method='ffill')

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


使用 columns 关键字可以对列重新索引

In [226]:
# Reindex along the columns: 'Ohio' is dropped and the new 'Utah' column is
# empty.
frame.reindex(columns=['Texas','Utah','California'])

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


## 6. 实用查询

In [234]:
# Load the passenger table with the public top-level reader rather than going
# through the pd.io.parsers module path.
# NOTE: this rebinds `data`, which earlier in the notebook was the dict used to
# build `frame`.
data = pd.read_csv('./train.csv')

In [235]:
# First five rows: a quick look at the columns and typical values.
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [236]:
# Last five rows.
data.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [237]:
# Per-column dtype and non-null count; shows which columns have missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [239]:
# Keep all rows but only three columns, then preview the first five rows.
data.loc[:,['Survived','Name','Sex']].head()

Unnamed: 0,Survived,Name,Sex
0,0,"Braund, Mr. Owen Harris",male
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female
2,1,"Heikkinen, Miss. Laina",female
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female
4,0,"Allen, Mr. William Henry",male


### 条件查询

In [248]:
# Boolean mask on a single column: rows whose Age is below 20.
data[data['Age']<20].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S
14,15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14.0,0,0,350406,7.8542,,S
16,17,0,3,"Rice, Master. Eugene",male,2.0,4,1,382652,29.125,,Q


### 多条件查询

In [251]:
# Combine two masks with & (element-wise AND); each comparison is wrapped in
# parentheses because & binds tighter than < and ==.
data[(data['Age']<20) & (data['Sex']=='female')].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S
14,15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14.0,0,0,350406,7.8542,,S
22,23,1,3,"McGowan, Miss. Anna ""Annie""",female,15.0,0,0,330923,8.0292,,Q
24,25,0,3,"Palsson, Miss. Torborg Danira",female,8.0,3,1,349909,21.075,,S


## 7. 实用统计

In [253]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [260]:
# Group passengers by sex and ticket class.
group = data.groupby(['Sex', 'Pclass'])
# Average only the numeric columns. The table also has text columns
# (Name, Ticket, Cabin, Embarked); recent pandas raises on those instead of
# silently dropping them, so numeric_only must be requested explicitly.
group.mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,PassengerId,Survived,Age,SibSp,Parch,Fare
Sex,Pclass,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
female,1,469.212766,0.968085,34.611765,0.553191,0.457447,106.125798
female,2,443.105263,0.921053,28.722973,0.486842,0.605263,21.970121
female,3,399.729167,0.5,21.75,0.895833,0.798611,16.11881
male,1,455.729508,0.368852,41.281386,0.311475,0.278689,67.226127
male,2,447.962963,0.157407,30.740707,0.342593,0.222222,19.741782
male,3,455.51585,0.135447,26.507589,0.498559,0.224784,12.661633


In [266]:
# groupby returns a lazy GroupBy object; nothing is computed until an
# aggregation is called on it.
type(group)

pandas.core.groupby.DataFrameGroupBy

In [272]:
# Mean and median per sex for every numeric column except Age.
# The numeric columns are selected explicitly because recent pandas no longer
# drops text columns silently during aggregation, and the aggregators are given
# by name because passing np.mean / np.median callables is deprecated.
numeric_cols = data.select_dtypes(include='number').columns.drop('Age')
data.groupby('Sex')[list(numeric_cols)].agg(['mean', 'median'])

Unnamed: 0_level_0,PassengerId,PassengerId,Survived,Survived,Pclass,Pclass,SibSp,SibSp,Parch,Parch,Fare,Fare
Unnamed: 0_level_1,mean,median,mean,median,mean,median,mean,median,mean,median,mean,median
Sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
female,431.028662,414.5,0.742038,1,2.159236,2,0.694268,0,0.649682,0,44.479818,23.0
male,454.147314,464.0,0.188908,0,2.389948,3,0.429809,0,0.235702,0,25.523893,10.5


In [265]:
# Look up one group in the aggregated table. The rows carry a (Sex, Pclass)
# MultiIndex, so the key is passed as a single tuple; a bare `'female', 1`
# could be read as (row label, column label). numeric_only=True avoids the
# error recent pandas raises when averaging text columns.
group.mean(numeric_only=True).loc[('female', 1)]

PassengerId    469.212766
Survived         0.968085
Age             34.611765
SibSp            0.553191
Parch            0.457447
Fare           106.125798
Name: (female, 1), dtype: float64