In [1]:
#Series as one-dim array
import pandas as pd
data = pd.Series([0.25, 0.5, 0.75, 1.0],index=['a', 'b', 'c', 'd'])

In [2]:
data['a':'c'],data[0:2],data[(data>0.3)&(data<0.8)]

(a    0.25
 b    0.50
 c    0.75
 dtype: float64, a    0.25
 b    0.50
 dtype: float64, b    0.50
 c    0.75
 dtype: float64)

In [4]:
#fancy indexing
data[['a','d']]

a    0.25
d    1.00
dtype: float64

# indexers:loc,iloc,ix

In [5]:
#some confusions
data = pd.Series(['a','b','c'],index=[1,3,5])
data

1    a
3    b
5    c
dtype: object

In [6]:
#explicit index when indexing
data[1]

'a'

In [7]:
#implicit index when slicing
data[1:3]

3    b
5    c
dtype: object

因为整数索引可能造成这样的迷惑，pandas提供了特殊的indexers属性的索引

In [8]:
#loc属性关联到显式的索引
data.loc[1]

'a'

In [9]:
data.loc[1:3]

1    a
3    b
dtype: object

In [10]:
#iloc属性关联到隐式的索引
data.iloc[1]

'b'

In [11]:
data.iloc[1:3]

3    b
5    c
dtype: object

# Data Selection in DataFrame

记住可以从两种角度看待dataframe，一种是二维数组，另一种是由Series构成的字典

In [13]:
#DataFrame as a dictionary
area = pd.Series({'California': 423967, 'Texas': 695662,
                'New York': 141297, 'Florida': 170312,
                'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
                'New York': 19651127, 'Florida': 19552860,
                'Illinois': 12882135})
data = pd.DataFrame({'area':area, 'pop':pop})
data

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [15]:
data['area'],data.area

(California    423967
 Texas         695662
 New York      141297
 Florida       170312
 Illinois      149995
 Name: area, dtype: int64, California    423967
 Texas         695662
 New York      141297
 Florida       170312
 Illinois      149995
 Name: area, dtype: int64)

In [16]:
data['density'] = data['pop']/data['area']

In [17]:
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [18]:
#DataFrame as two-dimensional array

In [19]:
data.values

array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01],
       [6.95662000e+05, 2.64481930e+07, 3.80187404e+01],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02],
       [1.70312000e+05, 1.95528600e+07, 1.14806121e+02],
       [1.49995000e+05, 1.28821350e+07, 8.58837628e+01]])

In [20]:
data.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
area,423967.0,695662.0,141297.0,170312.0,149995.0
pop,38332520.0,26448190.0,19651130.0,19552860.0,12882140.0
density,90.41393,38.01874,139.0767,114.8061,85.88376


In [21]:
#隐式索引 position based indexing
data.iloc[:3,:2]

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127


In [22]:
#显式索引 label based indexing
data.loc[:'Illinois',:'pop']

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [23]:
#The ix indexer allows a hybrid of these two approaches:
data.ix[:3,:'pop']

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127


In [24]:
data.loc[data.density>100,['pop','density']]

Unnamed: 0,pop,density
New York,19651127,139.076746
Florida,19552860,114.806121


# Operating on Data in Pandas

# Ufuncs:index preservation

NumPy ufunc will work on Pandas Series and DataFrame objects
我们可以对Series和DataFrame执行numpy的函数，返回的是另外的对象但是索引被保存下来

In [26]:
import pandas as pd
import numpy as np
rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(0,10,4))

In [27]:
df = pd.DataFrame(rng.randint(0,10,(3,4)),columns=['a','b','c','d'])

In [28]:
np.exp(ser)

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [29]:
np.sin(df+4)

Unnamed: 0,a,b,c,d
0,-0.544021,0.420167,-0.279415,-0.544021
1,-0.99999,0.989358,0.656987,-0.99999
2,-0.99999,-0.279415,0.412118,0.989358


## UFuncs: Index Alignment

对二元操作来说，pandas会将索引自动对齐，这在数据量不全的时候很有帮助。

In [30]:
#UFuncs: Index Alignment
area = pd.Series({'Alaska': 1723337, 'Texas': 695662,
                    'California': 423967}, name='area')
population = pd.Series({'California': 38332521, 'Texas': 26448193,
                    'New York': 19651127}, name='population')
population/area

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [31]:
A = pd.Series([2, 4, 6], index=[0, 1, 2]) 
B = pd.Series([1, 3, 5], index=[1, 2, 3]) 
A+B

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

如果想要填充空值，可以使用对象方法来替代操作符

In [32]:
A.add(B,fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

In [33]:
A = pd.DataFrame(rng.randint(0, 20, (2, 2)),
                columns=list('AB'))
B = pd.DataFrame(rng.randint(0, 10, (3, 3)),
                columns=list('BAC'))
A+B

Unnamed: 0,A,B,C
0,1.0,15.0,
1,13.0,6.0,
2,,,


In [34]:
fill = A.stack().mean()
fill

4.5

In [35]:
A.add(B,fill_value=fill)

Unnamed: 0,A,B,C
0,1.0,15.0,13.5
1,13.0,6.0,4.5
2,6.5,13.5,10.5


# python操作符和对应的pandas对象方法

## Ufuncs: Operations Between DataFrame and Series

In [47]:
#根据broadcasting rules,subtraction between a two-dimensional array and one of its rows is applied row-wise
A = rng.randint(10, size=(3, 4))
A-A[0]

array([[2, 2, 0, 4],
       [9, 6, 9, 8],
       [6, 8, 7, 1]])

In [45]:
#In Pandas, the convention similarly operates row-wise by default
df = pd.DataFrame(A, columns=list('QRST'))
df - df.iloc[0]
#if you would like to operate column-wise,axis = 0
df.sub(df['R'],axis=0)

Unnamed: 0,Q,R,S,T
0,8,0,8,6
1,1,0,-7,0
2,5,0,-2,5


In [48]:
df

Unnamed: 0,Q,R,S,T
0,8,0,8,6
1,8,7,0,7
2,7,2,0,7


In [50]:
half = df.iloc[0,::2]
half

Q    8
S    8
Name: 0, dtype: int64

In [51]:
df -half

Unnamed: 0,Q,R,S,T
0,0.0,,0.0,
1,0.0,,-8.0,
2,-1.0,,-8.0,


# Handling Missing Data

## None: Pythonic missing data

In [None]:
#None is a object