In [11]:
import pandas as pd
df = pd.read_csv('https://bml-data.s3.amazonaws.com/realtor_importio_cleaned.csv', index_col=False, encoding='utf8')
df = df[['address', 'bedrooms', 'full_bathrooms', 'half_bathrooms', 'type', 'size_sqft', 'lot_size', 'price']] # only keep the columns we're interested in and reorder them to have the price (target output) at the end
# df.drop_duplicates(inplace=True) # remove duplicates in case there are any

In [14]:
df.shape

(4776, 8)

In [13]:
df[df.duplicated()]

Unnamed: 0,address,bedrooms,full_bathrooms,half_bathrooms,type,size_sqft,lot_size,price


In [15]:
df[df.duplicated(subset=['size_sqft'])]

Unnamed: 0,address,bedrooms,full_bathrooms,half_bathrooms,type,size_sqft,lot_size,price
23,8095 Foxland St Las Vegas NV 89131,5,3,1,Single Family Home,4100,20037.6,479900
35,905 Boulder Springs Dr Unit 201 Las Vegas NV 8...,2,2,0,Condo/Townhome/Row Home/Co-Op,1136,,89000
39,3104 Cypress Ave Las Vegas NV 89030,3,2,0,Mfd/Mobile Home,980,5227.0,70000
89,4909 Braeburn Dr Las Vegas NV 89130,3,2,0,Single Family Home,1712,6969.0,189000
116,7936 Meandering Light Ave Las Vegas NV 89131,4,3,0,Single Family Home,2220,4356.0,239000
119,2777 Paradise Rd Unit 2903 Las Vegas NV 89109,2,2,1,Condo/Townhome/Row Home/Co-Op,2195,,499000
130,615 Franklin Ave Las Vegas NV 89104,3,2,0,Single Family Home,1510,7405.0,200000
131,10106 Quilt Tree St Las Vegas NV 89183,3,2,1,Condo/Townhome/Row Home/Co-Op,1510,2178.0,159900
147,4902 Perrone Ave Las Vegas NV 89141,3,2,0,Single Family Home,1405,4356.0,180000
215,5880 Bunch St Las Vegas NV 89122,8,4,0,Single Family Home,4265,35283.6,229000


In [3]:
df.shape

(4776, 8)

In [4]:
df.head()

Unnamed: 0,address,bedrooms,full_bathrooms,half_bathrooms,type,size_sqft,lot_size,price
0,6812 Mystic Plain Ct Las Vegas NV 89149,3,2,1,Single Family Home,1992,6098.0,220000
1,3416 Goldyke St Las Vegas NV 89115,3,2,0,Mfd/Mobile Home,1782,8276.0,60000
2,9222 Cowboy Rain Dr Las Vegas NV 89178,3,3,1,Single Family Home,1864,2178.0,173000
3,845 Trotter Cir Las Vegas NV 89107,5,4,1,Single Family Home,4608,15246.0,600000
4,10914 Iris Canyon Ln Las Vegas NV 89135,4,4,1,Single Family Home,3951,13503.6,799000


In [5]:
df.count()

address           4776
bedrooms          4776
full_bathrooms    4776
half_bathrooms    4776
type              4776
size_sqft         4776
lot_size          4058
price             4776
dtype: int64

In [6]:
# Discard rows that lack any of the core numeric features, then keep only the
# listings whose living area is strictly positive.
df = df.dropna(subset=['bedrooms', 'full_bathrooms', 'size_sqft'], how='any')
df = df.loc[df['size_sqft'] > 0]

In [7]:
df.count()

address           4776
bedrooms          4776
full_bathrooms    4776
half_bathrooms    4776
type              4776
size_sqft         4776
lot_size          4058
price             4776
dtype: int64

In [8]:
min_size_sqft = df['size_sqft'].min()
min_size_sqft

325.0

In [9]:
df['lot_size'].min()

435.0

In [10]:
df[:5]

Unnamed: 0,address,bedrooms,full_bathrooms,half_bathrooms,type,size_sqft,lot_size,price
0,6812 Mystic Plain Ct Las Vegas NV 89149,3,2,1,Single Family Home,1992,6098.0,220000
1,3416 Goldyke St Las Vegas NV 89115,3,2,0,Mfd/Mobile Home,1782,8276.0,60000
2,9222 Cowboy Rain Dr Las Vegas NV 89178,3,3,1,Single Family Home,1864,2178.0,173000
3,845 Trotter Cir Las Vegas NV 89107,5,4,1,Single Family Home,4608,15246.0,600000
4,10914 Iris Canyon Ln Las Vegas NV 89135,4,4,1,Single Family Home,3951,13503.6,799000


### duplicated 问题


In [16]:
# Toy frame with repeated (key1, key2) pairs, used below to explore duplicated().
data = {
    'key1': [1, 2, 3, 1, 2, 3, 2, 2],
    'key2': [2, 2, 1, 2, 2, 4, 2, 2],
    'data': [5, 6, 2, 6, 1, 6, 2, 8],
}
frame = pd.DataFrame(data, columns=['key1', 'key2', 'data'])
# print() with a single argument behaves the same on Python 2 and Python 3.
print(frame)

   key1  key2  data
0     1     2     5
1     2     2     6
2     3     1     2
3     1     2     6
4     2     2     1
5     3     4     6
6     2     2     2
7     2     2     8


In [17]:
frame[frame.duplicated(['key1','key2'])]

Unnamed: 0,key1,key2,data
3,1,2,6
4,2,2,1
6,2,2,2
7,2,2,8


发现duplicated的函数并不是自己开始理解的样子；

一开始以为是过滤所有重复的row，只返回第一条记录。（与drop_duplicates不一样）

真实的意义是：返回所有重复的row，但drop掉每组重复中的第一条记录。

frame.duplicated(['key1','key2'], take_last=True)

的话，就是drop掉每组重复中的最后一条记录。

In [20]:
frame[frame.duplicated(['key1','key2'], take_last=True)]

Unnamed: 0,key1,key2,data
0,1,2,5
1,2,2,6
4,2,2,1
6,2,2,2


使用or就能得到所有重复的row了：

我们最希望的当然是frame.duplicated(['key1','key2'])能够直接得出（pandas 0.17及之后的版本可以使用 keep=False 参数直接标记所有重复的row）。

In [21]:
frame[frame.duplicated(['key1','key2'], take_last=True) | frame.duplicated(['key1','key2'])]

Unnamed: 0,key1,key2,data
0,1,2,5
1,2,2,6
3,1,2,6
4,2,2,1
6,2,2,2
7,2,2,8


在得到所有重复的row后，直接使用groupby就能得到重复key的最小集合了。

实例中的min()和max()只是基于data来获取对应的代表row

In [22]:
# Select every row whose (key1, key2) pair occurs more than once, then keep the
# smallest value per pair. The group keys are passed as a list because newer
# pandas releases interpret a tuple as one single key.
frame[frame.duplicated(['key1', 'key2'], take_last=True) | frame.duplicated(['key1', 'key2'])].groupby(['key1', 'key2']).min()

Unnamed: 0_level_0,Unnamed: 1_level_0,data
key1,key2,Unnamed: 2_level_1
1,2,5
2,2,1


In [23]:
# Select every row whose (key1, key2) pair occurs more than once, then keep the
# largest value per pair. The group keys are passed as a list because newer
# pandas releases interpret a tuple as one single key.
frame[frame.duplicated(['key1', 'key2'], take_last=True) | frame.duplicated(['key1', 'key2'])].groupby(['key1', 'key2']).max()

Unnamed: 0_level_0,Unnamed: 1_level_0,data
key1,key2,Unnamed: 2_level_1
1,2,6
2,2,8


### 再来看看DataFrame.drop_duplicates()
关键的参数还是subset，take_last和inplace

In [24]:
frame

Unnamed: 0,key1,key2,data
0,1,2,5
1,2,2,6
2,3,1,2
3,1,2,6
4,2,2,1
5,3,4,6
6,2,2,2
7,2,2,8


In [25]:
frame1 = frame.copy()
# 得到frame的一个独立copy

In [26]:
frame1.drop_duplicates(inplace=True)
# By default all columns are compared, and no row here is a full duplicate of another

In [27]:
frame1

Unnamed: 0,key1,key2,data
0,1,2,5
1,2,2,6
2,3,1,2
3,1,2,6
4,2,2,1
5,3,4,6
6,2,2,2
7,2,2,8


In [28]:
frame1.drop_duplicates(inplace=True, subset=['key1', 'key2'])

In [29]:
frame1

Unnamed: 0,key1,key2,data
0,1,2,5
1,2,2,6
2,3,1,2
5,3,4,6


所有重复行使用第一个记录作为代表被保留；

当设置take_last为True的时候，将使用最后一条记录作为代表。

### 数据的筛选，过滤和清理中其他用法
按自己实际项目中出现的顺序进行收集整理
##### drop()
去除处理过后不需要的列，axis参数很重要，不指定的话，默认是删除row的

In [30]:
frame1.drop(['key1'], axis=1, inplace=True)

In [31]:
frame1

Unnamed: 0,key2,data
0,2,5
1,2,6
2,1,2
5,4,6


##### dropna()
把存在空值的row都删除

In [34]:
# Add a row that has no 'data' value so that there is a NaN row to drop below.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
frame2 = pd.concat([frame1, pd.DataFrame({'key2':[1]})])

In [35]:
frame2

Unnamed: 0,data,key2
0,5.0,2
1,6.0,2
2,2.0,1
5,6.0,4
0,,1


In [36]:
frame2.dropna()

Unnamed: 0,data,key2
0,5,2
1,6,2
2,2,1
5,6,4


In [38]:
import numpy as np

##### dtypes的处理
读入数据后最重要的操作是明确好所输入数据的所有dtypes；

只有明确好dtypes后才能继续各种数据的运算或者比较操作，要不然很容易出现意外的结果并且难以察觉，例如string 的0 与数字的0去做比较

The main types stored in pandas objects are **float, int, bool, datetime64[ns], timedelta[ns] and object**. 

In addition these dtypes have item sizes, e.g. int64 and int32. 

A convenient `dtypes` attribute for DataFrames returns a Series with the data type of each column.

In [39]:
dft = pd.DataFrame(dict(A = np.random.rand(3),
                        B = 1,
                        C = 'foo',
                        D = pd.Timestamp('20010102'),
                        E = pd.Series([1.0]*3).astype('float32'),
                        F = False,
                        G = pd.Series([1]*3,dtype='int8')))

In [40]:
dft.dtypes

A           float64
B             int64
C            object
D    datetime64[ns]
E           float32
F              bool
G              int8
dtype: object

In [41]:
dft

Unnamed: 0,A,B,C,D,E,F,G
0,0.981735,1,foo,2001-01-02,1,False,1
1,0.542634,1,foo,2001-01-02,1,False,1
2,0.364084,1,foo,2001-01-02,1,False,1


By default integer types are int64 and float types are float64, **REGARDLESS of platform (32-bit or 64-bit)**. The following will all result in int64 dtypes.

In [42]:
pd.DataFrame([1, 2], columns=['a']).dtypes

a    int64
dtype: object

当一个列中的数据包含多种dtypes的时候，pandas会选择最合适的一个去表示这个列的dtypes。

一般会是object

In [43]:
pd.DataFrame([1, 2, 'abc'], columns=['a']).dtypes

a    object
dtype: object

In [44]:
pd.DataFrame([1, 2, 1.], columns=['a']).dtypes

a    float64
dtype: object

In [46]:
test = pd.DataFrame([1, 2, np.NaN], columns=['a'])
test.dtypes

a    float64
dtype: object

以上，有NaN的情况下，dtypes不是int64

使用**astype**进行dtypes的显性切换，默认返回的是一个copy。

In [47]:
test.astype('float32').dtypes

a    float32
dtype: object

In [49]:
test.dtypes
# test 本身不会改变，因为返回的是一个test的copy

a    float64
dtype: object

In [51]:
# 项目中的做法一般是：
test = test.astype('float32')

In [52]:
test.astype('int32').dtypes
# Converting a column that contains NA values to an integer dtype raises the exception below
# See http://stackoverflow.com/questions/21287624/pandas-dataframe-column-type-conversion

ValueError: Cannot convert NA to integer

使用**convert_objects()**转换df中的object dtypes

To force conversion of specific types that are number like, e.g. could be a string that represents a number, pass convert_numeric=True. This will force strings and numbers alike to be numbers if possible, otherwise they will be set to np.nan.

In [54]:
df = pd.DataFrame(dict(A = ['1','2','3'],
                        B = ['1.0','2','3']
                        ))

In [55]:
df.dtypes

A    object
B    object
dtype: object

DataFrame.convert_objects(**convert_dates=True, convert_numeric=False, convert_timedeltas=True, copy=True**)

In [58]:
df.convert_objects().dtypes

A    object
B    object
dtype: object

In [59]:
df.convert_objects(convert_numeric=True).dtypes

A      int64
B    float64
dtype: object

**很多时候当从文件中读入数据的时候，有些数据的类型已经转好了，这个过程是随机的还是有什么默认设置？**

**api说明：**

dtype: A data type name or a dict of column name to data type. If not specified, data types will be **inferred**. (Unsupported with engine='python')

注意，该方法返回同样是个copy

To force conversion to datetime64[ns], pass **convert_dates='coerce'**. This will convert any datetime-like object to dates, forcing other values to **NaT**. This might be useful if you are reading in data which is mostly dates, but occasionally has non-dates intermixed and you want to represent as missing.

强制转换，不符合者变为NaT

In [60]:
import datetime
s = pd.Series([datetime.datetime(2001,1,1,0,0),
                  'foo', 1.0, 1, pd.Timestamp('20010104'),
                  '20010105'], dtype='O')

In [61]:
s

0    2001-01-01 00:00:00
1                    foo
2                      1
3                      1
4    2001-01-04 00:00:00
5               20010105
dtype: object

In [62]:
s.convert_objects().dtypes

dtype('O')

In [63]:
s.convert_objects(convert_dates='coerce').dtypes

dtype('<M8[ns]')

In [64]:
ss = s.convert_objects(convert_dates='coerce')
ss

0   2001-01-01
1          NaT
2          NaT
3          NaT
4   2001-01-04
5   2001-01-05
dtype: datetime64[ns]

#### 列内容的转换
* 例如截取某段string作为该cell的内容
* 通过某些操作返回的结果
* 由其他列的分裂而来（Data Input 里面str split的例子）

In [69]:
# 使用str的相关操作进行数据的整理
df = pd.DataFrame(dict(A = ['uid=92872850', 'uid=92872250'],
                      B = ['client_type=2', 'client_type=4']))

In [70]:
df

Unnamed: 0,A,B
0,uid=92872850,client_type=2
1,uid=92872250,client_type=4


In [71]:
df.A = df.A.str.split('uid=').str[1]
df

# 使用map方法更加好

Unnamed: 0,A,B
0,92872850,client_type=2
1,92872250,client_type=4


In [72]:
df.B = df.B.str.split('client_type=').str[1]
df

Unnamed: 0,A,B
0,92872850,2
1,92872250,4


In [73]:
df.dtypes

A    object
B    object
dtype: object

In [75]:
df = df.convert_objects(convert_numeric=True)
df.dtypes

A    int64
B    int64
dtype: object

##### 使用apply()针对每一行（axis=1）或每一列进行相应操作并获取返回结果


In [77]:
uid_location_map = {92872850: 'SG',
                   92872250: 'TH'}

In [80]:
def get_location_from_uid(row):
    """Map a row's uid (column ``A``) to a location code.

    The uid is looked up in the module-level ``uid_location_map``.

    Parameters
    ----------
    row : object exposing an ``A`` attribute (e.g. a row passed by
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    The mapped location, or ``'OT'`` when the uid is falsy or is not a key
    of ``uid_location_map``.
    """
    if not row.A:
        return 'OT'
    # .get() keeps an unmapped uid from raising KeyError in the middle of apply().
    return uid_location_map.get(row.A, 'OT')

In [81]:
df['location'] = df.apply(get_location_from_uid, axis=1)

In [82]:
df

Unnamed: 0,A,B,location
0,92872850,2,SG
1,92872250,4,TH


apply 除了接受整个row作为参数，还可以作用于指定的列

In [83]:
# Lookup table from a numeric type id to its label.
type_id_map = dict([(2, 'hehe'), (4, 'haha')])

def get_type_from_id (id):
    """Return the label stored in ``type_id_map`` for ``id``.

    Raises KeyError when ``id`` is not a key of the table.
    """
    label = type_id_map[id]
    return label

In [85]:
pd.__version__

'0.16.2'

In [86]:
df = pd.DataFrame({'one' : pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
                       'two' : pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
                       'three' : pd.Series(np.random.randn(3), index=['b', 'c', 'd'])})
df
    

Unnamed: 0,one,three,two
a,-2.216926,,-0.386952
b,0.964506,-0.0234,-2.034587
c,-1.045282,-0.617807,0.065374
d,,-1.3595,-0.370013


In [87]:
df.apply(np.mean)

one     -0.765900
three   -0.666902
two     -0.681545
dtype: float64

In [88]:
df.apply(np.mean, axis=1)

a   -1.301939
b   -0.364493
c   -0.532572
d   -0.864756
dtype: float64

In [89]:
df.apply(lambda x: x.max() - x.min())

one      3.181432
three    1.336100
two      2.099961
dtype: float64

In [90]:
df.apply(np.cumsum)

Unnamed: 0,one,three,two
a,-2.216926,,-0.386952
b,-1.25242,-0.0234,-2.421539
c,-2.297701,-0.641207,-2.356166
d,,-2.000707,-2.726179


In [91]:
df.apply(np.exp)

Unnamed: 0,one,three,two
a,0.108944,,0.679124
b,2.623492,0.976872,0.130734
c,0.351593,0.539125,1.067558
d,,0.256789,0.690725


In [92]:
tsdf = pd.DataFrame(np.random.randn(1000, 3), columns=['A', 'B', 'C'],
                        index=pd.date_range('1/1/2000', periods=1000))

In [93]:
tsdf.head()

Unnamed: 0,A,B,C
2000-01-01,-0.200289,-0.1219,-0.614935
2000-01-02,-1.078862,0.881257,-0.13982
2000-01-03,1.498125,-1.245906,0.370689
2000-01-04,0.111276,0.668996,-0.142048
2000-01-05,-2.186422,-0.189576,0.399198


In [95]:
tsdf.A.max()

3.187539164061755

In [96]:
tsdf.apply(lambda x: x.idxmax())

A   2002-06-18
B   2000-11-12
C   2001-09-11
dtype: datetime64[ns]

In [97]:
tsdf.A['2002-06-18']

3.187539164061755

In [98]:
tsdf.idxmax()

A   2002-06-18
B   2000-11-12
C   2001-09-11
dtype: datetime64[ns]

In [99]:
tsdf.A.idxmax()

Timestamp('2002-06-18 00:00:00', offset='D')

In [100]:
t = pd.Series({'six' : 6., 'seven' : 7.})

In [101]:
t

seven    7
six      6
dtype: float64

#### Function application 专题
从上面的apply引入，apply的处理对象一般是逐行或者逐列；

而在pandas中，有其他的方式应对其他不同的情形，这里作个总结：

* Tablewise Function Application: pipe()
* Row or Column-wise Function Application: apply()
* Elementwise function application: applymap()

##### pipe() 用于整个df作为参数的情况

一般用于简化方法链，项目中用得较少，暂略。

##### apply() 

具体的用法上面基本都覆盖了。

##### applymap()

Since not all functions can be vectorized (accept NumPy arrays and return another array or value), the methods applymap() on DataFrame and analogously map() on Series accept any Python function taking a single value and returning a single value. For example:

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.DataFrame({'one' : pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
                       'two' : pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
                       'three' : pd.Series(np.random.randn(3), index=['b', 'c', 'd'])})

In [3]:
df

Unnamed: 0,one,three,two
a,0.023591,,0.551347
b,1.381978,1.704238,-1.064359
c,-0.965902,-0.526729,0.902722
d,,-0.794192,0.65817


In [4]:
f = lambda x: len(str(x))

In [5]:
df.one.map(f)

a    15
b    13
c    15
d     3
Name: one, dtype: int64

In [6]:
df.apply(f)

# 每一行或者列得到一个return

one      81
three    83
two      81
dtype: int64

In [7]:
df.applymap(f)

# 每一个cell得到一个return

Unnamed: 0,one,three,two
a,15,3,14
b,13,13,13
c,15,15,14
d,3,15,14


In [11]:
df.map(f)

# df 并不能直接使用map()

AttributeError: 'DataFrame' object has no attribute 'map'

#### Lambda, filter, reduce and map 专题

虽然和Pandas没有直接联系，但是在数据处理方面还是很常用的。

##### Lambda

Lambda的作用可以参考： http://stackoverflow.com/questions/890128/why-python-lambdas-are-useful

一个很重要的原因是function可以作为一个object，因而可以作为一个传入参数进行操作

In [8]:
f = lambda x: x**2 + 2*x - 5

# The form is: lambda argument_list: expression
# The arguments come first (there may be several); the expression to evaluate follows the colon

In [9]:
f(2)

3

In [10]:
f2 = lambda x, y : x + y
f2(6,9)

15

In [12]:
f3 = lambda x: x % 3 == 0
f3(3)

True

In [14]:
# list() makes the result a concrete list on Python 3, where filter() is lazy;
# on Python 2 the value is unchanged.
mult3 = list(filter(f3, [1, 2, 3, 4, 5, 6, 7, 8, 9]))
mult3

[3, 6, 9]

##### map()

r = map(func, seq)

seq可以是list，输出new list

In [15]:
temp = (36.5, 37, 37.5,39)
# list() materialises the lazy object map() returns on Python 3; on Python 2
# the displayed list is unchanged.
list(map(f,temp))

[1400.25, 1438, 1476.25, 1594]

map可以接受多个list，前提是list的长度一样，结果仍然是相同长度的list

In [16]:
a = [1,2,3,4]
b = [17,12,11,10]
c = [-1,-4,5,9]
# Element-wise sum of a and b; list() materialises the lazy object map()
# returns on Python 3 so the cell still displays a list.
list(map(lambda x,y:x+y, a,b))

[18, 14, 14, 14]

In [17]:
# Element-wise sum of three lists; list() materialises the lazy object map()
# returns on Python 3 so the cell still displays a list.
list(map(lambda x,y,z:x+y+z, a,b,c))

[17, 10, 19, 23]

##### filter()

根据function返回true的条件进行过滤。

In [18]:
fib = [0,1,1,2,3,5,8,13,21,34,55]
# Keep the odd numbers (x % 2 is truthy). list() turns the lazy Python 3
# filter object into a list; print() with one argument works on Python 2 and 3.
result = list(filter(lambda x: x % 2, fib))
print(result)

[1, 1, 3, 5, 13, 21, 55]


In [19]:
# Keep the even numbers. list() turns the lazy Python 3 filter object into a
# list; print() with one argument works on Python 2 and 3.
result = list(filter(lambda x: x % 2 == 0, fib))
print(result)

[0, 2, 8, 34]


##### reduce() 

每两个进行折合，直到list结尾



In [20]:
# reduce() is a builtin only on Python 2; functools.reduce exists on both 2 and 3.
from functools import reduce

reduce(lambda x,y: x+y, [47,11,42,13])

113

In [21]:
# reduce() is a builtin only on Python 2; functools.reduce exists on both 2 and 3.
from functools import reduce

reduce(lambda x, y: x+y, range(1,101))

5050

#### value_counts()

一个方法直接返回所需查看的count，不需要自己每列每行单独的去count

在实际统计中十分好用，因为很多情况下需要统计某某类型的总数总量。


返回**Series**


In [22]:
data = np.random.randint(0, 7, size=50)

In [23]:
data

array([0, 1, 2, 3, 4, 2, 2, 0, 2, 6, 6, 2, 1, 5, 3, 0, 4, 1, 3, 2, 3, 0, 0,
       4, 5, 2, 5, 2, 4, 2, 4, 1, 2, 4, 1, 5, 0, 1, 4, 2, 4, 6, 0, 4, 0, 3,
       0, 3, 0, 3])

In [24]:
s = pd.Series(data)


In [25]:
s.value_counts()

2    11
0    10
4     9
3     7
1     6
5     4
6     3
dtype: int64

In [26]:
pd.value_counts(data)

# It can also be used as a function on regular arrays:

2    11
0    10
4     9
3     7
1     6
5     4
6     3
dtype: int64

In [27]:
s5 = pd.Series([1, 1, 3, 3, 3, 5, 5, 7, 7, 7])

In [29]:
s5.mode()

0    3
1    7
dtype: int64

In [39]:
df5 = pd.DataFrame({"A": np.random.randint(0, 7, size=50),
                    "B": np.random.randint(-10, 15, size=50)})

In [40]:
df5.head()

Unnamed: 0,A,B
0,3,-3
1,0,-8
2,5,-9
3,6,-1
4,2,13


In [41]:
df5.mode()

Unnamed: 0,A,B
0,4,6


In [45]:
df5.A.mode()

0    4
dtype: int32

In [43]:
df5.A.value_counts()

4    11
2    10
6     7
5     7
1     7
3     4
0     4
dtype: int64

In [38]:
df5.B.value_counts()

 6     4
 4     4
-8     3
 14    3
-4     3
-9     3
 9     3
-1     3
 13    2
 11    2
-7     2
-3     2
 10    2
 5     2
 0     2
-6     2
 2     2
 1     2
 8     1
 7     1
-5     1
-2     1
dtype: int64

In [44]:
df5.B.mode()

# Why is the result here not 4 and 6?
# NOTE(review): the B.value_counts() output shown above comes from In [38], which
# ran before df5 was regenerated with new random values in In [39], so it
# presumably describes different data than this call — re-run to confirm.

0    6
dtype: int32

In [4]:
import pandas as pd
import numpy as np

In [2]:
df = pd.DataFrame({'A': [1, 2, 1, 2, 1, 2, 3]})
df.mode()

Unnamed: 0,A
0,1
1,2


In [5]:
df1 = pd.DataFrame({"A": np.random.randint(0, 7, size=50),
                    "B": np.random.randint(-10, 15, size=50)})

In [38]:
x = df1.A.value_counts()

In [12]:
df1.B.value_counts()

 4     5
 1     4
 2     4
 14    4
 11    4
-9     3
-2     3
 3     3
-6     3
 7     2
-4     2
 13    2
 5     2
 0     1
 6     1
 8     1
 9     1
 12    1
-10    1
-3     1
-5     1
-1     1
dtype: int64

Similarly, you can get the **most** frequently occurring value(s) (the mode) of the values in a Series or DataFrame.

**Empty** if nothing has 2+ occurrences.

In [35]:
a = df1.mode()
a

Unnamed: 0,A,B
0,0,4


在实际统计中，value_counts()返回的是Series，如何使用这个Series？
#### 遍历
##### iteritems()
iteritems()可以遍历df或者series，需要注意的是每个遍历结果根据遍历的对象而定的：

Consistent with the dict-like interface, iteritems() iterates through key-value pairs:

* Series: (index, scalar value) pairs
* DataFrame: (column, Series) pairs
* Panel: (item, DataFrame) pairs

In [29]:
# print() with one argument behaves the same on Python 2 and Python 3.
for column, series in a.iteritems():
    print(column)
    print(series)

# Iterating over a DataFrame yields (column name, column Series) pairs

A
0    0
Name: A, dtype: int32
B
0    4
Name: B, dtype: int32


In [None]:
# The loop variable is deliberately not named `a`, so the DataFrame `a` that
# other cells iterate over is not overwritten by the last (index, value) pair.
for item in x.iteritems():
    print(item)
# Iterating over a Series yields (index, value) pairs
# This is the form mostly used for stats, because value_counts() returns a Series

##### iterrows() 

allows you to iterate through the rows of a DataFrame as Series objects. It returns an iterator yielding each index value along with a Series containing the data in each row.

* 只对df有效
* iter出来的row不会保存dtype信息
* 使用itertuples更快更好

In [36]:
# The loop variable is not named `x`, so the Series `x` that other cells rely on
# is not overwritten by the last (index, row) tuple.
for row in a.iterrows():
    print(row)

(0, A    0
B    4
Name: 0, dtype: int32)


In [None]:
# Example usage: the source already provides a CSV whose rows are to be inserted into a DB:
# NOTE(review): `location` is not defined anywhere in the visible notebook, and `data`
# must be a DataFrame with Date/Starting/Credit/Debit/Ending columns — confirm before running.

result = {}
for index, row in data.iterrows():
    result[row.Date] = {location: {'starting': row.Starting, 'credit': row.Credit, 'debit': row.Debit, 'ending': row.Ending}}

# Each row that comes back can be read directly by column name

##### itertuples()
The itertuples() method will return an iterator yielding a tuple for each row in the DataFrame. 

The first element of the tuple will be the row’s corresponding index value, while the remaining values are the row values.

In [37]:
# The loop variable is not named `x`, so the Series `x` inspected by the next
# cells is not overwritten by the last row tuple.
for row in a.itertuples():
    print(row)

(0, 0, 4)


In [39]:
type(x)

pandas.core.series.Series

itertuples()不能作用于series上：

In [40]:
# print() with one argument behaves the same on Python 2 and Python 3.
for y in x.itertuples():
    print(y)

AttributeError: 'Series' object has no attribute 'itertuples'

所以要遍历series的结果还是使用iteritems的好。

#### 节选columns或者rows

很多情况下需要节选：
* 删除满足某些条件的rows
* append df的时候只使用指定的几个columns
* to_csv等方法的时候只导出指定的几个columns
* ...

节选columns参考问题： http://stackoverflow.com/questions/11285613/selecting-columns

解决办法有两种：

* df1 = df[['a','b']]
* df1 = df.ix[:,0:2]

方法一可以直接使用column name， 直接返回一个copy, 对它的更改不会影响源df

方法二不需要提供column name， 返回的是一个view，当需要节省内存消耗的时候是个好选择。 如果也需要得到一个copy的话，可以使用df1 = df.ix[:,0:2].copy()

In [41]:
df = pd.DataFrame({"A": np.random.randint(0, 7, size=50),
                    "B": np.random.randint(-10, 15, size=50)})

In [44]:
df.head(3)

Unnamed: 0,A,B
0,5,8
1,3,-10
2,1,7


In [45]:
df1 = df[['A']]
df1.head(3)

Unnamed: 0,A
0,5
1,3
2,1


In [46]:
df2 = df.ix[:,0:1]
df2.head(3)

Unnamed: 0,A
0,5
1,3
2,1


In [47]:
df1._is_view

# df1 is a copy, not a view

False

In [50]:
df1.set_value(0, 'A', 10)
df1.head(3)

# Because df1 is a copy, updating a value only affects df1 itself and not the source df

Unnamed: 0,A
0,10
1,3
2,1


In [51]:
df.head(3)

Unnamed: 0,A,B
0,5,8
1,3,-10
2,1,7


In [48]:
df2._is_view

True

In [52]:
df2.set_value(0, 'A', 10)
df2.head(3)

Unnamed: 0,A
0,10
1,3
2,1


In [53]:
df.head(3)

Unnamed: 0,A,B
0,10,8
1,3,-10
2,1,7


给df的单个cell设置值可以参考： http://stackoverflow.com/questions/13842088/set-value-for-particular-cell-in-pandas-dataframe

#### Fast scalar value getting and setting

取纯value或者设置value的时候，官方推荐的方法是使用at或者iat.

at与iat的区别和loc与iloc的区别是一样的：一个是可以基于label取值一个可以基于int位置取值

Similarly to loc, at provides label based scalar lookups, while, iat provides integer based lookups analogously to iloc

官方link： http://pandas.pydata.org/pandas-docs/stable/indexing.html#fast-scalar-value-getting-and-setting

In [57]:
df1.iat[1, 0] = 111
df1.head(3)

Unnamed: 0,A
0,10
1,111
2,1


In [58]:
df.head(3)

Unnamed: 0,A,B
0,10,8
1,3,-10
2,1,7


In [59]:
df2.iat[1,0] = 111

In [60]:
df.head(3)

Unnamed: 0,A,B
0,10,8
1,111,-10
2,1,7
