In [1]:
from pandas import Series,DataFrame
import pandas as pd
import numpy as np

## Series类似一维数组

In [2]:
obj = Series([4,7,-5,3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [3]:
obj.values

array([ 4,  7, -5,  3])

In [4]:
obj.index

RangeIndex(start=0, stop=4, step=1)

### 对各个数据点进行标记和索引

In [5]:
obj2 = Series([4,7,-5,4],index=['d','b','a','c'])
obj2

d    4
b    7
a   -5
c    4
dtype: int64

In [6]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = Series(sdata)
obj3

Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

#### 如果只传入字典，则结果Series中的索引就是原字典的键（有序排列）

In [7]:
states = ['Texas','Oregon','Utah',"ca"]
obj4 = Series(sdata,index=states)
obj4

Texas     71000.0
Oregon    16000.0
Utah       5000.0
ca            NaN
dtype: float64

In [8]:
pd.isnull( obj4) #检测是否为null

Texas     False
Oregon    False
Utah      False
ca         True
dtype: bool

In [9]:
pd.notnull(obj4)#检测不为空

Texas      True
Oregon     True
Utah       True
ca        False
dtype: bool

> Series对象本身是通过索引name属性来跟其他关键功能联系

In [10]:
obj4.name = 'population'
obj4.index.name = 'state'
obj4

state
Texas     71000.0
Oregon    16000.0
Utah       5000.0
ca            NaN
Name: population, dtype: float64

#### 索引可以通过赋值方式就地修改

In [11]:
obj.index = ['Bob','Steve','Jeff','Ryan']
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

## DataFrame

DataFrame 是一 个 表格 型 的 数据 结构， 它 含有 一组 有序 的 列， 每 列 可以 是不同 的 值 类型（ 数值、 字符串、 布尔 值 等）。 DataFrame 既有 行 索引 也有 列 索引， 它 可以 被 看做 由 Series 组成 的 字典（ 共用 同一个 索引）。 跟 其他 类似 的 数据 结构 相比（ 如 R 的 data. frame）， DataFrame 中 面向 行 和 面向 列 的 操作 基本上 是 平衡 的。 其实， DataFrame 中的 数据 是以 一个 或 多个 二维 块 存放 的（ 而 不是 列表、字典 或 别的 一 维 数据 结构）。

In [12]:
#常见构建方式
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'], 'year': [2000, 2001, 2002, 2001, 2002], 'pop': [1.5, 1.7, 3.6, 2.4, 2.9]} 
frame = DataFrame( data)
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


#### 指定列顺序排列

In [13]:
DataFrame(data,columns=['year','state','pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [14]:
frame2 = DataFrame(data,columns=['year','state','pop','debt'],index=['one','two','three','four','five'])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [15]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [16]:
frame2['state']#获取整个列

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

In [17]:
##索引字段!!!
frame2.ix['three']

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [18]:
frame2['pop']

one      1.5
two      1.7
three    3.6
four     2.4
five     2.9
Name: pop, dtype: float64

In [19]:
###赋值
frame2['debt'] = 16.5
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5


In [20]:
frame2['debt'] = np.arange(5.)##numpy 和pandas结合
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0


###  赋值series并在所选的空位上填充缺失值

In [21]:
val = Series([-1.2,-1.5,-1.7],index=['two','four','five'])
frame2['debt'] = val
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


In [22]:
frame2['eastern'] = frame2.state == 'Ohio' #先判断相等并赋值给新的列
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False


#### 删除指定列

In [23]:
del frame2['eastern']
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


In [24]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

### 嵌套字典数据处理

In [25]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9}, 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

In [26]:
frame3 = DataFrame(pop)
frame3

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


> 外层键作为__列__,内层键作为__索引__

In [27]:
##转置
frame3.T

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [28]:
DataFrame(pop,index=[2001,2002,2003])##显式进行指定索引

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [29]:
##values返回二维ndarray
frame3.values

array([[ nan,  1.5],
       [ 2.4,  1.7],
       [ 2.9,  3.6]])

## 索引对象

pandas所用到的任何数组或其他序列的标签都会被转换成一个index

In [30]:
obj = Series(range(3),index=['a','b','c'])
index = obj.index
index

Index(['a', 'b', 'c'], dtype='object')

In [31]:
index[1:]

Index(['b', 'c'], dtype='object')

> index对象是不可修改的

> 在设计pandas的时候，为了减少选取列出错，一般吧所有的标签索引都放入ix中。

## 算数运算和数据对齐

In [32]:
s1 = Series([7.3,-2.5,3.4,1.5],index=['a','c','d','e'])
s2 = Series([-2.1,3.6,-1.5,4,3.1],index=['a','c','e','f','g'])
s1


a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [33]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [34]:
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

> DataFrame不会出现上述问题

In [35]:
df1 = DataFrame(np.arange(9.).reshape((3,3)),columns=list('bcd'),index=['Ohio','Texas','Colorado'])
df2 = DataFrame(np.arange(12.).reshape((4,3)),columns=list('bde'),index=['Utah','Ohio','Texas','Oregon'])
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [36]:
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [37]:
df1+df2 #并集

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


#### 在算数方法中填充值，希望找不到时候填充特殊值比如0

In [38]:
df1 = DataFrame( np. arange( 12.). reshape((3,4)), columns= list('abcd'))
df2 = DataFrame( np. arange( 20.). reshape((4,5)), columns= list('abcde'))
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [39]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [40]:
df1.add(df2,fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


## 函数应用和映射

In [41]:
frame = DataFrame(np.random.randn(4,3),columns=list('bde'),index=['Utah','Ohio','Texas','Oregon'])
frame

Unnamed: 0,b,d,e
Utah,-0.154143,0.803281,0.678923
Ohio,-0.011572,0.193957,-1.150912
Texas,2.133,-0.652021,0.318528
Oregon,-0.377818,1.805145,0.553595


使用numpy的ufuncs操作pandas对象

In [42]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.154143,0.803281,0.678923
Ohio,0.011572,0.193957,1.150912
Texas,2.133,0.652021,0.318528
Oregon,0.377818,1.805145,0.553595


In [44]:
f = lambda x:x.max() -x.min()
f

<function __main__.<lambda>>

In [45]:
frame.apply(f)

b    2.510818
d    2.457166
e    1.829835
dtype: float64

In [46]:
frame.apply(f,axis=1)

Utah      0.957424
Ohio      1.344868
Texas     2.785021
Oregon    2.182963
dtype: float64

许多最常见的数组统计已经实现，不需要使用apply方法。

In [48]:
format = lambda x: '%.2f' % x
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,-0.15,0.8,0.68
Ohio,-0.01,0.19,-1.15
Texas,2.13,-0.65,0.32
Oregon,-0.38,1.81,0.55


注意和map方法区分

In [49]:
frame['e'].map(format)

Utah       0.68
Ohio      -1.15
Texas      0.32
Oregon     0.55
Name: e, dtype: object

### 排序和排名

In [51]:
obj = Series(range(4),index=['d','a','b','c'])
obj

d    0
a    1
b    2
c    3
dtype: int64

In [53]:
frame = DataFrame(np.arange(8).reshape((2,4)),index=['threee','one'],columns=['d','a','b','c'])
frame

Unnamed: 0,d,a,b,c
threee,0,1,2,3
one,4,5,6,7


In [55]:
frame.sort_index() #索引排序

Unnamed: 0,d,a,b,c
one,4,5,6,7
threee,0,1,2,3


In [56]:
frame.sort_index(axis=1)#列排序

Unnamed: 0,a,b,c,d
threee,1,2,3,0
one,5,6,7,4


In [58]:
frame.sort_index(axis=1,ascending=False)#列降序排列

Unnamed: 0,d,c,b,a
threee,0,3,2,1
one,4,7,6,5


In [67]:
obj = Series([4,7,-3,2])
# obj.order()

In [72]:
obj = Series([4,np.nan,7,np.nan,-3,2])

In [74]:
frame = DataFrame({'b':[4,7,-3,2],'a':[0,1,0,1]})
frame

Unnamed: 0,a,b
0,0,4
1,1,7
2,0,-3
3,1,2


In [75]:
frame.sort_index(by='b')#以某列排序

  """Entry point for launching an IPython kernel.


Unnamed: 0,a,b
2,0,-3
3,1,2
0,0,4
1,1,7


In [76]:
frame.sort_index(by=['a','b'])

  """Entry point for launching an IPython kernel.


Unnamed: 0,a,b
2,0,-3
0,0,4
3,1,2
1,1,7


In [78]:
obj = Series([7,-5,7,4,2,0,4])
obj.rank() #排名

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [80]:
obj.rank(method='first') #根据值在原数据中出现的顺序给出排名

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [81]:
obj.rank(ascending=False,method='max')#降序排列

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

## 带重复值的轴索引

In [None]:
obj = Series(range(5),index=['a','a','b','b','c'])
obj

In [83]:
obj.index.is_unique #查看是否唯一

False

## 排名，会破坏平级关系

In [2]:
frame = DataFrame({'b':[4.3,7,-3,2],'a':[0,1,0,1],'c':[-2,5,8,-2.5]})
frame

Unnamed: 0,a,b,c
0,0,4.3,-2.0
1,1,7.0,5.0
2,0,-3.0,8.0
3,1,2.0,-2.5


In [3]:
frame.rank(axis=1) #列，平均分类排名

Unnamed: 0,a,b,c
0,2.0,3.0,1.0
1,1.0,3.0,2.0
2,2.0,1.0,3.0
3,2.0,3.0,1.0


- average:默认，按平均分配排名
- min：整个分组最小排名
- max:最大排名
- first:按值在原始数据中的出现顺序排序

In [4]:
frame.rank(method='first')

Unnamed: 0,a,b,c
0,1.0,3.0,2.0
1,3.0,4.0,3.0
2,2.0,1.0,4.0
3,4.0,2.0,1.0


### 带有重复值的轴索引

In [9]:
obj = Series( range( 5), index=['a', 'a', 'b', 'b', 'c'])

In [7]:
#is_unique属性告诉是否是唯一的
obj.index.is_unique

False

索引如果是重复值，那么会返回__Series__,如果是单值，那么返回一个__标量值__。

In [10]:
obj['a']

a    0
a    1
dtype: int32

In [11]:
obj['c']

4

In [23]:
df = DataFrame([[ 1.4, np. nan], [7.1, -4.5],  [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],columns=['one', 'two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [19]:
#合计
df.sum()

one    9.25
two   -5.80
dtype: float64

In [20]:
df.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [21]:
##自动排除nan
df.mean(axis=1,skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [22]:
#达到最大，最小索引
df.idxmax()

one    b
two    d
dtype: object

In [24]:
#累计
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [25]:
#一次产生多个汇总
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


## 相关系数和协方差

In [3]:
from pandas_datareader import data as pdr

获取雅虎股票价格

In [4]:
all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']: 
    all_data[ticker] = pdr.get_data_yahoo( ticker, '1/1/2000', '1/1/2010') 
price = DataFrame({ tic: data['Adj Close'] for tic, data in all_data.items()})
volume = DataFrame({ tic: data['Volume'] for tic, data in all_data.items()})

In [17]:
returns = price.pct_change()#计算价格百分数变化

In [18]:
returns.tail()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2009-12-24,0.03434,0.011117,0.004384,0.002587
2009-12-28,0.012294,0.007098,0.013326,0.005484
2009-12-29,-0.011861,-0.005571,-0.003476,0.007058
2009-12-30,0.012147,0.005376,0.005461,-0.013699
2009-12-31,-0.0043,-0.004416,-0.012597,-0.015504


Series的corr方法计算两个Series中重叠的，非NA的，按索引对齐的值的相关系数。

### 误差系数

In [20]:
returns.MSFT.corr(returns.IBM) 

0.49435864405078661

### 计算协方差

In [22]:
returns.MSFT.cov(returns.IBM)

0.00021582144509535466

In [23]:
returns.corr() #以dataframe方式返回

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,1.0,0.470676,0.412392,0.423598
GOOG,0.470676,1.0,0.390689,0.443587
IBM,0.412392,0.390689,1.0,0.494359
MSFT,0.423598,0.443587,0.494359,1.0


In [24]:
returns.cov() #以dataframe方式返回

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,0.00103,0.000303,0.000254,0.000309
GOOG,0.000303,0.00058,0.000142,0.000205
IBM,0.000254,0.000142,0.000369,0.000216
MSFT,0.000309,0.000205,0.000216,0.000516


In [25]:
###计算列或行和另外一个Series或DataFrame之间的相关系数
returns.corrwith(returns.IBM)

AAPL    0.412392
GOOG    0.390689
IBM     1.000000
MSFT    0.494359
dtype: float64

In [26]:
returns.corrwith(volume) #查看百分比

AAPL   -0.057665
GOOG    0.062647
IBM    -0.006592
MSFT   -0.014228
dtype: float64

In [8]:
## 唯一值、值计数以及成员资格
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

In [9]:
uniques = obj.unique()

In [10]:
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

### value_counts()计算各个值出现的频率

In [13]:
obj.value_counts()#默认降序

a    3
c    3
b    2
d    1
dtype: int64

In [14]:
pd.value_counts(obj.values,sort=False)

c    3
a    3
d    1
b    2
dtype: int64

### 生成柱状图

In [15]:
data = DataFrame({'Qu1': [1, 3, 4, 3, 4],
                  'Qu2': [2, 3, 1, 2, 3],
                  'Qu3': [1, 5, 2, 4, 4]})


In [16]:
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [17]:
result = data.apply(pd.value_counts).fillna(0)

In [18]:
result

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


## 过滤缺失的数据

In [20]:
from numpy import nan as NA
data =Series([1,NA,3.5,NA,7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

> dropna()默认丢弃任何含有缺失值的行

### all值丢弃全部NA的行,axis丢弃列

In [24]:
data = DataFrame([[ 1., 6.5, 3.], [1., NA, NA],
                  [NA, NA, NA], [NA, 6.5, 3.]])
print (data)
data.dropna(how='all')

     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0


Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


### 填充缺失数据

In [26]:
data.fillna(0)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,6.5,3.0


In [34]:
#通过字典实现不同的列填充不同的值
data.fillna({1:0.5,2:-1})#第1列，填充0.5，第2列填充-1.0

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,0.5,-1.0
2,,0.5,-1.0
3,,6.5,3.0


In [37]:
# fillna默认返回新对象
_ = data.fillna(0,inplace=True)#用0替换nan
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,6.5,3.0


## 层次索引-很重要！

In [42]:
data = Series( np. random. randn( 10),
              index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
                     [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])

In [43]:
data

a  1    2.248200
   2   -0.005025
   3    1.535977
b  1   -1.539857
   2    1.243795
   3    0.062205
c  1   -0.658177
   2    0.681887
d  2    2.120474
   3    1.501634
dtype: float64

In [44]:
data.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])

In [45]:
#选取子集
data['b']

1   -1.539857
2    1.243795
3    0.062205
dtype: float64

In [46]:
data['b':'c']

b  1   -1.539857
   2    1.243795
   3    0.062205
c  1   -0.658177
   2    0.681887
dtype: float64

In [47]:
data.unstack()#通过unstack重新安排到一个dataframe中

Unnamed: 0,1,2,3
a,2.2482,-0.005025,1.535977
b,-1.539857,1.243795,0.062205
c,-0.658177,0.681887,
d,,2.120474,1.501634


In [50]:
data.unstack().stack()

a  1    2.248200
   2   -0.005025
   3    1.535977
b  1   -1.539857
   2    1.243795
   3    0.062205
c  1   -0.658177
   2    0.681887
d  2    2.120474
   3    1.501634
dtype: float64

## 面板数据，三维

In [59]:
from pandas_datareader import data as pdr
pdata = pd.Panel(dict(( stk, pdr.get_data_yahoo( stk, '1/1/2009', '6/1/2012')) 
                      for stk in ['AAPL', 'GOOG', 'MSFT', 'DELL']))


RemoteDataError: Unable to read URL: https://query1.finance.yahoo.com/v7/finance/download/DELL?period1=1230739200&period2=1338566399&interval=1d&events=history&crumb=FI27ew9Y44q

In [None]:
pdata

## 分块读取

In [61]:
pd.read_csv('./testData.csv', nrows= 5) #逐行读取数据

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q


In [89]:
#逐块读取文件
chunker = pd.read_csv('./testData.csv',chunksize=1000)

In [90]:
chunker

<pandas.io.parsers.TextFileReader at 0x11be834a8>

In [91]:
#逐块读取后，需要对块进行迭代

In [92]:
tot = Series([])
for piece in chunker:
    tot = tot.add(piece['key'].value_counts(),fill_value=0)
tot = tot.sort_values(ascending=False)

In [93]:
tot[:10]

R    1.0
Q    1.0
L    1.0
G    1.0
B    1.0
dtype: float64

## 将数据写出到文本格式

In [96]:
import sys
## 使用其他分隔符
data.to_csv(sys.stdout,sep='|')

a|1|2.2482004017012285
a|2|-0.0050249793538050674
a|3|1.5359771184000253
b|1|-1.5398565304197125
b|2|1.2437949575093932
b|3|0.062204919095293416
c|1|-0.6581766331851535
c|2|0.6818865338402221
d|2|2.1204739680415425
d|3|1.501634196625101
