# Pandas

## 引入约定

In [3]:
import pandas as pd
import numpy as np

In [4]:
from pandas import DataFrame,Series    #数据框、一维数组

## Series  一维数组

### 通过一维数组创建series

In [5]:
arr=np.arange(1,21)

In [6]:
series=Series(arr)  #创建一个一维数组对象

In [7]:
series

0      1
1      2
2      3
3      4
4      5
5      6
6      7
7      8
8      9
9     10
10    11
11    12
12    13
13    14
14    15
15    16
16    17
17    18
18    19
19    20
dtype: int32

### 查看索引

In [8]:
series.index

RangeIndex(start=0, stop=20, step=1)

### 通过字典创建Series

In [9]:
a_dict={'20171128':1000,'20171009':1050,'20170910':500}

In [10]:
series01=Series(a_dict)

In [11]:
series01

20170910     500
20171009    1050
20171128    1000
dtype: int64

In [12]:
series01.index

Index([u'20170910', u'20171009', u'20171128'], dtype='object')

### 数据index绑定

In [13]:
series1=Series([70,89,67],index=['张三','李四','王五'])

In [14]:
series1

张三    70
李四    89
王五    67
dtype: int64

In [15]:
series1.values  #查看数据列

array([70, 89, 67], dtype=int64)

In [16]:
series1.dtype

dtype('int64')

### 缺失值的检测

* isnull

In [17]:
series2=Series([1,2,3,4,np.NaN,5,6,7,8,9])

In [18]:
series2

0    1.0
1    2.0
2    3.0
3    4.0
4    NaN
5    5.0
6    6.0
7    7.0
8    8.0
9    9.0
dtype: float64

In [19]:
series2.isnull()

0    False
1    False
2    False
3    False
4     True
5    False
6    False
7    False
8    False
9    False
dtype: bool

In [20]:
series2.notnull()

0     True
1     True
2     True
3     True
4    False
5     True
6     True
7     True
8     True
9     True
dtype: bool

* notnull

In [21]:
series2[pd.isnull(series2)]

4   NaN
dtype: float64

In [22]:
series2[pd.notnull(series2)]

0    1.0
1    2.0
2    3.0
3    4.0
5    5.0
6    6.0
7    7.0
8    8.0
9    9.0
dtype: float64

In [41]:
product_num=Series([23,45,67,89],index=['p3','p1','p2','p4'])

In [42]:
product_price_table=Series([9.83,8.13,2.43,5.61,4.89],index=['p1','p2','p3','p4','p5'])

In [43]:
product_sum=product_num*product_price_table

In [44]:
product_sum

p1    442.35
p2    544.71
p3     55.89
p4    499.29
p5       NaN
dtype: float64

In [45]:
product_num.name='PN'

In [46]:
product_num.index.name='PT'

In [47]:
product_num

PT
p3    23
p1    45
p2    67
p4    89
Name: PN, dtype: int64

In [48]:
product_num['p1']

45

In [49]:
product_num[:'p3']

PT
p3    23
Name: PN, dtype: int64

In [50]:
product_num['p3':
           ]

PT
p3    23
p1    45
p2    67
p4    89
Name: PN, dtype: int64

In [51]:
product_num[:'p2']

PT
p3    23
p1    45
p2    67
Name: PN, dtype: int64

In [52]:
product_num['p3':'p2']

PT
p3    23
p1    45
p2    67
Name: PN, dtype: int64

### dataframe

In [53]:
df=DataFrame([[1,2,3],[3,4,5]])

In [54]:
df

Unnamed: 0,0,1,2
0,1,2,3
1,3,4,5


In [55]:
df=DataFrame([[1,2,3],[3,4,5]],index=['a','b'])

In [56]:
df

Unnamed: 0,0,1,2
a,1,2,3
b,3,4,5


In [57]:
df[0]     #优先从列取值

a    1
b    3
Name: 0, dtype: int64

In [58]:
df[0]['a']   #取出第一列，a行的数据

1

In [59]:
df[1:]

Unnamed: 0,0,1,2
b,3,4,5


In [60]:
df[0:]   #如果使用切片，优先取行的数据   ix_对列进行切片

Unnamed: 0,0,1,2
a,1,2,3
b,3,4,5


In [61]:
df[1:][0:]

Unnamed: 0,0,1,2
b,3,4,5


### 通过二维数组创建DataFrame

In [62]:
arr=np.array([['Tom',76],['Jane',45],['Merry',100]])

In [63]:
df3=DataFrame(arr,columns=['name','score'])

In [64]:
df3

Unnamed: 0,name,score
0,Tom,76
1,Jane,45
2,Merry,100


In [65]:
df4=DataFrame(arr,index=['one','two','three'],columns=['name','score'])

In [66]:
df4

Unnamed: 0,name,score
one,Tom,76
two,Jane,45
three,Merry,100


### 通过字典创建DataFrame

In [67]:
data={'apart':[1001,1002,1004,1003],
     'profile':[1000,2000,1500,3500],
     'year':[2010,2011,2012,2013]}

In [68]:
df5=DataFrame(data)

In [69]:
df5

Unnamed: 0,apart,profile,year
0,1001,1000,2010
1,1002,2000,2011
2,1004,1500,2012
3,1003,3500,2013


In [70]:
df5.index

RangeIndex(start=0, stop=4, step=1)

In [71]:
df5.columns

Index([u'apart', u'profile', u'year'], dtype='object')

In [72]:
df5.values

array([[1001, 1000, 2010],
       [1002, 2000, 2011],
       [1004, 1500, 2012],
       [1003, 3500, 2013]], dtype=int64)

In [73]:
df6=DataFrame(data,index=['one','two','three','four'])

In [74]:
df6

Unnamed: 0,apart,profile,year
one,1001,1000,2010
two,1002,2000,2011
three,1004,1500,2012
four,1003,3500,2013


In [75]:
df6.index

Index([u'one', u'two', u'three', u'four'], dtype='object')

In [76]:
dict={'C++':[50,50,20],'JAVA':[20,30,40],'python':[10,20,30]}

In [77]:
s1=DataFrame(dict)

In [78]:
s1

Unnamed: 0,C++,JAVA,python
0,50,20,10
1,50,30,20
2,20,40,30


In [79]:
s1.index=[2,3,4]

In [80]:
s1.columns=['Java','C++','Vb']

In [81]:
s1

Unnamed: 0,Java,C++,Vb
2,50,20,10
3,50,30,20
4,20,40,30


In [82]:
s1['C++']=np.NaN

In [83]:
s1

Unnamed: 0,Java,C++,Vb
2,50,,10
3,50,,20
4,20,,30


In [84]:
n1=np.random.random((20,6))

In [85]:
s2=DataFrame(n1)

In [86]:
s2

Unnamed: 0,0,1,2,3,4,5
0,0.130861,0.608967,0.354778,0.444175,0.897828,0.864571
1,0.909209,0.451438,0.870167,0.25635,0.762493,0.508872
2,0.574966,0.700153,0.743195,0.194658,0.145866,0.053272
3,0.962317,0.84601,0.264511,0.612034,0.397991,0.438498
4,0.852607,0.280385,0.285077,0.836417,0.098982,0.179859
5,0.915303,0.999453,0.207615,0.812667,0.445971,0.898004
6,0.57872,0.875084,0.916643,0.639012,0.842631,0.939124
7,0.084235,0.538655,0.575204,0.723118,0.592949,0.285705
8,0.203978,0.446714,0.446219,0.572212,0.829765,0.396672
9,0.82185,0.327375,0.516201,0.213658,0.383463,0.857646


In [87]:
s2.head()   #默认为前5行数据，可以自定义显示行数

Unnamed: 0,0,1,2,3,4,5
0,0.130861,0.608967,0.354778,0.444175,0.897828,0.864571
1,0.909209,0.451438,0.870167,0.25635,0.762493,0.508872
2,0.574966,0.700153,0.743195,0.194658,0.145866,0.053272
3,0.962317,0.84601,0.264511,0.612034,0.397991,0.438498
4,0.852607,0.280385,0.285077,0.836417,0.098982,0.179859


In [88]:
s2.tail()  #默认为后5行，可自定义显示行数

Unnamed: 0,0,1,2,3,4,5
15,0.833681,0.08387,0.587713,0.840232,0.235971,0.259118
16,0.76072,0.927923,0.042855,0.475825,0.812414,0.617177
17,0.185196,0.328657,0.309027,0.208122,0.614992,0.892151
18,0.036514,0.123135,0.403208,0.03219,0.166991,0.081775
19,0.825121,0.849261,0.50544,0.29883,0.040016,0.630253


In [89]:
s2.describe()

Unnamed: 0,0,1,2,3,4,5
count,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.534037,0.517258,0.46452,0.435915,0.469388,0.482514
std,0.316124,0.291046,0.251012,0.256457,0.289966,0.300395
min,0.036514,0.08387,0.042855,0.03219,0.022856,0.053272
25%,0.248305,0.280032,0.279936,0.214744,0.218726,0.218224
50%,0.576843,0.491245,0.445434,0.424284,0.421981,0.442456
75%,0.827261,0.790966,0.622452,0.618778,0.708198,0.750206
max,0.962317,0.999453,0.916643,0.840232,0.897828,0.939124


### 按轴排序

In [90]:
s2.columns=['a','b','c','d','e','f']

In [91]:
s2

Unnamed: 0,a,b,c,d,e,f
0,0.130861,0.608967,0.354778,0.444175,0.897828,0.864571
1,0.909209,0.451438,0.870167,0.25635,0.762493,0.508872
2,0.574966,0.700153,0.743195,0.194658,0.145866,0.053272
3,0.962317,0.84601,0.264511,0.612034,0.397991,0.438498
4,0.852607,0.280385,0.285077,0.836417,0.098982,0.179859
5,0.915303,0.999453,0.207615,0.812667,0.445971,0.898004
6,0.57872,0.875084,0.916643,0.639012,0.842631,0.939124
7,0.084235,0.538655,0.575204,0.723118,0.592949,0.285705
8,0.203978,0.446714,0.446219,0.572212,0.829765,0.396672
9,0.82185,0.327375,0.516201,0.213658,0.383463,0.857646


In [92]:
s2.sort_index(axis=1,ascending=False)
#axis代表轴，0为行轴，1为列轴
#ascending 排序的方式，默认为True，表示升序，False为降序

Unnamed: 0,f,e,d,c,b,a
0,0.864571,0.897828,0.444175,0.354778,0.608967,0.130861
1,0.508872,0.762493,0.25635,0.870167,0.451438,0.909209
2,0.053272,0.145866,0.194658,0.743195,0.700153,0.574966
3,0.438498,0.397991,0.612034,0.264511,0.84601,0.962317
4,0.179859,0.098982,0.836417,0.285077,0.280385,0.852607
5,0.898004,0.445971,0.812667,0.207615,0.999453,0.915303
6,0.939124,0.842631,0.639012,0.916643,0.875084,0.57872
7,0.285705,0.592949,0.723118,0.575204,0.538655,0.084235
8,0.396672,0.829765,0.572212,0.446219,0.446714,0.203978
9,0.857646,0.383463,0.213658,0.516201,0.327375,0.82185


### 按值排序

In [94]:
s2.sort(columns='b')

AttributeError: 'DataFrame' object has no attribute 'sort'

### 按照标签来获取一个交叉的区域

In [95]:
datas=[0,20,30]

In [96]:
s2.loc[0:5,['a']]   #提取0~5行的数据，再提取a列的数据
#标签的切片

Unnamed: 0,a
0,0.130861
1,0.909209
2,0.574966
3,0.962317
4,0.852607
5,0.915303


In [97]:
s2.loc[1,['a']]   #对于返回的对象进行维度缩减

a    0.909209
Name: 1, dtype: float64

In [98]:
s2.loc[0,'a']    #等同于如下效果,先行后列

0.13086085288390759

In [99]:
s2['a'][0]  #先列后行

0.13086085288390759

## 通过位置选择

### 通过传递数值进行位置选择（选择的是行）

In [100]:
s2.iloc[3]   #iloc选择行轴

a    0.962317
b    0.846010
c    0.264511
d    0.612034
e    0.397991
f    0.438498
Name: 3, dtype: float64

### 通过数值进行切片

In [101]:
s2.iloc[1:3]   #切片 行  1~3行 [1,3)

Unnamed: 0,a,b,c,d,e,f
1,0.909209,0.451438,0.870167,0.25635,0.762493,0.508872
2,0.574966,0.700153,0.743195,0.194658,0.145866,0.053272


In [102]:
s2.iloc[1:3,2:4]  #切片  行和列  

Unnamed: 0,c,d
1,0.870167,0.25635
2,0.743195,0.194658


### 通知指定一个位置

In [103]:
s2.iloc[[1,2,3],[2,4]]   #切片，1，2，3行，2，4列

Unnamed: 0,c,e
1,0.870167,0.762493
2,0.743195,0.145866
3,0.264511,0.397991


### 行切片

In [104]:
s2.iloc[1:3,:]

Unnamed: 0,a,b,c,d,e,f
1,0.909209,0.451438,0.870167,0.25635,0.762493,0.508872
2,0.574966,0.700153,0.743195,0.194658,0.145866,0.053272


### 列切片

In [105]:
s2.iloc[:,1:3]

Unnamed: 0,b,c
0,0.608967,0.354778
1,0.451438,0.870167
2,0.700153,0.743195
3,0.84601,0.264511
4,0.280385,0.285077
5,0.999453,0.207615
6,0.875084,0.916643
7,0.538655,0.575204
8,0.446714,0.446219
9,0.327375,0.516201


### 获取指定位置的值

In [106]:
s2.iloc[2,2]   #获取第3行第3列的值

0.74319464292447379

In [107]:
s2.iat[2,2]    #快速获取指定位置的数据

0.74319464292447379

### 布尔索引

#### 使用一个单独列的值来选择数据

In [108]:
s2[s2.a>0.5]

Unnamed: 0,a,b,c,d,e,f
1,0.909209,0.451438,0.870167,0.25635,0.762493,0.508872
2,0.574966,0.700153,0.743195,0.194658,0.145866,0.053272
3,0.962317,0.84601,0.264511,0.612034,0.397991,0.438498
4,0.852607,0.280385,0.285077,0.836417,0.098982,0.179859
5,0.915303,0.999453,0.207615,0.812667,0.445971,0.898004
6,0.57872,0.875084,0.916643,0.639012,0.842631,0.939124
9,0.82185,0.327375,0.516201,0.213658,0.383463,0.857646
13,0.629659,0.772618,0.44465,0.215106,0.376171,0.446415
15,0.833681,0.08387,0.587713,0.840232,0.235971,0.259118
16,0.76072,0.927923,0.042855,0.475825,0.812414,0.617177


#### 整体过滤

In [109]:
s2[s2>0.5]    #把所有不满足条件的全部置空

Unnamed: 0,a,b,c,d,e,f
0,,0.608967,,,0.897828,0.864571
1,0.909209,,0.870167,,0.762493,0.508872
2,0.574966,0.700153,0.743195,,,
3,0.962317,0.84601,,0.612034,,
4,0.852607,,,0.836417,,
5,0.915303,0.999453,,0.812667,,0.898004
6,0.57872,0.875084,0.916643,0.639012,0.842631,0.939124
7,,0.538655,0.575204,0.723118,0.592949,
8,,,,0.572212,0.829765,
9,0.82185,,0.516201,,,0.857646


#### isin()  过滤数据

In [110]:
s3=s2.copy()
s3

Unnamed: 0,a,b,c,d,e,f
0,0.130861,0.608967,0.354778,0.444175,0.897828,0.864571
1,0.909209,0.451438,0.870167,0.25635,0.762493,0.508872
2,0.574966,0.700153,0.743195,0.194658,0.145866,0.053272
3,0.962317,0.84601,0.264511,0.612034,0.397991,0.438498
4,0.852607,0.280385,0.285077,0.836417,0.098982,0.179859
5,0.915303,0.999453,0.207615,0.812667,0.445971,0.898004
6,0.57872,0.875084,0.916643,0.639012,0.842631,0.939124
7,0.084235,0.538655,0.575204,0.723118,0.592949,0.285705
8,0.203978,0.446714,0.446219,0.572212,0.829765,0.396672
9,0.82185,0.327375,0.516201,0.213658,0.383463,0.857646


In [111]:
s3.ix[21]=[1.23,2.34,3.45,5.6,4,8.1]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


In [112]:
s3.ix[20]=[1,2,3,4,5,6]

In [113]:
s3

Unnamed: 0,a,b,c,d,e,f
0,0.130861,0.608967,0.354778,0.444175,0.897828,0.864571
1,0.909209,0.451438,0.870167,0.25635,0.762493,0.508872
2,0.574966,0.700153,0.743195,0.194658,0.145866,0.053272
3,0.962317,0.84601,0.264511,0.612034,0.397991,0.438498
4,0.852607,0.280385,0.285077,0.836417,0.098982,0.179859
5,0.915303,0.999453,0.207615,0.812667,0.445971,0.898004
6,0.57872,0.875084,0.916643,0.639012,0.842631,0.939124
7,0.084235,0.538655,0.575204,0.723118,0.592949,0.285705
8,0.203978,0.446714,0.446219,0.572212,0.829765,0.396672
9,0.82185,0.327375,0.516201,0.213658,0.383463,0.857646


In [114]:
s3[s3['a'].isin([1.0,1.23])]

Unnamed: 0,a,b,c,d,e,f
21,1.23,2.34,3.45,5.6,4.0,8.1
20,1.0,2.0,3.0,4.0,5.0,6.0


### 设置

### 新增一个列

In [115]:
s1=Series([i for i in range(20)])

In [116]:
s1

0      0
1      1
2      2
3      3
4      4
5      5
6      6
7      7
8      8
9      9
10    10
11    11
12    12
13    13
14    14
15    15
16    16
17    17
18    18
19    19
dtype: int64

In [117]:
s2.f=s1

In [118]:
s2

Unnamed: 0,a,b,c,d,e,f
0,0.130861,0.608967,0.354778,0.444175,0.897828,0
1,0.909209,0.451438,0.870167,0.25635,0.762493,1
2,0.574966,0.700153,0.743195,0.194658,0.145866,2
3,0.962317,0.84601,0.264511,0.612034,0.397991,3
4,0.852607,0.280385,0.285077,0.836417,0.098982,4
5,0.915303,0.999453,0.207615,0.812667,0.445971,5
6,0.57872,0.875084,0.916643,0.639012,0.842631,6
7,0.084235,0.538655,0.575204,0.723118,0.592949,7
8,0.203978,0.446714,0.446219,0.572212,0.829765,8
9,0.82185,0.327375,0.516201,0.213658,0.383463,9


### 通过标签设置值

In [119]:
s2.loc[20,'f']=13

In [120]:
s2

Unnamed: 0,a,b,c,d,e,f
0,0.130861,0.608967,0.354778,0.444175,0.897828,0.0
1,0.909209,0.451438,0.870167,0.25635,0.762493,1.0
2,0.574966,0.700153,0.743195,0.194658,0.145866,2.0
3,0.962317,0.84601,0.264511,0.612034,0.397991,3.0
4,0.852607,0.280385,0.285077,0.836417,0.098982,4.0
5,0.915303,0.999453,0.207615,0.812667,0.445971,5.0
6,0.57872,0.875084,0.916643,0.639012,0.842631,6.0
7,0.084235,0.538655,0.575204,0.723118,0.592949,7.0
8,0.203978,0.446714,0.446219,0.572212,0.829765,8.0
9,0.82185,0.327375,0.516201,0.213658,0.383463,9.0


### 通过位置

In [121]:
s2.iloc[20:20,5:6]=13

In [122]:
s2

Unnamed: 0,a,b,c,d,e,f
0,0.130861,0.608967,0.354778,0.444175,0.897828,0.0
1,0.909209,0.451438,0.870167,0.25635,0.762493,1.0
2,0.574966,0.700153,0.743195,0.194658,0.145866,2.0
3,0.962317,0.84601,0.264511,0.612034,0.397991,3.0
4,0.852607,0.280385,0.285077,0.836417,0.098982,4.0
5,0.915303,0.999453,0.207615,0.812667,0.445971,5.0
6,0.57872,0.875084,0.916643,0.639012,0.842631,6.0
7,0.084235,0.538655,0.575204,0.723118,0.592949,7.0
8,0.203978,0.446714,0.446219,0.572212,0.829765,8.0
9,0.82185,0.327375,0.516201,0.213658,0.383463,9.0


In [123]:
s2[s2>0.5]=0

In [124]:
s2

Unnamed: 0,a,b,c,d,e,f
0,0.130861,0.0,0.354778,0.444175,0.0,0.0
1,0.0,0.451438,0.0,0.25635,0.0,0.0
2,0.0,0.0,0.0,0.194658,0.145866,0.0
3,0.0,0.0,0.264511,0.0,0.397991,0.0
4,0.0,0.280385,0.285077,0.0,0.098982,0.0
5,0.0,0.0,0.207615,0.0,0.445971,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0
7,0.084235,0.0,0.0,0.0,0.0,0.0
8,0.203978,0.446714,0.446219,0.0,0.0,0.0
9,0.0,0.327375,0.0,0.213658,0.383463,0.0


In [125]:
s3=s2[s2.a<0.06]

In [126]:
s3

Unnamed: 0,a,b,c,d,e,f
1,0.0,0.451438,0.0,0.25635,0.0,0.0
2,0.0,0.0,0.0,0.194658,0.145866,0.0
3,0.0,0.0,0.264511,0.0,0.397991,0.0
4,0.0,0.280385,0.285077,0.0,0.098982,0.0
5,0.0,0.0,0.207615,0.0,0.445971,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.327375,0.0,0.213658,0.383463,0.0
13,0.0,0.0,0.44465,0.215106,0.376171,0.0
15,0.0,0.08387,0.0,0.0,0.235971,0.0
16,0.0,0.0,0.042855,0.475825,0.0,0.0


### 缺失值的处理

#### reindex() 方法可以对指定的轴上的索引进行修改（增加/删除）

In [136]:
s4=s2.reindex(index=[i for i in range(10)],columns=list(s2.columns)+['a'])

In [137]:
s4

Unnamed: 0,a,b,c,d,e,f,a.1
0,0.130861,0.0,0.354778,0.444175,0.0,0.0,0.130861
1,0.0,0.451438,0.0,0.25635,0.0,0.0,0.0
2,0.0,0.0,0.0,0.194658,0.145866,0.0,0.0
3,0.0,0.0,0.264511,0.0,0.397991,0.0,0.0
4,0.0,0.280385,0.285077,0.0,0.098982,0.0,0.0
5,0.0,0.0,0.207615,0.0,0.445971,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.084235,0.0,0.0,0.0,0.0,0.0,0.084235
8,0.203978,0.446714,0.446219,0.0,0.0,0.0,0.203978
9,0.0,0.327375,0.0,0.213658,0.383463,0.0,0.0


In [140]:
s4=DataFrame(np.array(
[
    [1,np.NaN,2,3,4],
    [1,2,3,4,np.NaN]
]))

In [141]:
s4

Unnamed: 0,0,1,2,3,4
0,1.0,,2.0,3.0,4.0
1,1.0,2.0,3.0,4.0,


In [143]:
s4.ix[1]

0    1.0
1    2.0
2    3.0
3    4.0
4    NaN
Name: 1, dtype: float64

In [144]:
s4.ix[2]=[1,2,3,4,5]

In [145]:
s4

Unnamed: 0,0,1,2,3,4
0,1.0,,2.0,3.0,4.0
1,1.0,2.0,3.0,4.0,
2,1.0,2.0,3.0,4.0,5.0


#### 去除包含缺失值的行

In [146]:
s4.dropna

<bound method DataFrame.dropna of      0    1    2    3    4
0  1.0  NaN  2.0  3.0  4.0
1  1.0  2.0  3.0  4.0  NaN
2  1.0  2.0  3.0  4.0  5.0>

#### 对缺失值的替换

In [147]:
s4

Unnamed: 0,0,1,2,3,4
0,1.0,,2.0,3.0,4.0
1,1.0,2.0,3.0,4.0,
2,1.0,2.0,3.0,4.0,5.0


In [148]:
s4.fillna(value=0)

Unnamed: 0,0,1,2,3,4
0,1.0,0.0,2.0,3.0,4.0
1,1.0,2.0,3.0,4.0,0.0
2,1.0,2.0,3.0,4.0,5.0


In [152]:
s5=DataFrame(np.array(
[
    [1,np.NaN,2,3,4],
    [1,2,3,4,np.NaN],
    [1,2,3,5,6]
]))

### 对数据进行布尔填充，空值的判断

In [153]:
pd.isnull(s5)

Unnamed: 0,0,1,2,3,4
0,False,True,False,False,False
1,False,False,False,False,True
2,False,False,False,False,False


### 其他操作

#### 数据描述性统计

In [159]:
s3.mean(1)   #对固定的轴进行统计操作

0    0.154969
1    0.117965
2    0.056754
3    0.110417
4    0.110741
5    0.108931
6    0.000000
7    0.014039
8    0.182819
9    0.154083
dtype: float64

In [155]:
s3

Unnamed: 0,a,b,c,d,e,f
0,0.130861,0.0,0.354778,0.444175,0.0,0.0
1,0.0,0.451438,0.0,0.25635,0.0,0.0
2,0.0,0.0,0.0,0.194658,0.145866,0.0
3,0.0,0.0,0.264511,0.0,0.397991,0.0
4,0.0,0.280385,0.285077,0.0,0.098982,0.0
5,0.0,0.0,0.207615,0.0,0.445971,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0
7,0.084235,0.0,0.0,0.0,0.0,0.0
8,0.203978,0.446714,0.446219,0.0,0.0,0.0
9,0.0,0.327375,0.0,0.213658,0.383463,0.0


In [156]:
s3.describe()

Unnamed: 0,a,b,c,d,e,f
count,10.0,10.0,10.0,10.0,10.0,10.0
mean,0.041907,0.150591,0.15582,0.110884,0.147227,0.0
std,0.073231,0.200655,0.175284,0.157619,0.188163,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.103808,0.0,0.049491,0.0
75%,0.063176,0.315627,0.279936,0.208908,0.324064,0.0
max,0.203978,0.451438,0.446219,0.444175,0.445971,0.0


In [160]:
help(s3.mean)

Help on method mean in module pandas.core.generic:

mean(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs) method of pandas.core.frame.DataFrame instance
    Return the mean of the values for the requested axis
    
    Parameters
    ----------
    axis : {index (0), columns (1)}
    skipna : boolean, default True
        Exclude NA/null values. If an entire row/column is NA, the result
        will be NA
    level : int or level name, default None
        If the axis is a MultiIndex (hierarchical), count along a
        particular level, collapsing into a Series
    numeric_only : boolean, default None
        Include only float, int, boolean columns. If None, will attempt to use
        everything, then use only numeric data. Not implemented for Series.
    
    Returns
    -------
    mean : Series or DataFrame (if level specified)



In [161]:
s4.apply(np.cumsum)

Unnamed: 0,0,1,2,3,4
0,1.0,,2.0,3.0,4.0
1,2.0,2.0,5.0,7.0,
2,3.0,4.0,8.0,11.0,9.0


In [162]:
s4

Unnamed: 0,0,1,2,3,4
0,1.0,,2.0,3.0,4.0
1,1.0,2.0,3.0,4.0,
2,1.0,2.0,3.0,4.0,5.0


In [164]:
s4.apply(lambda x:x.max()-x.min())    #列上的最大值减最小值

0    0.0
1    0.0
2    1.0
3    1.0
4    1.0
dtype: float64

In [163]:
help(np.cumsum)

Help on function cumsum in module numpy.core.fromnumeric:

cumsum(a, axis=None, dtype=None, out=None)
    Return the cumulative sum of the elements along a given axis.
    
    Parameters
    ----------
    a : array_like
        Input array.
    axis : int, optional
        Axis along which the cumulative sum is computed. The default
        (None) is to compute the cumsum over the flattened array.
    dtype : dtype, optional
        Type of the returned array and of the accumulator in which the
        elements are summed.  If `dtype` is not specified, it defaults
        to the dtype of `a`, unless `a` has an integer dtype with a
        precision less than that of the default platform integer.  In
        that case, the default platform integer is used.
    out : ndarray, optional
        Alternative output array in which to place the result. It must
        have the same shape and buffer length as the expected output
        but the type will be cast if necessary. See `doc.ufuncs`

## 制图

### 直方图

In [165]:
n1=np.random.randint(0,7,size=10)

In [167]:
n1

array([1, 0, 4, 2, 2, 4, 0, 6, 6, 2])

In [169]:
s1=pd.Series(n1)