### map方法
>Series 的 `map` 方法接受一个函数，或者一个包含映射关系的对象——尤其常用的是**字典**

In [27]:
import pandas as pd
# Demo table: one row per food item together with its weight in ounces.
# Capitalisation of the food names is inconsistent ('Pastrami', 'Bacon').
data = pd.DataFrame(
    dict(
        food=['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef',
              'Bacon', 'pastrami', 'honey ham', 'nova lox'],
        ounces=[4, 3, 12, 6, 7.5, 8, 3, 5, 6],
    )
)

# Lookup table: lower-case food name -> animal the meat comes from.
meat_to_animal = dict([
    ('bacon', 'pig'),
    ('pulled pork', 'pig'),
    ('pastrami', 'cow'),
    ('corned beef', 'cow'),
    ('honey ham', 'pig'),
    ('nova lox', 'salmon'),
])

# The lookup keys are all lower-case, so normalise the 'food' column first.
lowercased = data['food'].str.lower()
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [28]:
# Look up every lower-cased food name in the meat_to_animal dict and store the
# result as a new 'animal' column on `data` (this modifies `data` in place).
data['animal'] = lowercased.map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


### 离散化和面元划分

In [29]:
# Ages to discretise, and the bin edges that split them into four groups.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
# right=True is the default: every interval is closed on the right, e.g. (18, 25].
cats = pd.cut(x=ages, bins=bins, right=True)
cats


[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [30]:
# Integer code of the bin each age fell into (a position in cats.categories).
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [31]:
# The bin intervals themselves: an IntervalIndex closed on the right.
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
              closed='right',
              dtype='interval[int64]')

In [32]:
# Number of ages that landed in each bin, sorted by count (largest first).
# The top-level pd.value_counts() is deprecated in pandas 2.1+; wrapping the
# Categorical in a Series and calling .value_counts() is the supported form.
pd.Series(cats).value_counts()

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [33]:
# Pass a list (or array) of names through `labels` to name the bins yourself;
# one name per bin, in the same order as the intervals.
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(x=ages, bins=bins, labels=group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

In [34]:
import numpy as np
# 20 uniform random samples.
# NOTE(review): this rebinds `data`, which previously held the food DataFrame.
data = np.random.rand(20)
# An integer `bins` asks for that many equal-width bins over the data range.
# precision=3 controls how many digits the bin-edge labels keep (3, not 2).
pd.cut(data, bins=4, precision=3)

[(0.727, 0.966], (0.727, 0.966], (0.25, 0.488], (0.488, 0.727], (0.25, 0.488], ..., (0.0101, 0.25], (0.0101, 0.25], (0.488, 0.727], (0.0101, 0.25], (0.0101, 0.25]]
Length: 20
Categories (4, interval[float64]): [(0.0101, 0.25] < (0.25, 0.488] < (0.488, 0.727] < (0.727, 0.966]]

### 检测和过滤异常值

In [35]:
# 1000 rows x 4 columns of samples drawn from the standard normal distribution.
data1 = pd.DataFrame(np.random.standard_normal(size=(1000, 4)))
# Per-column summary statistics (count, mean, std, min, quartiles, max).
data1.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.002359,-0.035522,0.028019,0.010239
std,1.00738,1.035966,1.013594,0.968946
min,-3.611504,-2.894222,-2.949181,-3.103279
25%,-0.699877,-0.738053,-0.652212,-0.651532
50%,0.021345,-0.070121,0.009037,0.048946
75%,0.670352,0.688754,0.719881,0.674644
max,3.076633,2.988169,3.341183,3.02225


In [36]:
# Pick column 2 and keep only the values whose magnitude exceeds 3.
col = data1[2]
col[col.abs() > 3]

539    3.042139
624    3.091267
872    3.341183
Name: 2, dtype: float64

In [37]:
# Select every row in which at least one column has an absolute value above 3.
# `axis` is passed by keyword: DataFrame.any() rejects a positional axis in
# pandas 2.x, so the old `.any(1)` spelling raises a TypeError there.
data1[(np.abs(data1) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
152,0.559975,-0.384209,0.024959,-3.103279
285,3.076633,-0.635903,0.980109,1.333948
529,-3.611504,-1.115725,-0.770162,0.146386
539,2.092839,-0.183407,3.042139,0.983883
624,-0.343124,-2.449193,3.091267,-1.092982
872,1.231811,-0.093561,3.341183,-1.227341
925,1.449715,1.422769,0.849756,3.02225
