## map
#### 함수와 sequence를 인자로 받아서 각 element마다 입력받은 함수를 적용한다. Python 3에서는 list가 아닌 iterator(map 객체)를 반환하므로, 결과를 한 번에 확인하려면 list()로 감싼다.
#### 인자가 될 함수는 주로 lambda 형태로 사용한다.
#### map(function, sequence)

In [5]:
# Square every element of ex; list() is needed because map() is evaluated lazily.
ex = [1, 2, 3, 4, 5]
f = lambda x: x ** 2
list(map(f, ex))

[1, 4, 9, 16, 25]

In [6]:
# Rebind f to a two-argument function that adds its operands.
f = lambda x, y: x + y

In [7]:
list(map(f, ex, ex))
# When the function takes two arguments, pass two sequences as well.

[2, 4, 6, 8, 10]

In [8]:
# The lambda can be passed inline instead of being bound to a name first.
list(map(lambda x:x+x, ex))

[2, 4, 6, 8, 10]

## map for series
#### series type의 데이터에도 map 함수 사용 가능.
#### function 대신 dict 또는 Series를 넘겨 값을 매핑할 수도 있다. 대응되는 key(index)가 없는 값은 NaN이 된다.

In [14]:
from pandas import Series
import numpy as np

# Series built from np.arange(10), i.e. the integers 0 through 9.
s1 = Series(np.arange(10))
s1

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int32

In [15]:
# Series.map calls the function on each element and returns a new Series.
s1.map(lambda x: x**2)

0     0
1     1
2     4
3     9
4    16
5    25
6    36
7    49
8    64
9    81
dtype: int64

In [16]:
# Mapping with a dict: each value is looked up as a key;
# values that have no matching key become NaN.
z = {1: 'A', 2: 'B', 3: 'C'}
s1.map(z)

0    NaN
1      A
2      B
3      C
4    NaN
5    NaN
6    NaN
7    NaN
8    NaN
9    NaN
dtype: object

In [17]:
# Mapping with another Series: each value of s1 is looked up in s2's index
# and replaced by the value stored at that label.
s2 = Series(np.arange(10,20))
s1.map(s2)

0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int32

In [19]:
import pandas as pd

# Load the wages dataset (path is relative to the working directory)
# and preview the first rows.
df = pd.read_csv('./data/wages.csv')
df.head()

Unnamed: 0,earn,height,sex,race,ed,age
0,79571.299011,73.89,male,white,16,49
1,96396.988643,66.23,female,white,16,62
2,48710.666947,63.77,female,white,16,33
3,80478.096153,63.22,female,other,16,95
4,82089.345498,63.08,female,white,17,43


#### 성별 str -> 성별 code

In [20]:
# List the distinct values in the sex column before encoding them.
df.sex.unique()

array(['male', 'female'], dtype=object)

In [22]:
# Encode the sex strings as integer codes in a new column via a lookup dict.
df['sex_code'] = df['sex'].map({'male': 0, 'female': 1})
df.head()

Unnamed: 0,earn,height,sex,race,ed,age,sex_code
0,79571.299011,73.89,male,white,16,49,0
1,96396.988643,66.23,female,white,16,62,1
2,48710.666947,63.77,female,white,16,33,1
3,80478.096153,63.22,female,other,16,95,1
4,82089.345498,63.08,female,white,17,43,1


In [29]:
df.drop('sex',axis=1).head()
# drop() returns a new DataFrame here and leaves df itself unchanged;
# `del df['sex']` also removes the column, but it modifies df in place.

Unnamed: 0,earn,height,race,ed,age,sex_code
0,79571.299011,73.89,white,16,49,0
1,96396.988643,66.23,white,16,62,1
2,48710.666947,63.77,white,16,33,1
3,80478.096153,63.22,other,16,95,1
4,82089.345498,63.08,white,17,43,1


## replace function

In [26]:
df.sex.replace({'male' : 0, 'female' : 1}).head() # The same encoding works with replace, without map.

0    0
1    1
2    1
3    1
4    1
Name: sex, dtype: int64

In [28]:
# Replace the labels with codes and assign the result back to the column.
# Calling replace(..., inplace=True) on `df.sex` mutates an intermediate
# Series; with pandas copy-on-write enabled that no longer updates df.
df['sex'] = df['sex'].replace(['male', 'female'], [0, 1])
df.head()

Unnamed: 0,earn,height,sex,race,ed,age,sex_code
0,79571.299011,73.89,0,white,16,49,0
1,96396.988643,66.23,1,white,16,62,1
2,48710.666947,63.77,1,white,16,33,1
3,80478.096153,63.22,1,other,16,95,1
4,82089.345498,63.08,1,white,17,43,1


## apply

#### map과 달리 element 하나씩이 아니라, DataFrame의 각 column(axis=1이면 각 row)을 series 통째로 함수에 넘겨 적용시킨다.
#### 입력 값을 series 데이터로 입력받아 handling 할 수 있다.
#### 전체 자료의 통계값을 추출할 때 유용

In [36]:
# Re-read the CSV so df is rebound to a fresh copy of the file's contents.
df = pd.read_csv('./data/wages.csv')
df.head()

Unnamed: 0,earn,height,sex,race,ed,age
0,79571.299011,73.89,male,white,16,49
1,96396.988643,66.23,female,white,16,62
2,48710.666947,63.77,female,white,16,33
3,80478.096153,63.22,female,other,16,95
4,82089.345498,63.08,female,white,17,43


In [39]:
# Keep only the earn, height and age columns for the apply examples.
df_info = df[['earn','height','age']]
df_info.head()

Unnamed: 0,earn,height,age
0,79571.299011,73.89,49
1,96396.988643,66.23,62
2,48710.666947,63.77,33
3,80478.096153,63.22,95
4,82089.345498,63.08,43


In [40]:
# Spread (max - min) of each column; apply hands every column to f as a Series.
f = lambda x: x.max() - x.min()
df_info.apply(f)

earn      318047.708444
height        19.870000
age           73.000000
dtype: float64

In [41]:
df_info.sum()
# Same result as df_info.apply(sum).

earn      4.474344e+07
height    9.183125e+04
age       6.250800e+04
dtype: float64

In [45]:
# Passing the builtin sum to apply totals each column.
df_info.apply(sum)

earn      4.474344e+07
height    9.183125e+04
age       6.250800e+04
dtype: float64

#### scalar 값 외에 series 값의 반환도 가능하다.

In [47]:
def f(x):
    """Summarize x as a Series holding its min, max and mean, labelled accordingly."""
    labels = ['min', 'max', 'mean']
    values = [x.min(), x.max(), x.mean()]
    return Series(values, index=labels)
# f returns a Series per column, so the results are combined into a DataFrame
# with one row per statistic.
df_info.apply(f)

Unnamed: 0,earn,height,age
min,-98.580489,57.34,22.0
max,317949.127955,77.21,95.0
mean,32446.292622,66.59264,45.328499


In [48]:
# axis=1 passes each row to sum, giving one total per row.
df_info.apply(sum, axis=1)

0        79694.189011
1        96525.218643
2        48807.436947
3        80636.316153
4        82195.425498
5        15407.882901
6        47218.711821
7        51083.344282
8         3309.889556
9        43099.037884
10       10467.838843
11        1119.457155
12       47703.929864
13       19118.622299
14       20183.856639
15        1083.892346
16       36077.181123
17       27060.613964
18       64703.223972
19       70111.713070
20        1092.298306
21       12255.022115
22       84335.157919
23        9066.644935
24       23366.363278
25        8842.809185
26       64704.549805
27       54200.504945
28       16997.860044
29          -2.250279
            ...      
1349     64736.641872
1350     16047.423155
1351     25468.928379
1352     18601.227184
1353     26547.776457
1354     28745.820098
1355     20158.062736
1356      6449.932127
1357     95537.526798
1358     68547.843058
1359     50404.021292
1360     80629.389376
1361     44021.074343
1362     47842.290935
1363     1

## applymap

#### series 단위가 아닌 element 단위로 함수를 적용함
#### 단일 series(한 column)에 apply를 적용시킬 때와 같은 효과

In [49]:
# applymap calls f on every individual cell of the DataFrame.
# NOTE(review): DataFrame.applymap is deprecated since pandas 2.1 in favor of
# DataFrame.map — switch if this runs on a newer pandas.
f = lambda x : -x
df_info.applymap(f).head(5)

Unnamed: 0,earn,height,age
0,-79571.299011,-73.89,-49
1,-96396.988643,-66.23,-62
2,-48710.666947,-63.77,-33
3,-80478.096153,-63.22,-95
4,-82089.345498,-63.08,-43


In [50]:
# On a single column (a Series), apply calls f element by element,
# giving the same values as the earn column of the applymap result.
f = lambda x : -x
df_info['earn'].apply(f).head()

0   -79571.299011
1   -96396.988643
2   -48710.666947
3   -80478.096153
4   -82089.345498
Name: earn, dtype: float64