## 06-1 간단한 함수 만들기

#### [Do It! 실습] 사용자 함수 만들기

In [29]:
def my_sq(x):
    """Return ``x`` raised to the second power."""
    return pow(x, 2)

In [30]:
def avg_2(x, y):
    """Return the arithmetic mean of ``x`` and ``y``."""
    total = x + y
    return total / 2

- 독스트링 : docstring -> 함수 도움말

In [31]:
def avg_2(x, y):
    """Compute the average of two numbers."""
    half_count = 2
    return (x + y) / half_count

In [32]:
my_calc_1 = my_sq(4)
print(my_calc_1)

16


In [33]:
my_calc_2 = avg_2(10, 20)
print(my_calc_2)

15.0


## 06-2 apply() 메서드 사용하기

- series 에 적용되는 apply()
  - 연산 방향은 1 방향
- dataframe 에 적용되는 apply()
  - axis 방향을 지정할 수 있음

- func : 사용할 함수명
- `**kwds` : 키워드로 지정해서 넘길 수 있는 함수의 매개변수, 복수 전달 가능
  - .items()
  - .keys()


In [34]:
def test1(**kwds):
    """Print each keyword argument as a ``name value`` line."""
    for name in kwds:
        print(name, kwds[name])

test1(key1 = 3, key2 = 4)

key1 3
key2 4


- `*args` : 값만 지정해서 넘길 수 있는 함수의 매개변수, 복수 전달 가능

In [35]:
def test2(*args):
    """Print every positional argument, each followed by an empty line."""
    for item in args:
        # The extra '\n' argument is what produces the blank line after each item.
        print(item, '\n')

test2('Tom', 'Can')

Tom 

Can 



#### [Do It! 실습] 데이터프레임에 함수 적용하기

In [36]:
import pandas as pd

df = pd.DataFrame({"a": [10, 20, 30], 
                   "b": [20, 30, 40]})
print(df)

    a   b
0  10  20
1  20  30
2  30  40


In [37]:
print(df['a'] ** 2)

0    100
1    400
2    900
Name: a, dtype: int64


### 시리즈에 함수 적용하기

#### [Do It! 실습] 시리즈에 함수 적용하기

In [38]:
print(type(df['a']))

<class 'pandas.core.series.Series'>


In [39]:
print(type(df.iloc[0]))

<class 'pandas.core.series.Series'>


In [40]:
# Series.apply calls my_sq once for every value in column 'a'.
sq = df['a'].apply(my_sq)
print(sq)

0    100
1    400
2    900
Name: a, dtype: int64


#### [Do It! 실습] 사용자 함수 만들어 데이터프레임에 적용하기

In [41]:
def my_exp(x, e):
    """Return ``x`` raised to the power ``e``."""
    return pow(x, e)

In [42]:
cubed = my_exp(2, 3)

print(cubed)

8


In [43]:
# my_exp(2)  # error: my_exp requires both x and e

In [44]:
# Extra keyword arguments given to apply() are forwarded to the function,
# so every value of column 'a' is raised to the power e=2.
ex = df['a'].apply(my_exp, e=2)
print(ex)

0    100
1    400
2    900
Name: a, dtype: int64


In [45]:
ex = df['a'].apply(my_exp, e=3)
print(ex)

0     1000
1     8000
2    27000
Name: a, dtype: int64


### 데이터프레임에 함수 적용하기

In [46]:
df = pd.DataFrame({"a": [10, 20, 30], 
                   "b": [20, 30, 40]})
print(df)

    a   b
0  10  20
1  20  30
2  30  40


In [47]:
def print_me(x):
    """Print ``x``; used to inspect what ``apply`` passes to a function."""
    print(x)

#### [Do It! 실습] 열 단위로 함수 적용하기

In [48]:
# axis=0: print_me receives each column in turn as a whole Series.
df.apply(print_me, axis=0)

0    10
1    20
2    30
Name: a, dtype: int64
0    20
1    30
2    40
Name: b, dtype: int64


a    None
b    None
dtype: object

In [49]:
print(df['a'])

0    10
1    20
2    30
Name: a, dtype: int64


In [50]:
print(df['b'])

0    20
1    30
2    40
Name: b, dtype: int64


In [51]:
def avg_3(x, y, z):
    """Return the arithmetic mean of three values."""
    total = x + y + z
    return total / 3

In [52]:
# print(df.apply(avg_3))  # error: apply passes each column as one argument, but avg_3 requires x, y and z

In [53]:
def avg_3_apply(col):
    """Return the mean of the first three values of ``col``.

    Written to be passed to ``DataFrame.apply`` so that a whole column
    arrives as the single argument.

    Parameters
    ----------
    col : pandas.Series or sequence
        Holds at least three numeric values.

    Returns
    -------
    The sum of the first three values divided by 3.
    """
    # Read by position, not by label: ``col[0]`` on a Series is a label
    # lookup that only works while the index happens to be 0, 1, 2.
    # Lists and arrays have no ``.iloc`` and are indexed directly.
    by_position = getattr(col, "iloc", col)
    x = by_position[0]
    y = by_position[1]
    z = by_position[2]
    return (x + y + z) / 3

In [54]:
# With the default axis, avg_3_apply is called once per column (a, then b).
print(df.apply(avg_3_apply))

a    20.0
b    30.0
dtype: float64


#### [Do It! 실습] 행 단위로 함수 적용하기

In [55]:
# print(df.apply(avg_3_apply, axis=1))  # error: each row holds only two values (a, b), so there is no third element to read

In [56]:
def avg_2_apply(row):
    """Return the mean of the first two values of ``row``.

    Written to be passed to ``DataFrame.apply`` so that a whole row
    arrives as the single argument.

    Parameters
    ----------
    row : pandas.Series or sequence
        Holds at least two numeric values.

    Returns
    -------
    The sum of the first two values divided by 2.
    """
    # A row Series is labelled by column name, so plain ``row[0]`` falls back
    # to the deprecated integer-as-position lookup. ``.iloc`` is explicitly
    # positional; lists and arrays have no ``.iloc`` and are indexed directly.
    by_position = getattr(row, "iloc", row)
    x = by_position[0]
    y = by_position[1]
    return (x + y) / 2

In [57]:
print(df.apply(avg_2_apply, axis=0))

a    15.0
b    25.0
dtype: float64


## 06-3 람다 함수 사용하기 

#### [Do It! 실습] 데이터프레임에 람다 함수 사용하기

In [58]:
df = pd.DataFrame({'a': [10, 20, 30],
                   'b': [20, 30, 40]})
print(df)

    a   b
0  10  20
1  20  30
2  30  40


In [59]:
def my_sq(x):
    """Square ``x``."""
    squared = x ** 2
    return squared

df['a_sq'] = df['a'].apply(my_sq)
print(df)

    a   b  a_sq
0  10  20   100
1  20  30   400
2  30  40   900


In [60]:
df['a_sq_lamb'] = df['a'].apply(lambda x: x ** 2)
print(df)

    a   b  a_sq  a_sq_lamb
0  10  20   100        100
1  20  30   400        400
2  30  40   900        900


- lambda 로 불명확한 코드(짧은 변수명) 작성은 후에 판독하기 어려워짐

In [61]:
df.apply(lambda x : print(x[0], x[1]), axis = 1)

10 20
20 30
30 40


  df.apply(lambda x : print(x[0], x[1]), axis = 1)


0    None
1    None
2    None
dtype: object

In [62]:
# axis=0 hands each column to the lambda, which prints that column's first three values.
df.apply(lambda x : print(x[0], x[1], x[2]), axis = 0)

10 20 30
20 30 40
100 400 900
100 400 900


a            None
b            None
a_sq         None
a_sq_lamb    None
dtype: object

## 06-4 벡터화된 함수 사용하기

#### [Do It! 실습] 벡터화된 함수 사용하기

In [63]:
df = pd.DataFrame({"a": [10, 20, 30], 
                   "b": [20, 30, 40]})
print(df)

    a   b
0  10  20
1  20  30
2  30  40


In [64]:
def avg_2(x, y):
    """Return the mean of ``x`` and ``y``.

    Only ``+`` and ``/`` are used, so any operands supporting those
    operators are accepted.
    """
    combined = x + y
    return combined / 2

In [65]:
# avg_2 uses only + and /, so passing two Series yields an element-wise result.
print(avg_2(df['a'], df['b']))

0    15.0
1    25.0
2    35.0
dtype: float64


In [66]:
import numpy as np

def avg_2_mod(x, y):
    """Return the mean of ``x`` and ``y``, or NaN when ``x`` equals 20.

    ``x == 20`` is used directly as an ``if`` condition, so ``x`` must be a
    single value rather than a whole Series.
    """
    if x == 20:
        # np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
        return np.nan
    return (x + y) / 2

In [67]:
# print(avg_2_mod(df['a'], df['b']))  # error: `x == 20` on a Series yields a Series, which cannot serve as a single if-condition

In [68]:
print(avg_2_mod(10, 20))

15.0


In [69]:
print(avg_2_mod(20, 30))

nan


### 넘파이와 넘바로 벡터화하기

#### [Do It! 실습] 넘파이로 벡터화하기

In [70]:
import numpy as np

# Wrap the scalar-only avg_2_mod so it is applied element by element.
avg_2_mod_vec = np.vectorize(avg_2_mod)

In [71]:
print(avg_2_mod_vec(df['a'], df['b']))

[15. nan 35.]


In [72]:
@np.vectorize
def v_avg_2_mod(x, y):
    """Return the mean of ``x`` and ``y``, or NaN when ``x`` equals 20.

    The body handles one pair of values; the ``np.vectorize`` decorator
    applies it across array-like inputs.
    """
    if x == 20:
        # np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
        return np.nan
    return (x + y) / 2

print(v_avg_2_mod(df['a'], df['b']))

[15. nan 35.]


#### [Do It! 실습] 넘바로 벡터화하기

In [73]:
import numba

@numba.vectorize
def v_avg_2_numba(x, y):
    """Return the mean of ``x`` and ``y``, or NaN when ``int(x)`` equals 20.

    The body handles one pair of values; ``numba.vectorize`` applies it
    across its array inputs.
    """
    if int(x) == 20:
        # np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
        return np.nan
    else:
        return (x + y) / 2

ModuleNotFoundError: No module named 'numba'

In [None]:
# print(v_avg_2_numba(df['a'], df['b']))  # error (per the original note); presumably the numba-compiled function does not accept Series -- TODO confirm

In [None]:
print(v_avg_2_numba(df['a'].values, df['b'].values))

# pydataset을 이용한 pivot 연습

In [25]:
!pip install pydataset

Collecting pydataset
  Downloading pydataset-0.2.0.tar.gz (15.9 MB)
     ---------------------------------------- 0.0/15.9 MB ? eta -:--:--
     - -------------------------------------- 0.7/15.9 MB 14.2 MB/s eta 0:00:02
     ----- ---------------------------------- 2.0/15.9 MB 21.6 MB/s eta 0:00:01
     ---------- ----------------------------- 4.2/15.9 MB 30.0 MB/s eta 0:00:01
     ------------- -------------------------- 5.2/15.9 MB 33.5 MB/s eta 0:00:01
     ------------- -------------------------- 5.2/15.9 MB 33.5 MB/s eta 0:00:01
     ------------- -------------------------- 5.2/15.9 MB 33.5 MB/s eta 0:00:01
     --------------- ------------------------ 6.3/15.9 MB 20.1 MB/s eta 0:00:01
     -------------------- ------------------- 8.3/15.9 MB 22.2 MB/s eta 0:00:01
     ----------------------------- --------- 12.1/15.9 MB 29.7 MB/s eta 0:00:01
     --------------------------------------  15.9/15.9 MB 65.6 MB/s eta 0:00:01
     --------------------------------------- 15.9/15.9 MB 59

In [26]:
import pydataset

initiated datasets repo at: C:\Users\asiae\.pydataset/


In [27]:
tips = pydataset.data('tips')

In [28]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [74]:
tips.info()

<class 'pandas.core.frame.DataFrame'>
Index: 244 entries, 1 to 244
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 15.2+ KB


In [75]:
# Tip as a fraction of the total bill (a ratio such as 0.16, not multiplied by 100).
tips['tip_pct'] = tips['tip'] / tips['total_bill']

In [76]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


In [78]:
tips.groupby(['day', 'smoker']).tip_pct.mean()

day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64

In [79]:
tips.groupby(['day', 'smoker'])['tip_pct'].mean()

day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64

In [83]:
# No aggfunc is given, so pivot_table falls back to its default aggregation (the mean).
tips.pivot_table(index=['time', 'day'],
                columns = 'smoker',
                values = ['tip_pct', 'size'])

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,No,Yes,No,Yes
time,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Dinner,Fri,2.0,2.222222,0.139622,0.165347
Dinner,Sat,2.555556,2.47619,0.158048,0.147906
Dinner,Sun,2.929825,2.578947,0.160113,0.18725
Dinner,Thur,2.0,,0.159744,
Lunch,Fri,3.0,1.833333,0.187735,0.188937
Lunch,Thur,2.5,2.352941,0.160311,0.163863


In [84]:
# Name the aggregation as a string: handing pandas the builtin `sum`
# is deprecated and emits a FutureWarning.
pd.pivot_table(tips, index=['time', 'day'],
               columns='smoker',
               values=['tip_pct', 'size'],
               aggfunc='sum')


Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,No,Yes,No,Yes
time,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Dinner,Fri,6.0,20.0,0.418867,1.488126
Dinner,Sat,115.0,104.0,7.112145,6.212055
Dinner,Sun,167.0,49.0,9.126438,3.557756
Dinner,Thur,2.0,,0.159744,
Lunch,Fri,3.0,11.0,0.187735,1.13362
Lunch,Thur,110.0,40.0,7.053669,2.785676


In [85]:
# Name the aggregation as a string: handing pandas the builtin `sum`
# is deprecated and emits a FutureWarning.
# margins=True adds the "All" row and column of totals.
pd.pivot_table(tips, index=['time', 'day'],
               columns='smoker',
               values=['tip_pct', 'size'],
               aggfunc='sum',
               margins=True)


Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,size,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,No,Yes,All,No,Yes,All
time,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Dinner,Fri,6.0,20.0,26,0.418867,1.488126,1.906993
Dinner,Sat,115.0,104.0,219,7.112145,6.212055,13.324199
Dinner,Sun,167.0,49.0,216,9.126438,3.557756,12.684194
Dinner,Thur,2.0,,2,0.159744,,0.159744
Lunch,Fri,3.0,11.0,14,0.187735,1.13362,1.321354
Lunch,Thur,110.0,40.0,150,7.053669,2.785676,9.839345
All,,403.0,224.0,627,24.058598,15.177232,39.23583


In [86]:
# Custom aggregation: the lambda returns each group's range (max - min).
pd.pivot_table(tips, index=['time', 'day'],
                columns = 'smoker',
                values = ['tip_pct', 'size'],
                aggfunc = lambda x : x.max() - x.min(),
                margins = True)

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,size,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,No,Yes,All,No,Yes,All
time,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Dinner,Fri,0.0,2.0,2,0.035239,0.159925,0.159925
Dinner,Sat,3.0,4.0,4,0.235193,0.290095,0.290095
Dinner,Sun,4.0,3.0,4,0.193226,0.644685,0.650898
Dinner,Thur,0.0,,0,0.0,,0.0
Lunch,Fri,0.0,1.0,2,0.0,0.14158,0.14158
Lunch,Thur,5.0,2.0,5,0.19335,0.15124,0.19335
All,,5.0,4.0,5,0.235193,0.674707,0.674707


In [87]:
def custom_add(x):
    """Return the total of ``x`` as reported by its ``sum`` method."""
    total = x.sum()
    return total

# A named function can be passed as aggfunc just like a lambda; custom_add sums each group.
pd.pivot_table(tips, index=['time', 'day'],
                columns = 'smoker',
                values = ['tip_pct', 'size'],
                aggfunc = custom_add,
                margins = True)

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,size,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,No,Yes,All,No,Yes,All
time,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Dinner,Fri,6.0,20.0,26,0.418867,1.488126,1.906993
Dinner,Sat,115.0,104.0,219,7.112145,6.212055,13.324199
Dinner,Sun,167.0,49.0,216,9.126438,3.557756,12.684194
Dinner,Thur,2.0,,2,0.159744,,0.159744
Lunch,Fri,3.0,11.0,14,0.187735,1.13362,1.321354
Lunch,Thur,110.0,40.0,150,7.053669,2.785676,9.839345
All,,403.0,224.0,627,24.058598,15.177232,39.23583
