In [2]:
import numpy as np
import pandas as pd

## 3. 산술 연산과 데이터 정렬

In [2]:
# Two Series whose indexes only partly overlap ('d' is only in s1; 'f' and 'g' only in s2)
s1 = pd.Series(data=[7.3, -2.5, 3.4, 1.5], index=list('acde'))
s2 = pd.Series(data=[-2.1, 3.6, -1.5, 4, 3.1], index=list('acefg'))

In [3]:
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [4]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [5]:
s1 + s2      # the result is NaN for any label that is missing from either Series

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [7]:
# Two integer frames with different shapes and only partly overlapping columns,
# used to show label alignment in arithmetic.
df1 = pd.DataFrame(np.arange(12).reshape(3, 4), columns = list('abcd'))
df2 = pd.DataFrame(np.arange(20).reshape(4, 5), columns = list('abcde'))
# Column 'b' has to hold a NaN, so make it float explicitly first rather than
# relying on the assignment to upcast an integer column implicitly.
df2 = df2.astype({'b': 'float64'})
df2.loc[1, 'b'] = np.nan

In [8]:
df1

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [9]:
df2

Unnamed: 0,a,b,c,d,e
0,0,1.0,2,3,4
1,5,,7,8,9
2,10,11.0,12,13,14
3,15,16.0,17,18,19


In [10]:
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [11]:
df1.add(df2, fill_value=0) # a value missing on only one side is treated as 0 before the addition

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [3]:
# 4 x 3 float frame labelled by state; `series` is its first row (Utah)
frame = pd.DataFrame(
    data=np.arange(12.0).reshape(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)

series = frame.iloc[0]

In [4]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [5]:
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [12]:
frame - series  # 2-D minus 1-D: series is matched on the column labels and subtracted from every row

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [13]:
series2 = pd.Series(range(3), index = ['b', 'e', 'f'])
series2

b    0
e    1
f    2
dtype: int64

In [14]:
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [16]:
series3 = frame['d']
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [17]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [18]:
# Plain `-` matches series3's index (the row labels) against frame's columns;
# no label is shared, so the result is all NaN over the union of both label sets.
frame - series3

Unnamed: 0,Ohio,Oregon,Texas,Utah,b,d,e
Utah,,,,,,,
Ohio,,,,,,,
Texas,,,,,,,
Oregon,,,,,,,


In [23]:
frame.sub(series3, axis = 'index') # 2-D minus 1-D, matched on the row index instead of the columns

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


## 4. 함수 적용과 매핑

In [24]:
frame = pd.DataFrame(np.arange(12).reshape(4,3), columns= list('bde'),
                     index = ["Utah", "Ohio", "Texas", "Oregon"])
frame

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [30]:
frame.sum()  # unlike numpy, pandas uses axis = 0 as the default (one sum per column)

b    18
d    22
e    26
dtype: int64

In [31]:
frame.sum(axis = 1)

Utah       3
Ohio      12
Texas     21
Oregon    30
dtype: int64

In [32]:
frame.sum(axis = 0)

b    18
d    22
e    26
dtype: int64

In [34]:
# Sum across the columns, i.e. one total per row (same as axis = 1)
frame.sum(axis = 'columns')

Utah       3
Ohio      12
Texas     21
Oregon    30
dtype: int64

In [37]:
# Row-wise sums
np.sum(frame, axis = 1) # the numpy function can be used on a DataFrame as well

Utah       3
Ohio      12
Texas     21
Oregon    30
dtype: int64

In [40]:
frame.min(axis = 0)

b    0
d    1
e    2
dtype: int32

In [38]:
frame.max(axis = 1)

Utah       2
Ohio       5
Texas      8
Oregon    11
dtype: int32

In [39]:
frame

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [43]:
frame.max() - frame.min()

b    9
d    9
e    9
dtype: int32

In [48]:
# Same computation as the previous cell, wrapped in a reusable function (identical result)
def range_f(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    When passed to ``frame.apply`` with the default axis, ``x`` is one
    column of the frame as a Series.
    """
    largest = x.max()
    smallest = x.min()
    return largest - smallest

frame.apply(range_f)    # axis = 0 is the default: range_f receives one column at a time

b    9
d    9
e    9
dtype: int64

### applymap: 모든 원소에 함수를 적용할 때 사용

In [51]:
x = 0.45678
'%.2f' %x

'0.46'

In [53]:
def fmt(x):
    """Format ``x`` as a string with exactly two digits after the decimal point."""
    two_decimals = '%.2f'
    return two_decimals % x

In [54]:
frame.applymap(fmt)   # applymap applies fmt to every element of frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [57]:
# Same element-wise formatting as fmt, written inline as a lambda.
# (The stand-alone lambda expression that preceded this call was a no-op and is removed.)
frame.applymap(lambda x: '%.2f' %x)

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [59]:
# A Series has no applymap; use Series.map for element-wise functions instead.
# The expected error is caught and shown so that Run All continues past this cell.
try:
    frame['b'].applymap(fmt)
except AttributeError as err:
    print('AttributeError:', err)

AttributeError: 'Series' object has no attribute 'applymap'

In [60]:
frame['b'].map(fmt)

Utah      0.00
Ohio      3.00
Texas     6.00
Oregon    9.00
Name: b, dtype: object

### Workshop

In [61]:
import seaborn as sns # 시각화 모듈

In [62]:
titanic = sns.load_dataset('titanic') # 데이터를 DataFrame으로 반환

In [63]:
type(titanic)

pandas.core.frame.DataFrame

In [64]:
# Take a quick look at the first rows of the data
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


- 두 열(age, fare)만 색인해서 DataFrame으로 만들기

In [68]:
df = titanic[['age', 'fare']]
df

Unnamed: 0,age,fare
0,22.0,7.2500
1,38.0,71.2833
2,26.0,7.9250
3,35.0,53.1000
4,35.0,8.0500
...,...,...
886,27.0,13.0000
887,19.0,30.0000
888,,23.4500
889,26.0,30.0000


- 각 열의 최댓값과 최솟값 구하기

In [70]:
df.max(), df.min()

(age      80.0000
 fare    512.3292
 dtype: float64,
 age     0.42
 fare    0.00
 dtype: float64)

- 각 열의 "최댓값과 최솟값의 차이" 구하기 (apply 함수 이용)

In [71]:
def a(x):
    """Return the difference between the maximum and the minimum of ``x``."""
    top = x.max()
    bottom = x.min()
    return top - bottom

In [72]:
df.apply(a)

age      79.5800
fare    512.3292
dtype: float64

- 모든 원소의 format을 소수점 두자리로 맞추기

In [78]:
def b(x):
    """Return ``x`` rendered as a string with two digits after the decimal point."""
    pattern = '%.2f'
    return pattern % x

In [79]:
df.applymap(b)

Unnamed: 0,age,fare
0,22.00,7.25
1,38.00,71.28
2,26.00,7.92
3,35.00,53.10
4,35.00,8.05
...,...,...
886,27.00,13.00
887,19.00,30.00
888,,23.45
889,26.00,30.00


- 누락된 값(NaN)이 있는지 Boolean value로 확인하기

In [80]:
pd.isnull(df)

Unnamed: 0,age,fare
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
886,False,False
887,False,False
888,True,False
889,False,False


## 5. 정렬과 순위

In [84]:
obj = pd.Series(np.arange(4), index = ['d', 'a', 'b', 'c'])
obj

d    0
a    1
b    2
c    3
dtype: int32

In [87]:
obj.sort_values()

d    0
a    1
b    2
c    3
dtype: int32

In [85]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int32

In [88]:
frame = pd.DataFrame(np.arange(8).reshape(2, 4), index = ['three', 'one'], columns = ['d', 'a', 'b', 'c'])
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [89]:
frame.sort_index(axis = 0) # sort the rows by their index labels

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [90]:
frame.sort_index(axis = 1)  # sort the columns by their labels

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [91]:
frame = pd.DataFrame({'b':[4, 7, -3, 2], 'c':[0, 1, 0, 1]})
frame

Unnamed: 0,b,c
0,4,0
1,7,1
2,-3,0
3,2,1


In [98]:
frame.sort_values(by = 'b', axis = 0)

Unnamed: 0,b,c
2,-3,0
3,2,1
0,4,0
1,7,1


In [99]:
frame.sort_values(by = 'c', axis = 0)

Unnamed: 0,b,c
0,4,0
2,-3,0
1,7,1
3,2,1


In [101]:
frame.sort_values(by = 'c', axis = 0, ascending = False)

Unnamed: 0,b,c
1,7,1
3,2,1
0,4,0
2,-3,0


In [106]:
frame.sort_values(by = ['c', 'b'], axis = 0)

Unnamed: 0,b,c
2,-3,0
0,4,0
3,2,1
1,7,1


## 6. 중복색인

In [109]:
obj = pd.Series(np.arange(5), index = ['a', 'a', 'b', 'b', 'c'])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int32

In [112]:
obj.index

Index(['a', 'a', 'b', 'b', 'c'], dtype='object')

In [114]:
obj.index.is_unique

False

In [115]:
obj['a']

a    0
a    1
dtype: int32

In [116]:
obj.b

b    2
b    3
dtype: int32

In [117]:
obj.c

4

In [118]:
# Frame with duplicated row labels, used to show label-based selection with duplicates.
# NOTE(review): no seed is set in the visible cells, so these random values
# presumably differ on every run - confirm whether reproducibility matters here.
df = pd.DataFrame(np.random.randn(4, 3), index = ['a', 'a', 'b', 'b'])
df

Unnamed: 0,0,1,2
a,0.658454,1.668506,0.953461
a,-0.064832,-1.356557,-1.50632
b,0.476185,0.201552,0.552198
b,-1.585303,-1.826596,-1.828879


In [121]:
df.loc['b']

Unnamed: 0,0,1,2
b,0.476185,0.201552,0.552198
b,-1.585303,-1.826596,-1.828879


## 7. 기술 통계 계산과 요약

In [122]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                   [np.nan, np.nan], [0.75, -1.3]],
                  index=['a', 'b', 'c', 'd'],
                  columns=['one', 'two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [123]:
df.sum() # axis = 0 is the default

one    9.25
two   -5.80
dtype: float64

In [124]:
df.sum(axis = 0, skipna = True) # skipna = True is the default

one    9.25
two   -5.80
dtype: float64

In [125]:
df.sum(axis = 0, skipna = False)

one   NaN
two   NaN
dtype: float64

In [126]:
df.sum(axis = 1, skipna = False)

a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64

In [127]:
# numpy: argmax() / argmin() give the position of the maximum / minimum
# pandas: idxmax() / idxmin() give the index label of the maximum / minimum

In [128]:
df.idxmax()

one    b
two    d
dtype: object

In [129]:
df.idxmin()   # index label of the minimum value in each column

one    d
two    b
dtype: object

In [130]:
df.cumsum() # axis = 0 is the default   # cumulative sum down each column

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [132]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [131]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [134]:
obj = pd.Series(['a', 'a', 'b', 'b'] * 4)
obj

0     a
1     a
2     b
3     b
4     a
5     a
6     b
7     b
8     a
9     a
10    b
11    b
12    a
13    a
14    b
15    b
dtype: object

In [135]:
obj.describe()  # for string data, describe() reports different statistics (count / unique / top / freq)

count     16
unique     2
top        a
freq       8
dtype: object

In [136]:
titanic.describe() # only the numeric columns are summarised

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [139]:
titanic.describe(include = 'all')  # use include = 'all' to summarise the non-numeric columns as well

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
count,891.0,891.0,891,714.0,891.0,891.0,891.0,889,891,891,891,203,889,891,891
unique,,,2,,,,,3,3,3,2,7,3,2,2
top,,,male,,,,,S,Third,man,True,C,Southampton,no,True
freq,,,577,,,,,644,491,537,537,59,644,549,537
mean,0.383838,2.308642,,29.699118,0.523008,0.381594,32.204208,,,,,,,,
std,0.486592,0.836071,,14.526497,1.102743,0.806057,49.693429,,,,,,,,
min,0.0,1.0,,0.42,0.0,0.0,0.0,,,,,,,,
25%,0.0,2.0,,20.125,0.0,0.0,7.9104,,,,,,,,
50%,0.0,3.0,,28.0,0.0,0.0,14.4542,,,,,,,,
75%,1.0,3.0,,38.0,1.0,0.0,31.0,,,,,,,,


### 7.1 상관관계와 공분산

In [None]:
df = pd.DataFrame({"math" : [50, 60, 40, 30, 70, 50], "physics" : [40, 60, 50, 20, 80, 50]})
df

Unnamed: 0,math,physics
0,50,40
1,60,60
2,40,50
3,30,20
4,70,80
5,50,50
