**5.2.6 함수 적용과 매핑**

In [21]:
import numpy as np
import pandas as pd

In [22]:
# Build a 4x3 DataFrame of random values with columns b, d, e and one row per state label.
frame = pd.DataFrame(np.random.randn(4,3), columns=list('bde'), index=['Utah','Ohio','Texas','Oregon'])
frame

Unnamed: 0,b,d,e
Utah,-0.822311,1.096857,0.346002
Ohio,0.547293,0.598301,-0.434436
Texas,-0.845517,-0.004694,-0.23987
Oregon,0.634424,0.755357,0.336344


In [23]:
# NumPy ufuncs accept a DataFrame: take the element-wise absolute value.
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.822311,1.096857,0.346002
Ohio,0.547293,0.598301,0.434436
Texas,0.845517,0.004694,0.23987
Oregon,0.634424,0.755357,0.336344


In [24]:
# f returns the spread (maximum minus minimum) of whatever it is given.
f = lambda x: x.max() - x.min()     # lambda: a way to define a function in a single line

In [25]:
# With the default axis, f is applied once per column (the result is indexed by b, d, e).
frame.apply(f)

b    1.479941
d    1.101551
e    0.780438
dtype: float64

In [26]:
# axis=1 applies f once per row instead (the result is indexed by the row labels).
frame.apply(f, axis=1)

Utah      1.919168
Ohio      1.032737
Texas     0.840824
Oregon    0.419012
dtype: float64

In [27]:
def f(x):
    """Return the smallest and largest value of ``x`` as a two-element Series.

    The returned Series is labelled ``'min'`` and ``'max'``, in that order.
    """
    lowest = x.min()
    highest = x.max()
    return pd.Series([lowest, highest], index=['min', 'max'])

# Because f returns a Series, apply yields a DataFrame: one row per returned label, one column per input column.
frame.apply(f)

Unnamed: 0,b,d,e
min,-0.845517,-0.004694,-0.434436
max,0.634424,1.096857,0.346002


In [28]:
# Format a number as a string with two decimal places.
# NOTE: the name `format` shadows Python's builtin of the same name; a later cell reuses this name.
format = lambda x: '%.2f' %x
# NOTE(review): applymap is reportedly deprecated in pandas 2.1+ in favour of DataFrame.map — confirm for the version in use.
frame.applymap(format)          # applymap: applies the given function to every element

Unnamed: 0,b,d,e
Utah,-0.82,1.1,0.35
Ohio,0.55,0.6,-0.43
Texas,-0.85,-0.0,-0.24
Oregon,0.63,0.76,0.34


In [29]:
# Series.map applies the element-wise function to a single column
# (`format` here is the lambda defined in the previous cell, not the builtin).
frame['e'].map(format)

Utah       0.35
Ohio      -0.43
Texas     -0.24
Oregon     0.34
Name: e, dtype: object

**5.2.7 정렬과 순위**

In [30]:
# sort_index orders the Series by its index labels (a, b, c, d here).
obj = pd.Series(range(4), index=['d','a','b','c'])
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [31]:
# 2x4 integer frame whose row and column labels are not in sorted order.
frame = pd.DataFrame(np.arange(8).reshape((2,4)), index=['three', 'one'], columns=['d','a','b','c'])
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [32]:
# Sort the rows by index label.
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [33]:
# axis=1 sorts the columns by label instead of the rows.
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [34]:
frame.sort_index(axis=1, ascending=False)     # default is ascending; ascending=False reverses the order

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [35]:
# sort_values orders a Series by its values rather than by its index.
obj = pd.Series([4,7,-3,2])
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [36]:
# Missing values (NaN) are placed at the end of the sorted result.
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [37]:
# Small two-column frame used to demonstrate sorting by column values.
frame = pd.DataFrame({'b':[4,7,-3,2], 'a':[0,1,0,1]})
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [38]:
# Sort the rows by the values in column 'b'.
frame.sort_values(by='b')

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [41]:
frame.sort_values(by=['a','b'])     # sort priority: 'a' first, then 'b' (rows with equal 'a' are ordered by 'b')

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


In [42]:
# rank assigns ranks starting at 1; by default tied values share the mean of their ranks (e.g. 6.5).
obj = pd.Series([7,-5,7,4,2,0,4])
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [43]:
obj.rank(method='first')      # ties are ranked in the order in which they appear in the data

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [44]:
obj.rank(ascending=False, method='max')       # descending ranks; tied values all get the highest rank of their group

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [45]:
# Frame with three numeric columns, used to demonstrate ranking across columns.
frame = pd.DataFrame({'b':[4.3,7,-3,2], 'a':[0,1,0,1], 'c':[-2,5,8,-2.5]})
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [48]:
# axis='columns' ranks the values within each row.
frame.rank(axis='columns')

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


**5.2.8 중복 색인**

In [49]:
# A Series whose index contains repeated labels ('a' and 'b' each appear twice).
obj = pd.Series(range(5), index=['a','a','b','b','c'])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [50]:
obj.index.is_unique     # tells whether the index labels are all unique

False

In [51]:
obj['a']      # label 'a' is duplicated, so a Series is returned

a    0
a    1
dtype: int64

In [52]:
obj['c']      # label 'c' occurs once, so a scalar is returned

4

In [54]:
# DataFrame of random values with duplicated row labels.
df = pd.DataFrame(np.random.randn(4,3), index=['a','a','b','b'])
df

Unnamed: 0,0,1,2
a,0.434286,-0.201398,-0.51022
a,-0.245374,-1.23108,-0.151327
b,-0.77004,0.822876,-0.544771
b,1.363848,-1.836202,0.074353


In [55]:
# Selecting a duplicated label with loc returns every matching row.
df.loc['b']

Unnamed: 0,0,1,2
b,-0.77004,0.822876,-0.544771
b,1.363848,-1.836202,0.074353


**5.3 기술 통계 계산과 요약**

In [56]:
# Small frame containing missing values (NaN), used by the summary-statistics examples below.
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
                  index = ['a','b','c','d'],
                  columns = ['one', 'two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [57]:
# Column sums; NaN entries are skipped.
df.sum()

one    9.25
two   -5.80
dtype: float64

In [58]:
# axis=1 sums across each row; the all-NaN row 'c' sums to 0.
df.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [63]:
df.mean(axis='columns',skipna=False)      # skipna: whether NA values are excluded from the computation (excluded by default)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [66]:
# idxmax gives, for each column, the index label at which the maximum occurs.
df.idxmax()

one    b
two    d
dtype: object

In [69]:
# Running (cumulative) sum down each column; NaN positions stay NaN.
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [70]:
# describe produces several summary statistics at once (count, mean, std, min, quartiles, max).
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [71]:
# For non-numeric data, describe reports count, unique, top and freq instead.
obj = pd.Series(['a','a','b','c']*4)
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

**5.3.1 상관관계와 공분산**

In [76]:
import pandas_datareader

In [None]:
import pandas_datareader.data as web
# Fetch one table per ticker through web.get_data_yahoo, keyed by ticker symbol.
# NOTE(review): presumably this downloads from Yahoo Finance and needs network access — confirm it still works.
all_data = {ticker: web.get_data_yahoo(ticker)
            for ticker in ['AAPL','IBM','MSFT','GOOG']}
# Collect each ticker's 'Adj Close' column into one DataFrame (one column per ticker).
price = pd.DataFrame({ticker: data['Adj Close']
                      for ticker, data in all_data.items()})
# Same layout for the 'Volume' column.
volume = pd.DataFrame({ticker: data['Volume']
                       for ticker, data in all_data.items()})

**5.3.2 유일값, 값 세기, 멤버십**

In [80]:
# unique returns the distinct values of the Series as an array (here in order of first appearance).
obj = pd.Series(['c','a','d','a','a','b','b','c','c'])
uniques = obj.unique()
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [87]:
# value_counts tallies how often each value occurs, listed by descending count.
obj.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [88]:
# Count occurrences of each value in the raw array without sorting by frequency.
# Wrapping the array in a Series and calling its value_counts method replaces the
# top-level pd.value_counts function, which is deprecated in pandas 2.1+.
pd.Series(obj.values).value_counts(sort=False)

d    1
a    3
c    3
b    2
dtype: int64

In [89]:
# Display obj again for reference before filtering it.
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [92]:
# Boolean mask: True where the value is 'b' or 'c'.
mask = obj.isin(['b','c'])

In [93]:
# Keep only the elements selected by the mask.
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [96]:
to_match = pd.Series(['c','a','b','b','c','a'])
unique_vals = pd.Series(['c','b','a'])

pd.Index(unique_vals).get_indexer(to_match)     # for each value in to_match, find its position in unique_vals

array([0, 2, 1, 1, 0, 2])

In [97]:
# Three columns (Qu1, Qu2, Qu3) of small integers, used for the per-column value counts below.
data = pd.DataFrame({'Qu1':[1,3,4,3,4], 'Qu2':[2,3,1,2,3], 'Qu3':[1,5,2,4,4]})
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [99]:
# Count the values of every column, then fill in 0 where a value never occurs in a column.
# pd.Series.value_counts is used because the top-level pd.value_counts function is
# deprecated in pandas 2.1+.
result = data.apply(pd.Series.value_counts).fillna(0)
result       # the row labels of the result are the distinct values found across all columns;
             # each entry is how many times that value occurs in that column.

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0
