In [1]:
import pandas as pd
import numpy as np

In [17]:
# Integer Series 0..3 whose string index is deliberately out of order.
obj = pd.Series(data=range(4), index=list("dabc"))

In [3]:
# Sort by index label (a, b, c, d); each value travels with its label.
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [8]:
# Repeat of the same index sort: same call, same result.
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

### 데이터프레임 Sort 해보기

In [12]:
# 2x4 frame holding 0..7; row and column labels are deliberately unsorted.
frame = pd.DataFrame(
    data=np.arange(8).reshape(2, 4),
    index=["three", "one"],
    columns=list("dabc"),
)

In [13]:
# Inspect the frame: rows and columns appear in the order they were given.
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [16]:
# Sort the DataFrame's columns by label: sort_index with axis=1.
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [18]:
# Same column sort, but in descending rather than ascending order.
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


### value sort 하기 

In [24]:
# This time the goal is to sort by value rather than by index label.
obj = pd.Series(data=[4, 7, -3, 2])
obj

0    4
1    7
2   -3
3    2
dtype: int64

In [25]:
# Sort by value, ascending; each value keeps its original index label.
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [40]:
# Same value sort, largest value first.
obj.sort_values(ascending=False)

1    7
0    4
3    2
2   -3
dtype: int64

## Sort Key로 한 칼럼을 이용할 수 있다 ! 

In [65]:
# Series with two missing values, used to see where NaN lands when sorting.
obj = pd.Series(
    data=[4, np.nan, 7, np.nan, -3, 2],
)
obj

0    4.0
1    NaN
2    7.0
3    NaN
4   -3.0
5    2.0
dtype: float64

In [66]:
# Ascending sort by value; the NaN entries are placed at the end.
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [67]:
# Two-column frame used to demonstrate sorting rows by column values.
frame = pd.DataFrame(dict(b=[4, 7, -3, 2], a=[0, 1, 0, 1]))

In [68]:
# Inspect the frame before sorting it.
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [69]:
# Sort the rows by the values in column 'b' (ascending).
frame.sort_values(by='b')

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


## 이렇게 두 칼럼으로도 sorting 가능함 ! 

In [73]:
# Sort by 'a' first, then order rows with equal 'a' by 'b'.
frame.sort_values(by=['a','b'])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


In [74]:
# Key order reversed: 'b' is the primary key and 'a' only breaks ties.
# 'b' has no duplicates here, so the result matches sorting by 'b' alone.
frame.sort_values(by=['b','a'])

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [76]:
# frame still has its original row order: the sort_values calls did not modify it.
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


## Rank로 정렬해보기

In [91]:
# Three numeric columns (b, a, c) to rank against one another.
frame = pd.DataFrame(dict(
    b=[4.3, 7, -3, 2],
    a=[0, 1, 0, 1],
    c=[-2, 5, 8, -2.5],
))

In [92]:
# Inspect the frame before ranking.
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [95]:
# Ranks are computed across each row (values compared horizontally).
# ascending defaults to True, so the smallest value in a row gets rank 1.
frame.rank(axis='columns')

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


## 중복 index면 어떻게 할까 ??????
 - data selection 결과는 라벨의 중복 여부에 따라 달라지므로(중복 라벨은 Series, 유일한 라벨은 스칼라를 반환) 중복 여부 확인이 중요함

In [154]:
# Series whose index repeats labels: three 'a' entries followed by two 'b'.
obj = pd.Series(data=range(5), index=['a'] * 3 + ['b'] * 2)
obj

a    0
a    1
a    2
b    3
b    4
dtype: int64

In [155]:
# False here because the labels 'a' and 'b' each appear more than once.
obj.index.is_unique

False

In [156]:
# Selecting a duplicated label returns a Series holding every matching entry.
obj['a']

a    0
a    1
a    2
dtype: int64

## DataFrame의 계산 기능

In [184]:
# Small float frame with scattered NaNs, spelled out column by column.
df = pd.DataFrame(
    {
        "one": [1.4, 7.1, np.nan, 0.75],
        "two": [np.nan, -4.5, np.nan, -1.3],
    },
    index=list("abcd"),
)

In [185]:
# Inspect df; the empty cells in the rendered table are the NaN entries.
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [186]:
# sum() with no arguments totals each column; NaN entries are skipped.
df.sum()

one    9.25
two   -5.80
dtype: float64

In [187]:
# Row-wise this time: axis='columns' and axis=1 are two spellings of the same
# reduction. A bare expression that is not the last line of a cell is evaluated
# and silently discarded, so the equivalence is asserted instead of just stated.
assert df.sum(axis='columns').equals(df.sum(axis=1))
# Displayed result: one total per row; the all-NaN row 'c' sums to 0.0.
df.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [188]:
# Re-display df for reference; it still holds the original values.
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


## skipna : True 또는 False (기본값 True — NaN을 건너뛰고 계산)

In [192]:
# Row-wise mean that skips NaN; row 'c' has no valid values, so it stays NaN.
df.mean(axis='columns',skipna=True)

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

In [193]:
# Row-wise mean without skipping: any row containing a NaN (here 'a' and 'c') yields NaN.
df.mean(axis='columns',skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [194]:
# Re-display df for reference; it still holds the original values.
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [200]:
# Maximum of each column, ignoring NaN.
df.max()

one    7.1
two   -1.3
dtype: float64

In [223]:
# Options can be combined: a row-wise max that does not skip NaN,
# so rows 'a' and 'c' (which contain NaN) come out as NaN.
df.max(skipna=False,axis=1)

a     NaN
b    7.10
c     NaN
d    0.75
dtype: float64

## 누적 

In [241]:
# Running total down each column; NaN cells stay NaN and the total carries on past them.
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


## 기타 Method 들 

In [246]:
# Sixteen strings: the pattern a, a, b, c repeated four times.
obj = pd.Series(list("aabc") * 4)
obj

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [248]:
# For string data describe() reports count, number of unique values,
# the most frequent value (top) and how often it occurs (freq).
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

In [258]:
# Back to the numeric frame df for the remaining examples.
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [257]:
# Count the non-NA values in each column.
df.count()

one    3
two    2
dtype: int64

In [262]:
# The axis is adjustable; with the default used here the max is taken per column.
df.max()

one    7.1
two   -1.3
dtype: float64

In [279]:
# idxmin returns the index label where each column's minimum sits
# (idxmax is the counterpart for the maximum).
df.idxmin()

one    d
two    b
dtype: object

 - 이밖에도 
 - quantile : 분위수(0~1 사이의 q, 예: 0.25면 1사분위수)를 계산하고
 - sum, mean, median
 - mad : mean absolute deviation(평균 절대 편차). pandas 2.0에서 제거됨
 - prod : product of all value 등 
 - var, std, skew(왜도, 비대칭 정도), kurt(첨도), cumsum(cumulative sum), diff(1차 차분) 등

In [310]:
# NOTE(review): notebook convention is to keep imports in the first cell; this one
# is left here so the cell stays runnable on its own.
import pandas_datareader.data as web

# Pin the download window. Without explicit dates the reader picks a window
# relative to the day the cell is run, so the tables shown in this notebook
# could not be reproduced later. The bounds match the first and last dates
# visible in the outputs below.
# NOTE(review): assumes `end` is inclusive, and that the Yahoo endpoint still
# works with the installed pandas_datareader - confirm both. This cell needs
# network access.
PRICE_START = '2015-11-18'
PRICE_END = '2020-11-16'

# One downloaded frame per ticker symbol, keyed by the symbol.
all_data = {
    ticker: web.get_data_yahoo(ticker, start=PRICE_START, end=PRICE_END)
    for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']
}

In [316]:
# Pull a single field out of every per-ticker frame and line the tickers up
# side by side as columns: adjusted close for `price`, traded volume for `volume`.
price = pd.DataFrame(
    {symbol: quotes['Adj Close'] for symbol, quotes in all_data.items()}
)
volume = pd.DataFrame(
    {symbol: quotes['Volume'] for symbol, quotes in all_data.items()}
)

In [317]:
# Adjusted close, one column per ticker, indexed by date.
price

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-11-18,26.945841,108.215790,49.268490,740.000000
2015-11-19,27.288149,108.948814,49.350838,738.409973
2015-11-20,27.407616,110.351089,49.579559,756.599976
2015-11-23,27.051517,110.319260,49.579559,755.979980
2015-11-24,27.311123,110.430771,49.634457,748.280029
...,...,...,...,...
2020-11-10,115.970001,117.910004,211.009995,1740.390015
2020-11-11,119.489998,117.199997,216.550003,1752.709961
2020-11-12,119.209999,114.500000,215.440002,1749.839966
2020-11-13,119.260002,116.849998,216.509995,1777.020020


In [318]:
# Traded volume, one column per ticker, indexed by date.
volume

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-11-18,186698800.0,4149200.0,29710000.0,1684300
2015-11-19,173183200.0,4753600.0,28149200.0,1327100
2015-11-20,137148400.0,5176400.0,37147600.0,2212300
2015-11-23,129930000.0,5137900.0,28235900.0,1414500
2015-11-24,171212800.0,3407700.0,24600000.0,2333100
...,...,...,...,...
2020-11-10,138023400.0,5622800.0,44045100.0,2636100
2020-11-11,112295000.0,4289600.0,29440800.0,1264000
2020-11-12,103162300.0,6498200.0,21593900.0,1247500
2020-11-13,81581900.0,4682600.0,18621100.0,1499900


In [319]:
# Fractional change of each ticker's price from one row (date) to the next.
returns = price.pct_change()

In [320]:
# The first row is NaN because it has no earlier row to compare against.
returns

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-11-18,,,,
2015-11-19,0.012704,0.006774,0.001671,-0.002149
2015-11-20,0.004378,0.012871,0.004635,0.024634
2015-11-23,-0.012993,-0.000288,0.000000,-0.000819
2015-11-24,0.009597,0.001011,0.001107,-0.010185
...,...,...,...,...
2020-11-10,-0.003009,0.020601,-0.033793,-0.012825
2020-11-11,0.030353,-0.006022,0.026255,0.007079
2020-11-12,-0.002343,-0.023038,-0.005126,-0.001637
2020-11-13,0.000419,0.020524,0.004967,0.015533


In [321]:
# Last five rows of the returns frame.
returns.tail()

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-11-10,-0.003009,0.020601,-0.033793,-0.012825
2020-11-11,0.030353,-0.006022,0.026255,0.007079
2020-11-12,-0.002343,-0.023038,-0.005126,-0.001637
2020-11-13,0.000419,0.020524,0.004967,0.015533
2020-11-16,0.00872,0.012923,0.003325,0.002454
