# 10 minutes to pandas
https://pandas.pydata.org/pandas-docs/version/1.0.0/getting_started/10min.html

In [1]:
import numpy as np
import pandas as pd

## Object Creation

리스트 값을 이용해서 시리즈를 만든다.

인덱스의 기본값은 정수형이다.

In [3]:
# Build a Series from a plain list. No index is given, so the default
# integer index is used; the NaN entry makes the dtype float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

넘파이 배열을 통해서 데이터프레임을 생성한다.

In [4]:
# Six consecutive daily timestamps, starting at 2013-01-01.
dates = pd.date_range(start='20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

## 문자열을 list()에 넘기면 글자 단위로 나뉘어 리스트에 들어간다.

In [25]:
# 6x4 frame of samples drawn from the standard normal distribution
# (mean 0, standard deviation 1).
# The row index is the `dates` DatetimeIndex created above.
# list('ABCD') splits the string into ['A', 'B', 'C', 'D'], one label per column.
# A seeded generator is used so that Restart & Run All reproduces the same values.
df = pd.DataFrame(
    np.random.default_rng(0).standard_normal((6, 4)),
    index=dates,
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
2013-01-01,1.569912,0.611761,1.011409,0.518493
2013-01-02,0.65477,1.516108,-1.589633,0.326186
2013-01-03,-0.389019,-0.006146,-0.116518,-0.972468
2013-01-04,1.727305,-0.314997,1.035923,-1.770209
2013-01-05,-0.930456,0.149301,0.176194,-1.258266
2013-01-06,-1.315444,0.764969,-0.05425,0.810078


딕셔너리 구조로 데이터프레임 생성

컬럼에 들어가는 값들이 다양한 방식으로 만들어졌다.

In [16]:
# Build a frame from a mapping of column name -> value. Each column is
# supplied in a different form; scalars are repeated on every one of the 4 rows.
df2 = pd.DataFrame(dict(
    A=1.0,  # float scalar
    B=pd.Timestamp('20130102'),  # timestamp scalar
    C=pd.Series(1, index=list(range(4)), dtype='float32'),  # Series with an explicit index
    D=np.array([3] * 4, dtype='int32'),  # array holding the value 3 four times
    # pd.Categorical gives the column the `category` dtype
    # (a plain list would presumably be stored as object -- verify).
    E=pd.Categorical(['test', 'train', 'test', 'train']),
    F='foo',  # string scalar
))

df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [18]:
# Each column of the frame above carries its own dtype.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## Viewing data

데이터프레임을 넘파이배열로 변경시키는 방법이다.

데이터프레임은 컬럼당 dtype을 갖지만,

넘파이는 전체가 동일한 dtype을 가진다는 차이가 있다.

**인덱스나 컬럼을 반환하지는 않는다**

In [31]:
# What does the conversion give when every column of df is a float?
df.to_numpy()

array([[ 1.56991202,  0.61176062,  1.01140949,  0.51849335],
       [ 0.65477022,  1.51610789, -1.58963311,  0.32618598],
       [-0.38901918, -0.00614579, -0.11651777, -0.97246766],
       [ 1.72730525, -0.31499736,  1.0359226 , -1.77020944],
       [-0.93045601,  0.14930085,  0.17619382, -1.25826646],
       [-1.31544395,  0.76496907, -0.05424963,  0.81007776]])

In [32]:
# What does the conversion give when df2 mixes several dtypes?
df2.to_numpy()

# Every value ends up in a single array whose dtype is object.

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [39]:
# Display the current contents of df.
df

Unnamed: 0,A,B,C,D
2013-01-01,1.569912,0.611761,1.011409,0.518493
2013-01-02,0.65477,1.516108,-1.589633,0.326186
2013-01-03,-0.389019,-0.006146,-0.116518,-0.972468
2013-01-04,1.727305,-0.314997,1.035923,-1.770209
2013-01-05,-0.930456,0.149301,0.176194,-1.258266
2013-01-06,-1.315444,0.764969,-0.05425,0.810078


In [44]:
# sort_index orders by index or column labels, not by the cell values
# (sort_values is the value-based counterpart).
# Here the column labels are put in descending order.
df.sort_index(axis='columns', ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,0.518493,1.011409,0.611761,1.569912
2013-01-02,0.326186,-1.589633,1.516108,0.65477
2013-01-03,-0.972468,-0.116518,-0.006146,-0.389019
2013-01-04,-1.770209,1.035923,-0.314997,1.727305
2013-01-05,-1.258266,0.176194,0.149301,-0.930456
2013-01-06,0.810078,-0.05425,0.764969,-1.315444


In [45]:
# Sort the rows by the values in column B (smallest first).
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-04,1.727305,-0.314997,1.035923,-1.770209
2013-01-03,-0.389019,-0.006146,-0.116518,-0.972468
2013-01-05,-0.930456,0.149301,0.176194,-1.258266
2013-01-01,1.569912,0.611761,1.011409,0.518493
2013-01-06,-1.315444,0.764969,-0.05425,0.810078
2013-01-02,0.65477,1.516108,-1.589633,0.326186


## Selection

.at, .iat, .loc, .iloc에 관련된 내용

보통 숫자로 슬라이싱 하면 iloc을 쓸 것 같지만,

사용의 편의성을 위해서인지 바로 []에 슬라이싱을 활용 가능하도록 설계

In [48]:
# Slice rows by integer position with plain [] instead of iloc.
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,1.569912,0.611761,1.011409,0.518493
2013-01-02,0.65477,1.516108,-1.589633,0.326186
2013-01-03,-0.389019,-0.006146,-0.116518,-0.972468


In [50]:
# Slice rows by date labels; both end labels are included in the result.
df['20130102': '20130104']

Unnamed: 0,A,B,C,D
2013-01-02,0.65477,1.516108,-1.589633,0.326186
2013-01-03,-0.389019,-0.006146,-0.116518,-0.972468
2013-01-04,1.727305,-0.314997,1.035923,-1.770209


## Selection by label

In [59]:
# Select columns A and B for the single label 2013-01-02. The selection is
# a Series, so to_frame().T turns it back into a one-row frame for display.
df.loc['20130102', ['A', 'B']].to_frame().T

Unnamed: 0,A,B
2013-01-02,0.65477,1.516108


In [60]:
# Scalar lookup by row label and column label.
df.loc[dates[0], 'A']

1.569912018160895

특정 스칼라에 빠르게 접근하고자 하면 loc을 통해서 행, 열을 입력하는 것보다
.at 을 통해서 동일하게 입력하는 방법이 있다.

In [61]:
# Same scalar lookup as the .loc call above, written with .at.
df.at[dates[0], 'A']

1.569912018160895

## Selection by position

In [62]:
# Row at integer position 3, returned as a Series keyed by column name.
df.iloc[3]

A    1.727305
B   -0.314997
C    1.035923
D   -1.770209
Name: 2013-01-04 00:00:00, dtype: float64

In [66]:
df.iloc[[1,2,4], [0,2]]
# Rows at positions 1, 2, 4 and columns at positions 0, 2.
# (The approach I used to extract columns when comparing APM in the StarCraft II dataset.)

Unnamed: 0,A,C
2013-01-02,0.65477,-1.589633
2013-01-03,-0.389019,-0.116518
2013-01-05,-0.930456,0.176194


In [69]:
# Scalar lookup by integer position (row 1, column 1).
df.iloc[1, 1]

1.516107885029894

In [71]:
# Same positional scalar lookup as above, written with .iat.
df.iat[1, 1]

1.516107885029894

## Boolean indexing

In [75]:
# Keep every row whose value in column A is greater than 0.
df[df['A'] > 0]

Unnamed: 0,A,B,C,D
2013-01-01,1.569912,0.611761,1.011409,0.518493
2013-01-02,0.65477,1.516108,-1.589633,0.326186
2013-01-04,1.727305,-0.314997,1.035923,-1.770209


In [76]:
# Apply the condition to the whole frame: values that satisfy it are kept,
# everything else becomes NaN.
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,1.569912,0.611761,1.011409,0.518493
2013-01-02,0.65477,1.516108,,0.326186
2013-01-03,,,,
2013-01-04,1.727305,,1.035923,
2013-01-05,,0.149301,0.176194,
2013-01-06,,0.764969,,0.810078


In [78]:
# New frame made of df's columns plus a label column E; df itself is not modified.
df2 = df.assign(E=['one', 'one', 'two', 'three', 'four', 'three'])
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,1.569912,0.611761,1.011409,0.518493,one
2013-01-02,0.65477,1.516108,-1.589633,0.326186,one
2013-01-03,-0.389019,-0.006146,-0.116518,-0.972468,two
2013-01-04,1.727305,-0.314997,1.035923,-1.770209,three
2013-01-05,-0.930456,0.149301,0.176194,-1.258266,four
2013-01-06,-1.315444,0.764969,-0.05425,0.810078,three


In [85]:
# Keep the rows whose value in column E is one of the listed values;
# isin behaves like an OR over the entries of the list.
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,-0.389019,-0.006146,-0.116518,-0.972468,two
2013-01-05,-0.930456,0.149301,0.176194,-1.258266,four


## Setting

In [93]:
# The integers 1..6 indexed by six daily dates starting at 2013-01-02,
# i.e. one day later than `dates`, so the two indexes only partly overlap.
# list(range(...)) replaces the redundant identity comprehension.
s1 = pd.Series(list(range(1, 7)), index=pd.date_range('20130102', periods=6))
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [94]:
# Add s1 as column F. Values are matched on the index: 2013-01-01 has no
# entry in s1 and becomes NaN, and s1's 2013-01-07 has no row in df.
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,1.011409,5,
2013-01-02,0.65477,1.516108,-1.589633,5,1.0
2013-01-03,-0.389019,-0.006146,-0.116518,5,2.0
2013-01-04,1.727305,-0.314997,1.035923,5,3.0
2013-01-05,-0.930456,0.149301,0.176194,5,4.0
2013-01-06,-1.315444,0.764969,-0.05425,5,5.0


In [95]:
# Set one value by label; the row is addressed through the index value
# stored in `dates`.
df.at[dates[0], 'A'] = 0

In [88]:
# Set one value by integer position (row 0, column 1).
df.iat[0, 1] = 0
df

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,1.011409,0.518493
2013-01-02,0.65477,1.516108,-1.589633,0.326186
2013-01-03,-0.389019,-0.006146,-0.116518,-0.972468
2013-01-04,1.727305,-0.314997,1.035923,-1.770209
2013-01-05,-0.930456,0.149301,0.176194,-1.258266
2013-01-06,-1.315444,0.764969,-0.05425,0.810078


#### 판다스 데이터를 추가할때 아래와 같은 표현 익숙해지기(넘파이 사용)

In [90]:
# Overwrite the whole column D with a NumPy array holding one 5 per row of df.
df.loc[:, 'D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,1.011409,5
2013-01-02,0.65477,1.516108,-1.589633,5
2013-01-03,-0.389019,-0.006146,-0.116518,5
2013-01-04,1.727305,-0.314997,1.035923,5
2013-01-05,-0.930456,0.149301,0.176194,5
2013-01-06,-1.315444,0.764969,-0.05425,5


#### 0 보다 큰 값을 모두 마이너스로 바꾸는 방법

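## 이 부분 이해가 잘 안되네

`df2 > 0`은 df2와 같은 모양의 불린 마스크를 만들고, 마스크가 True인 위치에만 `-df2`의 같은 위치 값이 대입된다. 따라서 양수만 부호가 바뀌고, 0 이하의 값과 NaN은 그대로 남는다.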

In [98]:
# Work on a copy so df stays unchanged.
df2 = df.copy()
# `df2 > 0` is a boolean mask; only the positions where it is True receive the
# value at the same position of -df2, so positive values are negated while
# zeros, negative values and NaN are left as they are.
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-1.011409,-5,
2013-01-02,-0.65477,-1.516108,-1.589633,-5,-1.0
2013-01-03,-0.389019,-0.006146,-0.116518,-5,-2.0
2013-01-04,-1.727305,-0.314997,-1.035923,-5,-3.0
2013-01-05,-0.930456,-0.149301,-0.176194,-5,-4.0
2013-01-06,-1.315444,-0.764969,-0.05425,-5,-5.0


## Missing data

reindex 하는 방법

다양한 방법으로 인덱스와 컬럼을 바꿈

In [113]:
# Display the current contents of df.
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,1.011409,5,
2013-01-02,0.65477,1.516108,-1.589633,5,1.0
2013-01-03,-0.389019,-0.006146,-0.116518,5,2.0
2013-01-04,1.727305,-0.314997,1.035923,5,3.0
2013-01-05,-0.930456,0.149301,0.176194,5,4.0
2013-01-06,-1.315444,0.764969,-0.05425,5,5.0


In [114]:
# Reindex to the first four dates and to df's columns plus a new column E.
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])

#### 아래의 row 슬라이싱이 dates[1]의 값까지 포함하는 이유가 뭐지?

`.loc`은 위치가 아니라 라벨 기준으로 슬라이싱하기 때문에 시작 라벨과 끝 라벨을 모두 포함한다.

In [116]:
# Label-based slice: both dates[0] and dates[1] are included, so E is set
# on the first two rows.
df1.loc[dates[0]:dates[1], 'E'] = 1

In [117]:
# Display df1; the cells of E that were not assigned remain NaN.
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,1.011409,5,,1.0
2013-01-02,0.65477,1.516108,-1.589633,5,1.0,1.0
2013-01-03,-0.389019,-0.006146,-0.116518,5,2.0,
2013-01-04,1.727305,-0.314997,1.035923,5,3.0,


In [125]:
# 하나의 값이라도 NaN이 있으면 그 행을 전체 제거
# how='any' drops a row as soon as it contains a single NaN.
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,F,E
2013-01-02,0.65477,1.516108,-1.589633,5,1.0,1.0


In [127]:
# axis defaults to rows; axis=1 drops every column that contains a NaN instead.
df1.dropna(axis=1, how='any')

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,1.011409,5
2013-01-02,0.65477,1.516108,-1.589633,5
2013-01-03,-0.389019,-0.006146,-0.116518,5
2013-01-04,1.727305,-0.314997,1.035923,5


In [129]:
# Fill every NaN with 5.
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,1.011409,5,5.0,1.0
2013-01-02,0.65477,1.516108,-1.589633,5,1.0,1.0
2013-01-03,-0.389019,-0.006146,-0.116518,5,2.0,5.0
2013-01-04,1.727305,-0.314997,1.035923,5,3.0,5.0


In [131]:
# Boolean frame of the same shape: True where df1 holds NaN, False elsewhere.
pd.isna(df1)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,True,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


## Operations

### Stats
오퍼레이션은 보통 널값을 무시하고 계산한다.

In [134]:
# Display the current contents of df.
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,1.011409,5,
2013-01-02,0.65477,1.516108,-1.589633,5,1.0
2013-01-03,-0.389019,-0.006146,-0.116518,5,2.0
2013-01-04,1.727305,-0.314997,1.035923,5,3.0
2013-01-05,-0.930456,0.149301,0.176194,5,4.0
2013-01-06,-1.315444,0.764969,-0.05425,5,5.0


In [150]:
# Side note on renaming columns:
#   1. rename(columns={old: new}) maps individual names one-to-one
#   2. assigning a list to .columns replaces all column names at once
# Here the column is named directly while the row means are converted to a
# frame, which avoids renaming the frame in place afterwards.
df_test = df.mean(axis=1).to_frame(name='mean')
df_test

Unnamed: 0,mean
2013-01-01,1.502852
2013-01-02,1.316249
2013-01-03,1.297663
2013-01-04,2.089646
2013-01-05,1.679008
2013-01-06,1.879055


In [154]:
# Mean of the frame with the default axis (0): the values are aggregated
# down the rows, giving one mean per column. NaN entries are skipped.
df.mean()

A   -0.042141
B    0.351539
C    0.077188
D    5.000000
F    3.000000
dtype: float64

In [156]:
# Same call with axis=1: the values are aggregated across the columns,
# giving one mean per row (per date).
df.mean(1)

2013-01-01    1.502852
2013-01-02    1.316249
2013-01-03    1.297663
2013-01-04    2.089646
2013-01-05    1.679008
2013-01-06    1.879055
Freq: D, dtype: float64

Operating with objects that have different dimensionality and need alignment. In addition, pandas automatically broadcasts along the specified dimension.

https://dandyrilla.github.io/2017-08-12/pandas-10min/

In [158]:
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
# shift(2) moves the values down by two positions: the first two slots become
# NaN and the last two original values fall off the end; the index is unchanged.
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [160]:
# Display the current contents of df.
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,1.011409,5,
2013-01-02,0.65477,1.516108,-1.589633,5,1.0
2013-01-03,-0.389019,-0.006146,-0.116518,5,2.0
2013-01-04,1.727305,-0.314997,1.035923,5,3.0
2013-01-05,-0.930456,0.149301,0.176194,5,4.0
2013-01-06,-1.315444,0.764969,-0.05425,5,5.0


In [161]:
# Subtract s from every column of df, matching rows on the index.
# Rows where s is NaN come out as NaN in all columns.
df.sub(s, axis='index')

Unnamed: 0,A,B,C,D,F
2013-01-01,,,,,
2013-01-02,,,,,
2013-01-03,-1.389019,-1.006146,-1.116518,4.0,1.0
2013-01-04,-1.272695,-3.314997,-1.964077,2.0,0.0
2013-01-05,-5.930456,-4.850699,-4.823806,0.0,-1.0
2013-01-06,,,,,


### Apply

cumsum은 위에서부터 값을 차례로 더해 누적 합을 구하는 함수이다.

아래 내용은 조금 더 살펴봐야 할듯

In [164]:
# Display the current contents of df.
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,1.011409,5,
2013-01-02,0.65477,1.516108,-1.589633,5,1.0
2013-01-03,-0.389019,-0.006146,-0.116518,5,2.0
2013-01-04,1.727305,-0.314997,1.035923,5,3.0
2013-01-05,-0.930456,0.149301,0.176194,5,4.0
2013-01-06,-1.315444,0.764969,-0.05425,5,5.0


In [163]:
# Apply np.cumsum to each column: every cell becomes the running total of
# its column from the first row down to that row.
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,1.011409,5,
2013-01-02,0.65477,1.516108,-0.578224,10,1.0
2013-01-03,0.265751,1.509962,-0.694741,15,3.0
2013-01-04,1.993056,1.194965,0.341181,20,6.0
2013-01-05,1.0626,1.344266,0.517375,25,10.0
2013-01-06,-0.252844,2.109235,0.463125,30,15.0


In [165]:
# Spread of each column: its largest value minus its smallest value.
df.apply(lambda col: col.max() - col.min())

A    3.042749
B    1.831105
C    2.625556
D    0.000000
F    4.000000
dtype: float64

### Histogramming