# 10 minutes to pandas

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Create a Series from a list; pandas supplies a default integer index.
# The np.nan entry makes the resulting dtype float64.
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
# Six consecutive daily timestamps starting 2021-01-07; used below as the row index.
dates = pd.date_range('20210107', periods=6)
dates

DatetimeIndex(['2021-01-07', '2021-01-08', '2021-01-09', '2021-01-10',
               '2021-01-11', '2021-01-12'],
              dtype='datetime64[ns]', freq='D')

In [4]:
# 6x4 frame of standard-normal draws, indexed by `dates`, with columns A-D.
# No random seed is set, so the values differ on every run.
df = pd.DataFrame(np.random.randn(6, 4), index = dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2021-01-07,-0.175395,-1.130095,-0.299972,0.554332
2021-01-08,-1.740058,-0.159363,1.613421,-1.808375
2021-01-09,0.454833,-0.657559,-0.024256,1.022927
2021-01-10,2.529494,1.575409,1.772195,-1.035424
2021-01-11,-0.243657,-0.676258,1.741213,-1.957262
2021-01-12,1.434673,-1.151261,-0.227772,-0.889229


In [5]:
# Build a frame from a dict of column specs: the scalars ("A", "B", "F") are
# broadcast to the length of the array-like columns ("C", "D", "E"), and each
# column keeps its own dtype.
df2 = pd.DataFrame({
    "A": 1.0,
    "B": pd.Timestamp("20200101"),
    "C": pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
    "D": np.array([3, 3, 3, 3], dtype="int32"),
    "E": pd.Categorical(["test", "train"] * 2),
    "F": "foo",
})
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2020-01-01,1.0,3,test,foo
1,1.0,2020-01-01,1.0,3,train,foo
2,1.0,2020-01-01,1.0,3,test,foo
3,1.0,2020-01-01,1.0,3,train,foo


In [6]:
# Each column of a DataFrame carries its own dtype.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [9]:
# df2.<TAB> : tab completion lists the attributes and methods available on df2

## Viewing Data

In [10]:
# First rows of the frame (head() defaults to five).
df.head()

Unnamed: 0,A,B,C,D
2021-01-07,-0.175395,-1.130095,-0.299972,0.554332
2021-01-08,-1.740058,-0.159363,1.613421,-1.808375
2021-01-09,0.454833,-0.657559,-0.024256,1.022927
2021-01-10,2.529494,1.575409,1.772195,-1.035424
2021-01-11,-0.243657,-0.676258,1.741213,-1.957262


In [11]:
# Last three rows of the frame.
df.tail(3)

Unnamed: 0,A,B,C,D
2021-01-10,2.529494,1.575409,1.772195,-1.035424
2021-01-11,-0.243657,-0.676258,1.741213,-1.957262
2021-01-12,1.434673,-1.151261,-0.227772,-0.889229


In [12]:
df.index # row labels (here the DatetimeIndex built from `dates`)

DatetimeIndex(['2021-01-07', '2021-01-08', '2021-01-09', '2021-01-10',
               '2021-01-11', '2021-01-12'],
              dtype='datetime64[ns]', freq='D')

In [13]:
df.columns # column labels

Index(['A', 'B', 'C', 'D'], dtype='object')

In [14]:
df.to_numpy() # convert the DataFrame's values to a NumPy array

array([[-0.17539546, -1.13009471, -0.29997236,  0.55433247],
       [-1.74005752, -0.15936293,  1.61342115, -1.80837538],
       [ 0.45483265, -0.65755893, -0.02425637,  1.02292693],
       [ 2.52949378,  1.57540916,  1.77219467, -1.03542403],
       [-0.2436571 , -0.67625848,  1.74121265, -1.95726192],
       [ 1.43467311, -1.1512614 , -0.22777249, -0.88922883]])

In [15]:
df2.to_numpy() # the row and column labels are not part of the result; mixed column dtypes give an object array

array([[1.0, Timestamp('2020-01-01 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2020-01-01 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2020-01-01 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2020-01-01 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.376648,-0.366521,0.762471,-0.685505
std,1.47853,1.019167,1.042104,1.224589
min,-1.740058,-1.151261,-0.299972,-1.957262
25%,-0.226592,-1.016636,-0.176893,-1.615138
50%,0.139719,-0.666909,0.794582,-0.962326
75%,1.189713,-0.283912,1.709265,0.193442
max,2.529494,1.575409,1.772195,1.022927


In [17]:
df.T # transpose: swaps the row and column labels

Unnamed: 0,2021-01-07,2021-01-08,2021-01-09,2021-01-10,2021-01-11,2021-01-12
A,-0.175395,-1.740058,0.454833,2.529494,-0.243657,1.434673
B,-1.130095,-0.159363,-0.657559,1.575409,-0.676258,-1.151261
C,-0.299972,1.613421,-0.024256,1.772195,1.741213,-0.227772
D,0.554332,-1.808375,1.022927,-1.035424,-1.957262,-0.889229


In [18]:
df.sort_index(axis=1, ascending = False)
# Sort by axis labels rather than by values.
# axis=1 sorts the column labels (as here: D, C, B, A); axis=0 sorts the row index.
# ascending=False: descending order, ascending=True: ascending order.

Unnamed: 0,D,C,B,A
2021-01-07,0.554332,-0.299972,-1.130095,-0.175395
2021-01-08,-1.808375,1.613421,-0.159363,-1.740058
2021-01-09,1.022927,-0.024256,-0.657559,0.454833
2021-01-10,-1.035424,1.772195,1.575409,2.529494
2021-01-11,-1.957262,1.741213,-0.676258,-0.243657
2021-01-12,-0.889229,-0.227772,-1.151261,1.434673


In [20]:
df.sort_values(by='B', ascending = False)
# Sort the rows by the values in a given column (here B, largest first).

Unnamed: 0,A,B,C,D
2021-01-10,2.529494,1.575409,1.772195,-1.035424
2021-01-08,-1.740058,-0.159363,1.613421,-1.808375
2021-01-09,0.454833,-0.657559,-0.024256,1.022927
2021-01-11,-0.243657,-0.676258,1.741213,-1.957262
2021-01-07,-0.175395,-1.130095,-0.299972,0.554332
2021-01-12,1.434673,-1.151261,-0.227772,-0.889229


## Selection

In [21]:
# Selecting a single column by name returns a Series.
df["A"]

2021-01-07   -0.175395
2021-01-08   -1.740058
2021-01-09    0.454833
2021-01-10    2.529494
2021-01-11   -0.243657
2021-01-12    1.434673
Freq: D, Name: A, dtype: float64

In [22]:
df[0:3] # slicing with [] selects rows by position (end exclusive)

Unnamed: 0,A,B,C,D
2021-01-07,-0.175395,-1.130095,-0.299972,0.554332
2021-01-08,-1.740058,-0.159363,1.613421,-1.808375
2021-01-09,0.454833,-0.657559,-0.024256,1.022927


In [24]:
df['20210107' : '20210110'] # rows can also be sliced by index label; both endpoints are included

Unnamed: 0,A,B,C,D
2021-01-07,-0.175395,-1.130095,-0.299972,0.554332
2021-01-08,-1.740058,-0.159363,1.613421,-1.808375
2021-01-09,0.454833,-0.657559,-0.024256,1.022927
2021-01-10,2.529494,1.575409,1.772195,-1.035424


In [26]:
# Selection by label
print(dates)
# .loc with a single row label returns that row as a Series indexed by column name.
df.loc[dates[0]]

DatetimeIndex(['2021-01-07', '2021-01-08', '2021-01-09', '2021-01-10',
               '2021-01-11', '2021-01-12'],
              dtype='datetime64[ns]', freq='D')


A   -0.175395
B   -1.130095
C   -0.299972
D    0.554332
Name: 2021-01-07 00:00:00, dtype: float64

In [27]:
df.loc[:, ['A', 'B']] # .loc takes a row selector and a column selector together; ':' keeps every row

Unnamed: 0,A,B
2021-01-07,-0.175395,-1.130095
2021-01-08,-1.740058,-0.159363
2021-01-09,0.454833,-0.657559
2021-01-10,2.529494,1.575409
2021-01-11,-0.243657,-0.676258
2021-01-12,1.434673,-1.151261


In [28]:
# Label slice on the rows (both endpoints included) combined with a list of columns.
df.loc['20210107':'20210110', ['A', 'B']]

Unnamed: 0,A,B
2021-01-07,-0.175395,-1.130095
2021-01-08,-1.740058,-0.159363
2021-01-09,0.454833,-0.657559
2021-01-10,2.529494,1.575409


In [30]:
# Open-ended label slice: every row up to and including 2021-01-08.
df.loc[:'20210108', ['A', 'C']]

Unnamed: 0,A,C
2021-01-07,-0.175395,-0.299972
2021-01-08,-1.740058,1.613421


In [31]:
# A single row label plus a single column label returns a scalar.
df.loc[dates[0], 'A']

-0.1753954648371717

In [32]:
# When fetching a single value, .at is faster than .loc.
df.at[dates[0], 'A']

-0.1753954648371717

In [33]:
# Selection by position
# .iloc with one integer returns the row at that position (here the fourth) as a Series.
df.iloc[3]

A    2.529494
B    1.575409
C    1.772195
D   -1.035424
Name: 2021-01-10 00:00:00, dtype: float64

In [34]:
df.iloc[3:5, 0:2] # positional slices: rows 3-4 and columns 0-1 (end exclusive)

Unnamed: 0,A,B
2021-01-10,2.529494,1.575409
2021-01-11,-0.243657,-0.676258


In [35]:
df.iloc[[1, 2, 4], [1, 3]] # explicit lists of integer positions can be used as well

Unnamed: 0,B,D
2021-01-08,-0.159363,-1.808375
2021-01-09,-0.657559,1.022927
2021-01-11,-0.676258,-1.957262


In [37]:
# Rows at positions 1-2, all columns.
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2021-01-08,-1.740058,-0.159363,1.613421,-1.808375
2021-01-09,0.454833,-0.657559,-0.024256,1.022927


In [38]:
# All rows, columns at positions 1-2 (B and C).
df.iloc[:, 1:3]

Unnamed: 0,B,C
2021-01-07,-1.130095,-0.299972
2021-01-08,-0.159363,1.613421
2021-01-09,-0.657559,-0.024256
2021-01-10,1.575409,1.772195
2021-01-11,-0.676258,1.741213
2021-01-12,-1.151261,-0.227772


In [39]:
# Scalar at row position 1, column position 1.
df.iloc[1,1]

-0.15936292584042605

In [40]:
# .iat is faster than .iloc for a single value (same relationship as .at versus .loc).
df.iat[1,1]

-0.15936292584042605

In [42]:
# Boolean indexing
df[df['A'] > 0]
# Keeps only the rows where the condition on column A holds.

Unnamed: 0,A,B,C,D
2021-01-09,0.454833,-0.657559,-0.024256,1.022927
2021-01-10,2.529494,1.575409,1.772195,-1.035424
2021-01-12,1.434673,-1.151261,-0.227772,-0.889229


In [43]:
# Element-wise mask: values that satisfy the condition are kept, everything else is shown as NaN.
df[df > 0]

Unnamed: 0,A,B,C,D
2021-01-07,,,,0.554332
2021-01-08,,,1.613421,
2021-01-09,0.454833,,,1.022927
2021-01-10,2.529494,1.575409,1.772195,
2021-01-11,,,1.741213,
2021-01-12,1.434673,,,


In [44]:
# Work on a copy so the column added next does not touch df.
# This rebinds df2, replacing the mixed-dtype frame built earlier.
df2 = df.copy()

In [46]:
# Add a string column E with one label per row.
df2["E"] = ['one', 'one', 'two', 'three', 'four','five']
df2

Unnamed: 0,A,B,C,D,E
2021-01-07,-0.175395,-1.130095,-0.299972,0.554332,one
2021-01-08,-1.740058,-0.159363,1.613421,-1.808375,one
2021-01-09,0.454833,-0.657559,-0.024256,1.022927,two
2021-01-10,2.529494,1.575409,1.772195,-1.035424,three
2021-01-11,-0.243657,-0.676258,1.741213,-1.957262,four
2021-01-12,1.434673,-1.151261,-0.227772,-0.889229,five


In [48]:
# Filtering with .isin(): keep the rows whose E value is one of the listed labels.
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2021-01-09,0.454833,-0.657559,-0.024256,1.022927,two
2021-01-11,-0.243657,-0.676258,1.741213,-1.957262,four


In [54]:
# A Series of 1..6 indexed by six daily dates starting 2020-01-07.
# NOTE(review): these dates are one year earlier than `dates` (2021-01-07 ...), and s1 is
# not used by any later cell shown here - presumably '20210107' was intended; confirm.
s1 = pd.Series([1,2,3,4,5,6], index = pd.date_range('20200107', periods=6))
s1

2020-01-07    1
2020-01-08    2
2020-01-09    3
2020-01-10    4
2020-01-11    5
2020-01-12    6
Freq: D, dtype: int64

In [65]:
# Add a new column F from a plain list (one value per row). This mutates df in place.
df['F'] = [1,2,3,4,5,6]
df

Unnamed: 0,A,B,C,D,F
2021-01-07,-0.175395,-1.130095,-0.299972,0.554332,1
2021-01-08,-1.740058,-0.159363,1.613421,-1.808375,2
2021-01-09,0.454833,-0.657559,-0.024256,1.022927,3
2021-01-10,2.529494,1.575409,1.772195,-1.035424,4
2021-01-11,-0.243657,-0.676258,1.741213,-1.957262,5
2021-01-12,1.434673,-1.151261,-0.227772,-0.889229,6


In [66]:
# Set a single value by label: row dates[0], column 'A'.
df.at[dates[0], 'A'] = 0
df

Unnamed: 0,A,B,C,D,F
2021-01-07,0.0,-1.130095,-0.299972,0.554332,1
2021-01-08,-1.740058,-0.159363,1.613421,-1.808375,2
2021-01-09,0.454833,-0.657559,-0.024256,1.022927,3
2021-01-10,2.529494,1.575409,1.772195,-1.035424,4
2021-01-11,-0.243657,-0.676258,1.741213,-1.957262,5
2021-01-12,1.434673,-1.151261,-0.227772,-0.889229,6


In [77]:
# Set a single value by integer position: row 0, column 1 ('B').
df.iat[0, 1] = 0
df

Unnamed: 0,A,B,C,D,F
2021-01-07,0.0,0.0,-0.299972,0.554332,1
2021-01-08,-1.740058,-0.159363,1.613421,-1.808375,2
2021-01-09,0.454833,-0.657559,-0.024256,1.022927,3
2021-01-10,2.529494,1.575409,1.772195,-1.035424,4
2021-01-11,-0.243657,-0.676258,1.741213,-1.957262,5
2021-01-12,1.434673,-1.151261,-0.227772,-0.889229,6


In [78]:
# Overwrite the whole D column with an array of 5s, one per row.
df.loc[:, 'D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,F
2021-01-07,0.0,0.0,-0.299972,5,1
2021-01-08,-1.740058,-0.159363,1.613421,5,2
2021-01-09,0.454833,-0.657559,-0.024256,5,3
2021-01-10,2.529494,1.575409,1.772195,5,4
2021-01-11,-0.243657,-0.676258,1.741213,5,5
2021-01-12,1.434673,-1.151261,-0.227772,5,6


In [79]:
# Fresh copy of df, so changes made to df2 leave df unchanged.
df2 = df.copy()

In [80]:
# Wherever a value is positive, replace it with its negation, so no entry stays positive.
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2021-01-07,0.0,0.0,-0.299972,-5,-1
2021-01-08,-1.740058,-0.159363,-1.613421,-5,-2
2021-01-09,-0.454833,-0.657559,-0.024256,-5,-3
2021-01-10,-2.529494,-1.575409,-1.772195,-5,-4
2021-01-11,-0.243657,-0.676258,-1.741213,-5,-5
2021-01-12,-1.434673,-1.151261,-0.227772,-5,-6


## Missing Data

In [96]:
# reindex returns a new frame restricted to the first four dates, with a column E appended.
# E has no data yet, so it is filled with NaN.
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ["E"]) 
df1

Unnamed: 0,A,B,C,D,F,E
2021-01-07,0.0,0.0,-0.299972,5,1,
2021-01-08,-1.740058,-0.159363,1.613421,5,2,
2021-01-09,0.454833,-0.657559,-0.024256,5,3,
2021-01-10,2.529494,1.575409,1.772195,5,4,


In [97]:
# Label slices include both endpoints, so E is set for the first two rows only.
df1.loc[dates[0] :dates[1], 'E'] = 1

In [98]:
df1

Unnamed: 0,A,B,C,D,F,E
2021-01-07,0.0,0.0,-0.299972,5,1,1.0
2021-01-08,-1.740058,-0.159363,1.613421,5,2,1.0
2021-01-09,0.454833,-0.657559,-0.024256,5,3,
2021-01-10,2.529494,1.575409,1.772195,5,4,


In [88]:
# Drop every row that contains at least one missing value (returns a new frame).
df1.dropna(how = 'any')

Unnamed: 0,A,B,C,D,F,E
2021-01-07,0.0,0.0,-0.299972,5,1,1.0
2021-01-08,-1.740058,-0.159363,1.613421,5,2,1.0


In [90]:
# Fill missing values with a given value (returns a new frame).
df1.fillna(value = 5)

Unnamed: 0,A,B,C,D,F,E
2021-01-07,0.0,0.0,-0.299972,5,1,1.0
2021-01-08,-1.740058,-0.159363,1.613421,5,2,1.0
2021-01-09,0.454833,-0.657559,-0.024256,5,3,5.0
2021-01-10,2.529494,1.575409,1.772195,5,4,5.0


In [91]:
# Boolean mask: True where a value is missing, False otherwise.
pd.isna(df1)

Unnamed: 0,A,B,C,D,F,E
2021-01-07,False,False,False,False,False,False
2021-01-08,False,False,False,False,False,False
2021-01-09,False,False,False,False,False,True
2021-01-10,False,False,False,False,False,True


In [94]:
# Missing values can also be filled this way.
# Unlike fillna() without inplace, this assignment modifies df1 itself.
df1[pd.isna(df1)] = 0

## Operations

In [101]:
# Statistics
df.mean() # default (axis=0): the mean of each column

A    0.405881
B   -0.178172
C    0.762471
D    5.000000
F    3.500000
dtype: float64

In [102]:
df.mean(1) # axis=1: the mean of each row

2021-01-07    1.140006
2021-01-08    1.342800
2021-01-09    1.554603
2021-01-10    2.975420
2021-01-11    2.164259
2021-01-12    2.211128
Freq: D, dtype: float64

In [103]:
# .shift(n) moves the values n positions down the index (the index itself is unchanged);
# the n vacated leading slots become NaN.
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)

In [105]:
s

2021-01-07    NaN
2021-01-08    NaN
2021-01-09    1.0
2021-01-10    3.0
2021-01-11    5.0
2021-01-12    NaN
Freq: D, dtype: float64

In [106]:
# Subtract s from every column, aligning on the row index.
# Rows where s is NaN come out as NaN.
df.sub(s, axis='index')

Unnamed: 0,A,B,C,D,F
2021-01-07,,,,,
2021-01-08,,,,,
2021-01-09,-0.545167,-1.657559,-1.024256,4.0,2.0
2021-01-10,-0.470506,-1.424591,-1.227805,2.0,1.0
2021-01-11,-5.243657,-5.676258,-3.258787,0.0,0.0
2021-01-12,,,,,


### Apply

In [107]:
# Apply a function to each column; here np.cumsum gives a running total down every column.
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2021-01-07,0.0,0.0,-0.299972,5,1
2021-01-08,-1.740058,-0.159363,1.313449,10,3
2021-01-09,-1.285225,-0.816922,1.289192,15,6
2021-01-10,1.244269,0.758487,3.061387,20,10
2021-01-11,1.000612,0.082229,4.8026,25,15
2021-01-12,2.435285,-1.069033,4.574827,30,21


In [108]:
# Range (max - min) of each column.
df.apply(lambda x: x.max() - x.min())

A    4.269551
B    2.726671
C    2.072167
D    0.000000
F    5.000000
dtype: float64

### Histogramming

In [110]:
# Ten random integers drawn from 0..6 (the upper bound 7 is exclusive); unseeded, so they vary per run.
s = pd.Series(np.random.randint(0, 7, size = 10))
s

0    4
1    0
2    6
3    5
4    6
5    5
6    5
7    6
8    6
9    0
dtype: int64

In [111]:
# Count how many times each distinct value occurs.
s.value_counts()

6    4
5    3
0    2
4    1
dtype: int64

In [112]:
# String methods: the .str accessor applies a string operation to every element.
# The NaN entry stays NaN in the result.
s = pd.Series(['A', 'B', 'C', 'Aaba', 'BaCa', np.nan, 'CABA', 'dog', 'cat'])
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

## Concat

In [113]:
# concat(): start from a 10x4 frame of random values (rebinds `df`).
df = pd.DataFrame(np.random.randn(10, 4))
df

Unnamed: 0,0,1,2,3
0,1.861281,-0.686008,-0.150279,-0.047568
1,-1.832943,-1.37042,0.408924,0.10992
2,-0.524738,-1.566669,-0.507094,-1.943666
3,-1.048533,-1.023668,-1.070418,-1.715142
4,-0.048619,0.834597,0.717335,0.920413
5,-1.087691,1.084009,-0.099066,0.435898
6,1.594935,-0.175187,-1.00724,-1.489332
7,0.934411,-0.169434,0.295657,-1.357008
8,-1.549284,-0.00828,-0.116159,-1.25176
9,-0.991819,0.944153,-1.162736,1.499897


In [114]:
# Split the frame row-wise into three chunks: rows 0-2, 3-6 and 7-9.
pieces = [df[:3], df[3:7], df[7:]]

In [115]:
# Stack the pieces back together along the rows, reproducing the original frame.
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,1.861281,-0.686008,-0.150279,-0.047568
1,-1.832943,-1.37042,0.408924,0.10992
2,-0.524738,-1.566669,-0.507094,-1.943666
3,-1.048533,-1.023668,-1.070418,-1.715142
4,-0.048619,0.834597,0.717335,0.920413
5,-1.087691,1.084009,-0.099066,0.435898
6,1.594935,-0.175187,-1.00724,-1.489332
7,0.934411,-0.169434,0.295657,-1.357008
8,-1.549284,-0.00828,-0.116159,-1.25176
9,-0.991819,0.944153,-1.162736,1.499897


### Join

In [118]:
# Left table for the merge example; note that the key 'foo' appears twice.
left = pd.DataFrame({'key':['foo', 'foo'], 'lval':[1,2]})
left

Unnamed: 0,key,lval
0,foo,1
1,foo,2


In [119]:
# Right table for the merge example; the key 'foo' appears twice here as well.
right = pd.DataFrame({'key':['foo', 'foo'], 'rval':[3, 4]})
right

Unnamed: 0,key,rval
0,foo,3
1,foo,4


In [120]:
# Merge left and right on 'key'.
# 'foo' occurs twice on each side, so the result holds every pairing (2 x 2 = 4 rows).
pd.merge(left, right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,3
1,foo,1,4
2,foo,2,3
3,foo,2,4


In [122]:
# How this differs from the foo/foo merge: here every key is unique on each side,
# so rows pair up one-to-one (2 rows) instead of forming all 2 x 2 combinations.
left = pd.DataFrame({"key": ["foo", "bar"], "lval": [1, 2]})
right = pd.DataFrame({"key": ["foo", "bar"], "rval": [3, 4]})

left.merge(right, on="key")

Unnamed: 0,key,lval,rval
0,foo,1,3
1,bar,2,4


## Grouping
- 일부 기준에 따라 데이터를 그룹으로 분할
- 각 그룹에 독립적으로 기능 적용
- 결과를 데이터 구조로 결합

In [123]:
# Toy frame for the groupby examples: two label columns (A, B) and two columns
# of standard-normal draws (C, D). No seed is set, so C and D change per run.
df = pd.DataFrame(
    {
        "A": "foo bar foo bar foo bar foo foo".split(),
        "B": "one one two three two two one three".split(),
        "C": np.random.randn(8),
        "D": np.random.randn(8),
    }
)

df

Unnamed: 0,A,B,C,D
0,foo,one,2.278435,-0.209557
1,bar,one,-0.085339,0.308446
2,foo,two,-1.149588,0.896932
3,bar,three,0.193644,-0.648219
4,foo,two,-1.204048,1.702962
5,bar,two,-1.070997,0.44364
6,foo,one,1.104691,0.100275
7,foo,three,-1.170223,2.283838


In [124]:
# Group the rows by column A and sum the numeric columns within each group.
# C and D are selected explicitly: since pandas 2.0, sum() no longer drops
# non-numeric columns silently, so without the selection the string column B
# would be concatenated into the result instead of being left out.
df.groupby('A')[['C', 'D']].sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-0.962691,0.103867
foo,-0.140732,4.77445


In [125]:
# Group by the (A, B) pair and sum within each group; the result is indexed by both keys.
df.groupby(['A', 'B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.085339,0.308446
bar,three,0.193644,-0.648219
bar,two,-1.070997,0.44364
foo,one,3.383127,-0.109282
foo,three,-1.170223,2.283838
foo,two,-2.353636,2.599894
