# Pandas 한번에 제대로 배우기




---



In [1]:
import numpy as np
import pandas as pd
pd.__version__

'1.3.1'

## Pandas 객체


### Series 객체

In [2]:
s = pd.Series([0, 0.25,0.5,0.75,1.0])
s

0    0.00
1    0.25
2    0.50
3    0.75
4    1.00
dtype: float64

In [3]:
s.values

array([0.  , 0.25, 0.5 , 0.75, 1.  ])

In [4]:
s.index

RangeIndex(start=0, stop=5, step=1)

In [5]:
s[1]

0.25

In [6]:
s[1:4]

1    0.25
2    0.50
3    0.75
dtype: float64

In [7]:
s = pd.Series([0,0.25,0.5,0.75,1.0],
              index = ['a','b','c','d','e'])
s

a    0.00
b    0.25
c    0.50
d    0.75
e    1.00
dtype: float64

In [8]:
'b' in s

True

In [9]:
s[['c','d','e']]

c    0.50
d    0.75
e    1.00
dtype: float64

In [10]:
s.unique()

array([0.  , 0.25, 0.5 , 0.75, 1.  ])

In [11]:
s.value_counts()

0.00    1
0.25    1
0.50    1
0.75    1
1.00    1
dtype: int64

In [12]:
s.isin([0.25, 0.75])

a    False
b     True
c    False
d     True
e    False
dtype: bool

In [13]:
pop_tuple ={'서울특별시':9723,
            '부산광역시': 9898,
            '광주광역시': 9999}
population = pd.Series(pop_tuple)
population

서울특별시    9723
부산광역시    9898
광주광역시    9999
dtype: int64

In [14]:
population['서울특별시']

9723

In [15]:
population['서울특별시':'광주광역시']

서울특별시    9723
부산광역시    9898
광주광역시    9999
dtype: int64

### DataFrame 객체

In [16]:
pd.DataFrame([{'a':2,'b':4,'c':3},{'a':4,'b':5,'c':7}])

Unnamed: 0,a,b,c
0,2,4,3
1,4,5,7


In [17]:
pd.DataFrame(np.random.rand(5,5),
             columns=['a','b','c','d','e'],
             index=[1,2,3,4,5])

Unnamed: 0,a,b,c,d,e
1,0.877877,0.086639,0.989084,0.352069,0.766079
2,0.606071,0.548123,0.880573,0.095935,0.289382
3,0.192427,0.508506,0.846547,0.601735,0.554626
4,0.500863,0.600878,0.057517,0.012025,0.599677
5,0.673553,0.444401,0.536535,0.656981,0.086311


In [18]:
male_tuple ={'서울특별시':9723,
            '부산광역시': 9898,
            '광주광역시': 9999}
male_tuple = pd.Series(male_tuple)
male_tuple

서울특별시    9723
부산광역시    9898
광주광역시    9999
dtype: int64

In [19]:
female_tuple ={'서울특별시':923,
            '부산광역시': 898,
            '광주광역시': 999}
female_tuple = pd.Series(female_tuple)
female_tuple

서울특별시    923
부산광역시    898
광주광역시    999
dtype: int64

In [20]:
korea_df = pd.DataFrame({'인구수':population,
                         '남자인구수' : male_tuple,
                         '여자인구수' : female_tuple})
korea_df

Unnamed: 0,인구수,남자인구수,여자인구수
서울특별시,9723,9723,923
부산광역시,9898,9898,898
광주광역시,9999,9999,999


In [21]:
korea_df.index

Index(['서울특별시', '부산광역시', '광주광역시'], dtype='object')

In [22]:
korea_df.columns

Index(['인구수', '남자인구수', '여자인구수'], dtype='object')

In [23]:
korea_df['여자인구수']

서울특별시    923
부산광역시    898
광주광역시    999
Name: 여자인구수, dtype: int64

### Index 객체


In [24]:
idx = pd.Index([2,4,6,8,10])
idx

Int64Index([2, 4, 6, 8, 10], dtype='int64')

In [25]:
idx[1]

4

In [26]:
idx[1:2:2]

Int64Index([4], dtype='int64')

In [27]:
idx[-1::]

Int64Index([10], dtype='int64')

In [28]:
idx[::2]

Int64Index([2, 6, 10], dtype='int64')

In [29]:
print(idx)
print(idx.size)
print(idx.ndim)
print(idx.shape)
print(idx.dtype)

Int64Index([2, 4, 6, 8, 10], dtype='int64')
5
1
(5,)
int64


#### Index 연산

In [30]:
idx1 = pd.Index([1,2,4,6,8])
idx2 = pd.Index([2,4,5,6,7])
print(idx1.append(idx2))
print(idx1.difference(idx2))
print(idx1 - idx2)
print(idx1.intersection(idx2))
print(idx1 & idx2)
print(idx1.union(idx2))
print(idx1 | idx2)
print(idx1.delete(0))
print(idx1.drop(1))
print(idx1 ^ idx2)


Int64Index([1, 2, 4, 6, 8, 2, 4, 5, 6, 7], dtype='int64')
Int64Index([1, 8], dtype='int64')
Int64Index([-1, -2, -1, 0, 1], dtype='int64')
Int64Index([2, 4, 6], dtype='int64')
Int64Index([2, 4, 6], dtype='int64')
Int64Index([1, 2, 4, 5, 6, 7, 8], dtype='int64')
Int64Index([1, 2, 4, 5, 6, 7, 8], dtype='int64')
Int64Index([2, 4, 6, 8], dtype='int64')
Int64Index([2, 4, 6, 8], dtype='int64')
Int64Index([1, 5, 7, 8], dtype='int64')


  print(idx1 & idx2)
  print(idx1 | idx2)
  print(idx1 ^ idx2)




---



## 인덱싱(Indexing)

In [31]:
s = pd.Series([0,0.25,0.5,0.75,1.0],
              index=['a','b','c','d','e'])
s

a    0.00
b    0.25
c    0.50
d    0.75
e    1.00
dtype: float64

In [32]:
s['b']

0.25

In [33]:
'b' in s

True

In [34]:
s.keys()

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [35]:
list(s.items())

[('a', 0.0), ('b', 0.25), ('c', 0.5), ('d', 0.75), ('e', 1.0)]

In [36]:
s['f'] = 1.25
s

a    0.00
b    0.25
c    0.50
d    0.75
e    1.00
f    1.25
dtype: float64

In [37]:
s['a':'d']

a    0.00
b    0.25
c    0.50
d    0.75
dtype: float64

In [38]:
s[0:4]

a    0.00
b    0.25
c    0.50
d    0.75
dtype: float64

In [39]:
s[(s>0.4) & (s < 0.8)]

c    0.50
d    0.75
dtype: float64

In [40]:
s[['a','c','e']]

a    0.0
c    0.5
e    1.0
dtype: float64

### Series 인덱싱

In [41]:
s = pd.Series(['a','b','c','d','e'],
             index=[1,3,5,7,9])
s

1    a
3    b
5    c
7    d
9    e
dtype: object

In [42]:
s[1]

'a'

In [43]:
s[2:4]

5    c
7    d
dtype: object

In [44]:
s.iloc[1]

'b'

In [45]:
s.iloc[2:4]

5    c
7    d
dtype: object

In [46]:
s.reindex(range(10))

0    NaN
1      a
2    NaN
3      b
4    NaN
5      c
6    NaN
7      d
8    NaN
9      e
dtype: object

In [47]:
s.reindex(range(10), method='bfill')

0    a
1    a
2    b
3    b
4    c
5    c
6    d
7    d
8    e
9    e
dtype: object

### DataFrame 인덱싱


In [48]:
korea_df['남자인구수']

서울특별시    9723
부산광역시    9898
광주광역시    9999
Name: 남자인구수, dtype: int64

In [49]:
korea_df.남자인구수

서울특별시    9723
부산광역시    9898
광주광역시    9999
Name: 남자인구수, dtype: int64

In [50]:
korea_df.여자인구수

서울특별시    923
부산광역시    898
광주광역시    999
Name: 여자인구수, dtype: int64

In [51]:
korea_df['남여비율'] = (korea_df['남자인구수'] * 100 / korea_df['여자인구수'])
korea_df.남여비율

서울특별시    1053.412784
부산광역시    1102.227171
광주광역시    1000.900901
Name: 남여비율, dtype: float64

In [52]:
korea_df.values

array([[9723.        , 9723.        ,  923.        , 1053.4127844 ],
       [9898.        , 9898.        ,  898.        , 1102.22717149],
       [9999.        , 9999.        ,  999.        , 1000.9009009 ]])

In [53]:
korea_df.T

Unnamed: 0,서울특별시,부산광역시,광주광역시
인구수,9723.0,9898.0,9999.0
남자인구수,9723.0,9898.0,9999.0
여자인구수,923.0,898.0,999.0
남여비율,1053.412784,1102.227171,1000.900901


In [54]:
korea_df.values[0]

array([9723.       , 9723.       ,  923.       , 1053.4127844])

In [55]:
korea_df['인구수']

서울특별시    9723
부산광역시    9898
광주광역시    9999
Name: 인구수, dtype: int64

In [56]:
korea_df.loc[:'광주광역시',:'남자인구수']

Unnamed: 0,인구수,남자인구수
서울특별시,9723,9723
부산광역시,9898,9898
광주광역시,9999,9999


In [57]:
korea_df.loc[(korea_df.여자인구수 > 100)]

Unnamed: 0,인구수,남자인구수,여자인구수,남여비율
서울특별시,9723,9723,923,1053.412784
부산광역시,9898,9898,898,1102.227171
광주광역시,9999,9999,999,1000.900901


In [58]:
korea_df.loc[(korea_df.인구수 < 20000)]

Unnamed: 0,인구수,남자인구수,여자인구수,남여비율
서울특별시,9723,9723,923,1053.412784
부산광역시,9898,9898,898,1102.227171
광주광역시,9999,9999,999,1000.900901


In [59]:
korea_df.loc[(korea_df.인구수 < 9900)]

Unnamed: 0,인구수,남자인구수,여자인구수,남여비율
서울특별시,9723,9723,923,1053.412784
부산광역시,9898,9898,898,1102.227171


In [60]:
korea_df.loc[korea_df.남여비율 > 100]

Unnamed: 0,인구수,남자인구수,여자인구수,남여비율
서울특별시,9723,9723,923,1053.412784
부산광역시,9898,9898,898,1102.227171
광주광역시,9999,9999,999,1000.900901


In [61]:
korea_df.iloc[:3,:2]

Unnamed: 0,인구수,남자인구수
서울특별시,9723,9723
부산광역시,9898,9898
광주광역시,9999,9999


### 다중 인덱싱(Multi Indexing)

* 1차원의 Series와 2차원의 DataFrame 객체를 넘어 3차원, 4차원 이상의 고차원 데이터 처리
* 단일 인덱스 내에 여러 인덱스를 포함하는 다중 인덱싱

#### 다중 인덱스 Series

In [62]:
idx_tuples = [('서울특별시', 2010), ('서울특별시', 2020),
             ('부산광역시', 2010), ('부산광역시', 2020),
             ('인천광역시', 2010), ('인천광역시', 2020),
             ('대구광역시', 2010), ('대구광역시', 2020),
             ('대전광역시', 2010), ('대전광역시', 2020),
             ('광주광역시', 2010), ('광주광역시', 2020)]
idx_tuples

[('서울특별시', 2010),
 ('서울특별시', 2020),
 ('부산광역시', 2010),
 ('부산광역시', 2020),
 ('인천광역시', 2010),
 ('인천광역시', 2020),
 ('대구광역시', 2010),
 ('대구광역시', 2020),
 ('대전광역시', 2010),
 ('대전광역시', 2020),
 ('광주광역시', 2010),
 ('광주광역시', 2020)]

In [63]:
pop_tuples = [100000,200000,
             200000,300000,
             400000,403030,
             2020202,2010207,
             2010101,4212334,
             1010202,9998777]
population = pd.Series(pop_tuples, index=idx_tuples)
population

(서울특별시, 2010)     100000
(서울특별시, 2020)     200000
(부산광역시, 2010)     200000
(부산광역시, 2020)     300000
(인천광역시, 2010)     400000
(인천광역시, 2020)     403030
(대구광역시, 2010)    2020202
(대구광역시, 2020)    2010207
(대전광역시, 2010)    2010101
(대전광역시, 2020)    4212334
(광주광역시, 2010)    1010202
(광주광역시, 2020)    9998777
dtype: int64

In [64]:
midx = pd.MultiIndex.from_tuples(idx_tuples)
midx

MultiIndex([('서울특별시', 2010),
            ('서울특별시', 2020),
            ('부산광역시', 2010),
            ('부산광역시', 2020),
            ('인천광역시', 2010),
            ('인천광역시', 2020),
            ('대구광역시', 2010),
            ('대구광역시', 2020),
            ('대전광역시', 2010),
            ('대전광역시', 2020),
            ('광주광역시', 2010),
            ('광주광역시', 2020)],
           )

In [65]:
population = population.reindex(midx)
population

서울특별시  2010     100000
       2020     200000
부산광역시  2010     200000
       2020     300000
인천광역시  2010     400000
       2020     403030
대구광역시  2010    2020202
       2020    2010207
대전광역시  2010    2010101
       2020    4212334
광주광역시  2010    1010202
       2020    9998777
dtype: int64

In [66]:
population[:,2010]

서울특별시     100000
부산광역시     200000
인천광역시     400000
대구광역시    2020202
대전광역시    2010101
광주광역시    1010202
dtype: int64

In [67]:
population['대전광역시',:]

2010    2010101
2020    4212334
dtype: int64

In [68]:
korea_mdf = population.unstack() # 다중인덱싱 -> 인덱스로 변환
korea_mdf

Unnamed: 0,2010,2020
광주광역시,1010202,9998777
대구광역시,2020202,2010207
대전광역시,2010101,4212334
부산광역시,200000,300000
서울특별시,100000,200000
인천광역시,400000,403030


In [69]:
korea_mdf.stack()

광주광역시  2010    1010202
       2020    9998777
대구광역시  2010    2020202
       2020    2010207
대전광역시  2010    2010101
       2020    4212334
부산광역시  2010     200000
       2020     300000
서울특별시  2010     100000
       2020     200000
인천광역시  2010     400000
       2020     403030
dtype: int64

In [70]:
male_tuple = [10000,20000,
             20000,30000,
             40000,40300,
             202002,201007,
             201101,421234,
             100202,999877]
male_tuple

[10000,
 20000,
 20000,
 30000,
 40000,
 40300,
 202002,
 201007,
 201101,
 421234,
 100202,
 999877]

In [71]:
korea_mdf = pd.DataFrame({'총인구수':population,
                        '남자인구수':male_tuple})
korea_mdf

Unnamed: 0,Unnamed: 1,총인구수,남자인구수
서울특별시,2010,100000,10000
서울특별시,2020,200000,20000
부산광역시,2010,200000,20000
부산광역시,2020,300000,30000
인천광역시,2010,400000,40000
인천광역시,2020,403030,40300
대구광역시,2010,2020202,202002
대구광역시,2020,2010207,201007
대전광역시,2010,2010101,201101
대전광역시,2020,4212334,421234


In [72]:
female_tuple = [100,200,
             9000,9000,
             400,403,
             2020,2010,
             2001,4214,
             1002,9997]
female_tuple

[100, 200, 9000, 9000, 400, 403, 2020, 2010, 2001, 4214, 1002, 9997]

In [73]:
korea_mdf = pd.DataFrame({'총인구수':population,
                         '남자인구수':male_tuple,
                         '여자인구수':female_tuple})
korea_mdf

Unnamed: 0,Unnamed: 1,총인구수,남자인구수,여자인구수
서울특별시,2010,100000,10000,100
서울특별시,2020,200000,20000,200
부산광역시,2010,200000,20000,9000
부산광역시,2020,300000,30000,9000
인천광역시,2010,400000,40000,400
인천광역시,2020,403030,40300,403
대구광역시,2010,2020202,202002,2020
대구광역시,2020,2010207,201007,2010
대전광역시,2010,2010101,201101,2001
대전광역시,2020,4212334,421234,4214


In [74]:
ratio = korea_mdf['남자인구수'] * 100 / korea_mdf['여자인구수']
ratio

서울특별시  2010    10000.000000
       2020    10000.000000
부산광역시  2010      222.222222
       2020      333.333333
인천광역시  2010    10000.000000
       2020    10000.000000
대구광역시  2010    10000.099010
       2020    10000.348259
대전광역시  2010    10050.024988
       2020     9996.060750
광주광역시  2010    10000.199601
       2020    10001.770531
dtype: float64

In [75]:
ratio.unstack()

Unnamed: 0,2010,2020
광주광역시,10000.199601,10001.770531
대구광역시,10000.09901,10000.348259
대전광역시,10050.024988,9996.06075
부산광역시,222.222222,333.333333
서울특별시,10000.0,10000.0
인천광역시,10000.0,10000.0


In [76]:
korea_mdf = pd.DataFrame({'총인구수':population,
                         '남자인구수':male_tuple,
                         '여자인구수':female_tuple,
                         '남여비율':ratio})
korea_mdf

Unnamed: 0,Unnamed: 1,총인구수,남자인구수,여자인구수,남여비율
서울특별시,2010,100000,10000,100,10000.0
서울특별시,2020,200000,20000,200,10000.0
부산광역시,2010,200000,20000,9000,222.222222
부산광역시,2020,300000,30000,9000,333.333333
인천광역시,2010,400000,40000,400,10000.0
인천광역시,2020,403030,40300,403,10000.0
대구광역시,2010,2020202,202002,2020,10000.09901
대구광역시,2020,2010207,201007,2010,10000.348259
대전광역시,2010,2010101,201101,2001,10050.024988
대전광역시,2020,4212334,421234,4214,9996.06075


#### 다중 인덱스 생성

In [77]:
df = pd.DataFrame(np.random.rand(6,3),
                 index=[['a','a','b','b','c','c'], [1,2,1,2,1,2]],
                 columns=['c1','c2','c3'])
df

Unnamed: 0,Unnamed: 1,c1,c2,c3
a,1,0.567022,0.734928,0.241799
a,2,0.344684,0.153564,0.179844
b,1,0.228863,0.542069,0.5651
b,2,0.578688,0.558334,0.43436
c,1,0.742261,0.69943,0.649841
c,2,0.040363,0.552271,0.544391


In [78]:
pd.MultiIndex.from_arrays([['a','a','b','b','c','c'], [1,2,1,2,1,2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2),
            ('c', 1),
            ('c', 2)],
           )

In [79]:
pd.MultiIndex.from_tuples([('a',1),('a',2),('b',1),('b',2), ('c',1),('c',2)])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2),
            ('c', 1),
            ('c', 2)],
           )

In [80]:
pd.MultiIndex.from_product([['a','b','c'],[1,2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2),
            ('c', 1),
            ('c', 2)],
           )

In [81]:
pd.MultiIndex(levels=[['a','b','c'], [1,2]],
             codes=[[0,0,1,1,2,2], [0,1,0,1,0,1]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2),
            ('c', 1),
            ('c', 2)],
           )

In [82]:
population.index.names = ['행정구역','년도']
population

행정구역   년도  
서울특별시  2010     100000
       2020     200000
부산광역시  2010     200000
       2020     300000
인천광역시  2010     400000
       2020     403030
대구광역시  2010    2020202
       2020    2010207
대전광역시  2010    2010101
       2020    4212334
광주광역시  2010    1010202
       2020    9998777
dtype: int64

In [83]:
idx = pd.MultiIndex.from_product([['a','b','c'],[1,2]],
                                names=['name1','name2'])
cols = pd.MultiIndex.from_product([['c1','c2','c3'], [1,2]],
                                 names = ['col_name1','col_name2'])
data = np.round(np.random.randn(6,6),2)
mdf = pd.DataFrame(data, index=idx, columns=cols)
mdf

Unnamed: 0_level_0,col_name1,c1,c1,c2,c2,c3,c3
Unnamed: 0_level_1,col_name2,1,2,1,2,1,2
name1,name2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
a,1,1.43,1.26,0.94,1.1,0.87,1.31
a,2,-0.68,-0.17,-0.69,0.92,0.16,0.3
b,1,0.38,0.33,-1.12,-0.11,0.09,-1.52
b,2,0.96,-1.38,-0.44,1.37,-0.16,0.35
c,1,-0.08,0.73,1.47,-0.25,1.36,1.12
c,2,0.17,0.77,-0.82,-0.47,-0.66,-0.21


In [84]:
mdf['c2']

Unnamed: 0_level_0,col_name2,1,2
name1,name2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0.94,1.1
a,2,-0.69,0.92
b,1,-1.12,-0.11
b,2,-0.44,1.37
c,1,1.47,-0.25
c,2,-0.82,-0.47


#### 인덱싱 및 슬라이싱

In [85]:
population

행정구역   년도  
서울특별시  2010     100000
       2020     200000
부산광역시  2010     200000
       2020     300000
인천광역시  2010     400000
       2020     403030
대구광역시  2010    2020202
       2020    2010207
대전광역시  2010    2010101
       2020    4212334
광주광역시  2010    1010202
       2020    9998777
dtype: int64

In [86]:
population['인천광역시', 2010]

400000

In [87]:
population[:,2010]

행정구역
서울특별시     100000
부산광역시     200000
인천광역시     400000
대구광역시    2020202
대전광역시    2010101
광주광역시    1010202
dtype: int64

In [88]:
population[population > 300000]

행정구역   년도  
인천광역시  2010     400000
       2020     403030
대구광역시  2010    2020202
       2020    2010207
대전광역시  2010    2010101
       2020    4212334
광주광역시  2010    1010202
       2020    9998777
dtype: int64

In [89]:
population[['대구광역시','대전광역시']]

행정구역   년도  
대구광역시  2010    2020202
       2020    2010207
대전광역시  2010    2010101
       2020    4212334
dtype: int64

In [90]:
mdf

Unnamed: 0_level_0,col_name1,c1,c1,c2,c2,c3,c3
Unnamed: 0_level_1,col_name2,1,2,1,2,1,2
name1,name2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
a,1,1.43,1.26,0.94,1.1,0.87,1.31
a,2,-0.68,-0.17,-0.69,0.92,0.16,0.3
b,1,0.38,0.33,-1.12,-0.11,0.09,-1.52
b,2,0.96,-1.38,-0.44,1.37,-0.16,0.35
c,1,-0.08,0.73,1.47,-0.25,1.36,1.12
c,2,0.17,0.77,-0.82,-0.47,-0.66,-0.21


In [91]:
mdf['c2',1]

name1  name2
a      1        0.94
       2       -0.69
b      1       -1.12
       2       -0.44
c      1        1.47
       2       -0.82
Name: (c2, 1), dtype: float64

In [92]:
mdf.iloc[:3,:4]

Unnamed: 0_level_0,col_name1,c1,c1,c2,c2
Unnamed: 0_level_1,col_name2,1,2,1,2
name1,name2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,1,1.43,1.26,0.94,1.1
a,2,-0.68,-0.17,-0.69,0.92
b,1,0.38,0.33,-1.12,-0.11


In [93]:
mdf.loc[:,('c2',1)]

name1  name2
a      1        0.94
       2       -0.69
b      1       -1.12
       2       -0.44
c      1        1.47
       2       -0.82
Name: (c2, 1), dtype: float64

In [94]:
idx_slice = pd.IndexSlice
mdf.loc[idx_slice[:,2], idx_slice[:,2]]

Unnamed: 0_level_0,col_name1,c1,c2,c3
Unnamed: 0_level_1,col_name2,2,2,2
name1,name2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,2,-0.17,0.92,0.3
b,2,-1.38,1.37,0.35
c,2,0.77,-0.47,-0.21


#### 다중 인덱스 재정렬

In [95]:
idx

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2),
            ('c', 1),
            ('c', 2)],
           names=['name1', 'name2'])

In [96]:
korea_mdf

Unnamed: 0_level_0,Unnamed: 1_level_0,총인구수,남자인구수,여자인구수,남여비율
행정구역,년도,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
서울특별시,2010,100000,10000,100,10000.0
서울특별시,2020,200000,20000,200,10000.0
부산광역시,2010,200000,20000,9000,222.222222
부산광역시,2020,300000,30000,9000,333.333333
인천광역시,2010,400000,40000,400,10000.0
인천광역시,2020,403030,40300,403,10000.0
대구광역시,2010,2020202,202002,2020,10000.09901
대구광역시,2020,2010207,201007,2010,10000.348259
대전광역시,2010,2010101,201101,2001,10050.024988
대전광역시,2020,4212334,421234,4214,9996.06075


In [97]:
# korea_mdf['서울특별시':'인천광역시'] # 정렬을 해줘야함.

korea_mdf = korea_mdf.sort_index()
korea_mdf

Unnamed: 0_level_0,Unnamed: 1_level_0,총인구수,남자인구수,여자인구수,남여비율
행정구역,년도,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
광주광역시,2010,1010202,100202,1002,10000.199601
광주광역시,2020,9998777,999877,9997,10001.770531
대구광역시,2010,2020202,202002,2020,10000.09901
대구광역시,2020,2010207,201007,2010,10000.348259
대전광역시,2010,2010101,201101,2001,10050.024988
대전광역시,2020,4212334,421234,4214,9996.06075
부산광역시,2010,200000,20000,9000,222.222222
부산광역시,2020,300000,30000,9000,333.333333
서울특별시,2010,100000,10000,100,10000.0
서울특별시,2020,200000,20000,200,10000.0


In [98]:
korea_mdf['서울특별시':'인천광역시']

Unnamed: 0_level_0,Unnamed: 1_level_0,총인구수,남자인구수,여자인구수,남여비율
행정구역,년도,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
서울특별시,2010,100000,10000,100,10000.0
서울특별시,2020,200000,20000,200,10000.0
인천광역시,2010,400000,40000,400,10000.0
인천광역시,2020,403030,40300,403,10000.0


In [99]:
korea_mdf.unstack(level=0)

Unnamed: 0_level_0,총인구수,총인구수,총인구수,총인구수,총인구수,총인구수,남자인구수,남자인구수,남자인구수,남자인구수,...,여자인구수,여자인구수,여자인구수,여자인구수,남여비율,남여비율,남여비율,남여비율,남여비율,남여비율
행정구역,광주광역시,대구광역시,대전광역시,부산광역시,서울특별시,인천광역시,광주광역시,대구광역시,대전광역시,부산광역시,...,대전광역시,부산광역시,서울특별시,인천광역시,광주광역시,대구광역시,대전광역시,부산광역시,서울특별시,인천광역시
년도,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2010,1010202,2020202,2010101,200000,100000,400000,100202,202002,201101,20000,...,2001,9000,100,400,10000.199601,10000.09901,10050.024988,222.222222,10000.0,10000.0
2020,9998777,2010207,4212334,300000,200000,403030,999877,201007,421234,30000,...,4214,9000,200,403,10001.770531,10000.348259,9996.06075,333.333333,10000.0,10000.0


In [100]:
korea_mdf.unstack(level=1)

Unnamed: 0_level_0,총인구수,총인구수,남자인구수,남자인구수,여자인구수,여자인구수,남여비율,남여비율
년도,2010,2020,2010,2020,2010,2020,2010,2020
행정구역,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
광주광역시,1010202,9998777,100202,999877,1002,9997,10000.199601,10001.770531
대구광역시,2020202,2010207,202002,201007,2020,2010,10000.09901,10000.348259
대전광역시,2010101,4212334,201101,421234,2001,4214,10050.024988,9996.06075
부산광역시,200000,300000,20000,30000,9000,9000,222.222222,333.333333
서울특별시,100000,200000,10000,20000,100,200,10000.0,10000.0
인천광역시,400000,403030,40000,40300,400,403,10000.0,10000.0


In [101]:
korea_mdf.stack()

행정구역   년도         
광주광역시  2010  총인구수     1.010202e+06
             남자인구수    1.002020e+05
             여자인구수    1.002000e+03
             남여비율     1.000020e+04
       2020  총인구수     9.998777e+06
             남자인구수    9.998770e+05
             여자인구수    9.997000e+03
             남여비율     1.000177e+04
대구광역시  2010  총인구수     2.020202e+06
             남자인구수    2.020020e+05
             여자인구수    2.020000e+03
             남여비율     1.000010e+04
       2020  총인구수     2.010207e+06
             남자인구수    2.010070e+05
             여자인구수    2.010000e+03
             남여비율     1.000035e+04
대전광역시  2010  총인구수     2.010101e+06
             남자인구수    2.011010e+05
             여자인구수    2.001000e+03
             남여비율     1.005002e+04
       2020  총인구수     4.212334e+06
             남자인구수    4.212340e+05
             여자인구수    4.214000e+03
             남여비율     9.996061e+03
부산광역시  2010  총인구수     2.000000e+05
             남자인구수    2.000000e+04
             여자인구수    9.000000e+03
             남여비율     2.222222e+02
 

In [102]:
korea_mdf

Unnamed: 0_level_0,Unnamed: 1_level_0,총인구수,남자인구수,여자인구수,남여비율
행정구역,년도,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
광주광역시,2010,1010202,100202,1002,10000.199601
광주광역시,2020,9998777,999877,9997,10001.770531
대구광역시,2010,2020202,202002,2020,10000.09901
대구광역시,2020,2010207,201007,2010,10000.348259
대전광역시,2010,2010101,201101,2001,10050.024988
대전광역시,2020,4212334,421234,4214,9996.06075
부산광역시,2010,200000,20000,9000,222.222222
부산광역시,2020,300000,30000,9000,333.333333
서울특별시,2010,100000,10000,100,10000.0
서울특별시,2020,200000,20000,200,10000.0


In [103]:
idx_flat = korea_mdf.reset_index(level=0)
idx_flat

Unnamed: 0_level_0,행정구역,총인구수,남자인구수,여자인구수,남여비율
년도,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010,광주광역시,1010202,100202,1002,10000.199601
2020,광주광역시,9998777,999877,9997,10001.770531
2010,대구광역시,2020202,202002,2020,10000.09901
2020,대구광역시,2010207,201007,2010,10000.348259
2010,대전광역시,2010101,201101,2001,10050.024988
2020,대전광역시,4212334,421234,4214,9996.06075
2010,부산광역시,200000,20000,9000,222.222222
2020,부산광역시,300000,30000,9000,333.333333
2010,서울특별시,100000,10000,100,10000.0
2020,서울특별시,200000,20000,200,10000.0


In [104]:
idx_flat = korea_mdf.reset_index(level=(0,1))
idx_flat

Unnamed: 0,행정구역,년도,총인구수,남자인구수,여자인구수,남여비율
0,광주광역시,2010,1010202,100202,1002,10000.199601
1,광주광역시,2020,9998777,999877,9997,10001.770531
2,대구광역시,2010,2020202,202002,2020,10000.09901
3,대구광역시,2020,2010207,201007,2010,10000.348259
4,대전광역시,2010,2010101,201101,2001,10050.024988
5,대전광역시,2020,4212334,421234,4214,9996.06075
6,부산광역시,2010,200000,20000,9000,222.222222
7,부산광역시,2020,300000,30000,9000,333.333333
8,서울특별시,2010,100000,10000,100,10000.0
9,서울특별시,2020,200000,20000,200,10000.0


In [105]:
idx_flat.set_index(['행정구역','년도'])

Unnamed: 0_level_0,Unnamed: 1_level_0,총인구수,남자인구수,여자인구수,남여비율
행정구역,년도,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
광주광역시,2010,1010202,100202,1002,10000.199601
광주광역시,2020,9998777,999877,9997,10001.770531
대구광역시,2010,2020202,202002,2020,10000.09901
대구광역시,2020,2010207,201007,2010,10000.348259
대전광역시,2010,2010101,201101,2001,10050.024988
대전광역시,2020,4212334,421234,4214,9996.06075
부산광역시,2010,200000,20000,9000,222.222222
부산광역시,2020,300000,30000,9000,333.333333
서울특별시,2010,100000,10000,100,10000.0
서울특별시,2020,200000,20000,200,10000.0


## 데이터 연산

In [106]:
s = pd.Series(np.random.randint(0,10,5))
s

0    1
1    6
2    5
3    5
4    7
dtype: int32

In [107]:
df = pd.DataFrame(np.random.randint(0,10,(3,3)),
                 columns = ['a','b','c'])
df

Unnamed: 0,a,b,c
0,7,1,6
1,7,3,1
2,9,3,0


In [108]:
np.exp(s)

0       2.718282
1     403.428793
2     148.413159
3     148.413159
4    1096.633158
dtype: float64

In [109]:
np.cos(df * np.pi / 4)

Unnamed: 0,a,b,c
0,0.707107,0.707107,-1.83697e-16
1,0.707107,-0.707107,0.7071068
2,0.707107,-0.707107,1.0


In [110]:
s1 = pd.Series([1,3,5,7,9], index=[0,1,2,3,4])
s2 = pd.Series([2,4,6,8,10], index=[1,2,3,4,5])
s1 + s2

0     NaN
1     5.0
2     9.0
3    13.0
4    17.0
5     NaN
dtype: float64

In [111]:
s1.add(s2, fill_value=0) # s1, s2 중 하나만있어도 값이 나오게 함.

0     1.0
1     5.0
2     9.0
3    13.0
4    17.0
5    10.0
dtype: float64

In [112]:
df1 = pd.DataFrame(np.random.randint(0,20,(3,3)),
                  columns=list('acd'))
df1

Unnamed: 0,a,c,d
0,17,4,1
1,7,15,4
2,11,6,16


In [113]:
df2 = pd.DataFrame(np.random.randint(0,20,(5,5)),
                  columns=list('baecd'))
df2

Unnamed: 0,b,a,e,c,d
0,6,10,4,15,17
1,14,5,2,18,0
2,18,16,12,11,4
3,9,14,0,11,16
4,17,5,17,0,14


In [114]:
df1 + df2

Unnamed: 0,a,b,c,d,e
0,27.0,,19.0,18.0,
1,12.0,,33.0,4.0,
2,27.0,,17.0,20.0,
3,,,,,
4,,,,,


In [115]:
fvalue = df1.stack().mean()
df1.add(df2, fill_value=fvalue)

Unnamed: 0,a,b,c,d,e
0,27.0,15.0,19.0,18.0,13.0
1,12.0,23.0,33.0,4.0,11.0
2,27.0,27.0,17.0,20.0,21.0
3,23.0,18.0,20.0,25.0,9.0
4,14.0,26.0,9.0,23.0,26.0


### 연산자 범용 함수


#### add()

In [116]:
a = np.random.randint(1,10, size=(3,3))
a

array([[6, 3, 6],
       [7, 3, 5],
       [4, 5, 9]])

In [117]:
a + a[0]

array([[12,  6, 12],
       [13,  6, 11],
       [10,  8, 15]])

In [118]:
df = pd.DataFrame(a, columns = list('abc'))
df

Unnamed: 0,a,b,c
0,6,3,6
1,7,3,5
2,4,5,9


In [119]:
df + df.iloc[0]

Unnamed: 0,a,b,c
0,12,6,12
1,13,6,11
2,10,8,15


In [120]:
df.add(df.iloc[0])

Unnamed: 0,a,b,c
0,12,6,12
1,13,6,11
2,10,8,15


#### sub() / subtract()

In [121]:
a

array([[6, 3, 6],
       [7, 3, 5],
       [4, 5, 9]])

In [122]:
a - a[0]

array([[ 0,  0,  0],
       [ 1,  0, -1],
       [-2,  2,  3]])

In [123]:
df

Unnamed: 0,a,b,c
0,6,3,6
1,7,3,5
2,4,5,9


In [124]:
df - df.iloc[0]

Unnamed: 0,a,b,c
0,0,0,0
1,1,0,-1
2,-2,2,3


In [125]:
df.sub(df.iloc[0])

Unnamed: 0,a,b,c
0,0,0,0
1,1,0,-1
2,-2,2,3


In [126]:
df.subtract(df['b'], axis=0)

Unnamed: 0,a,b,c
0,3,0,3
1,4,0,2
2,-1,0,4


#### mul() / multply()




In [127]:
a

array([[6, 3, 6],
       [7, 3, 5],
       [4, 5, 9]])

In [128]:
a * a[1]

array([[42,  9, 30],
       [49,  9, 25],
       [28, 15, 45]])

In [129]:
df

Unnamed: 0,a,b,c
0,6,3,6
1,7,3,5
2,4,5,9


In [130]:
df * df.iloc[1]

Unnamed: 0,a,b,c
0,42,9,30
1,49,9,25
2,28,15,45


In [131]:
df.mul(df.iloc[1])

Unnamed: 0,a,b,c
0,42,9,30
1,49,9,25
2,28,15,45


In [132]:
df.multiply(df.iloc[2])

Unnamed: 0,a,b,c
0,24,15,54
1,28,15,45
2,16,25,81


#### truediv() /  div() / divide() / floordiv()

In [133]:
a

array([[6, 3, 6],
       [7, 3, 5],
       [4, 5, 9]])

In [134]:
a / a[0]

array([[1.        , 1.        , 1.        ],
       [1.16666667, 1.        , 0.83333333],
       [0.66666667, 1.66666667, 1.5       ]])

In [135]:
df / df.iloc[0]

Unnamed: 0,a,b,c
0,1.0,1.0,1.0
1,1.166667,1.0,0.833333
2,0.666667,1.666667,1.5


In [136]:
df.truediv(df.iloc[0])

Unnamed: 0,a,b,c
0,1.0,1.0,1.0
1,1.166667,1.0,0.833333
2,0.666667,1.666667,1.5


In [137]:
df.divide(df.iloc[2])

Unnamed: 0,a,b,c
0,1.5,0.6,0.666667
1,1.75,0.6,0.555556
2,1.0,1.0,1.0


In [138]:
a // a[0]

array([[1, 1, 1],
       [1, 1, 0],
       [0, 1, 1]], dtype=int32)

In [139]:
df.floordiv(df.iloc[0])

Unnamed: 0,a,b,c
0,1,1,1
1,1,1,0
2,0,1,1


#### mod()

In [140]:
a

array([[6, 3, 6],
       [7, 3, 5],
       [4, 5, 9]])

In [141]:
a%a[0]

array([[0, 0, 0],
       [1, 0, 5],
       [4, 2, 3]], dtype=int32)

In [142]:
df

Unnamed: 0,a,b,c
0,6,3,6
1,7,3,5
2,4,5,9


In [143]:
df.mod(df.iloc[0])

Unnamed: 0,a,b,c
0,0,0,0
1,1,0,5
2,4,2,3


#### pow()

In [144]:
a

array([[6, 3, 6],
       [7, 3, 5],
       [4, 5, 9]])

In [145]:
a ** a[0]

array([[ 46656,     27,  46656],
       [117649,     27,  15625],
       [  4096,    125, 531441]], dtype=int32)

In [146]:
df

Unnamed: 0,a,b,c
0,6,3,6
1,7,3,5
2,4,5,9


In [147]:
df.pow(df.iloc[0])

Unnamed: 0,a,b,c
0,46656,27,46656
1,117649,27,15625
2,4096,125,531441


In [148]:
row = df.iloc[0,::2]
row

a    6
c    6
Name: 0, dtype: int32

In [149]:
df - row

Unnamed: 0,a,b,c
0,0.0,,0.0
1,1.0,,-1.0
2,-2.0,,3.0


### 정렬(Sort)

In [150]:
s = pd.Series(range(5), index=['a','d','b','c','e'])
s

a    0
d    1
b    2
c    3
e    4
dtype: int64

In [151]:
s.sort_index()

a    0
b    2
c    3
d    1
e    4
dtype: int64

In [152]:
s.sort_values()

a    0
d    1
b    2
c    3
e    4
dtype: int64

In [153]:
df = pd.DataFrame(np.random.randint(0,10,(4,4)),
                 index=[2,4,1,3],
                 columns=list('bdac'))
df

Unnamed: 0,b,d,a,c
2,6,1,3,9
4,1,1,2,5
1,2,8,2,6
3,8,5,1,6


In [154]:
df.sort_index()

Unnamed: 0,b,d,a,c
1,2,8,2,6
2,6,1,3,9
3,8,5,1,6
4,1,1,2,5


In [155]:
df.sort_index(axis=1)

Unnamed: 0,a,b,c,d
2,3,6,9,1
4,2,1,5,1
1,2,2,6,8
3,1,8,6,5


In [156]:
df.sort_values(by='a')

Unnamed: 0,b,d,a,c
3,8,5,1,6
4,1,1,2,5
1,2,8,2,6
2,6,1,3,9


In [157]:
df.sort_values(by=['a','c'])

Unnamed: 0,b,d,a,c
3,8,5,1,6
4,1,1,2,5
1,2,8,2,6
2,6,1,3,9


### 순위(Ranking)


In [158]:
s = pd.Series([-2,4,7,3,0,7,5,-4,2,6])
s

0   -2
1    4
2    7
3    3
4    0
5    7
6    5
7   -4
8    2
9    6
dtype: int64

In [159]:
s.rank()

0    2.0
1    6.0
2    9.5
3    5.0
4    3.0
5    9.5
6    7.0
7    1.0
8    4.0
9    8.0
dtype: float64

In [160]:
s.rank(method='first')

0     2.0
1     6.0
2     9.0
3     5.0
4     3.0
5    10.0
6     7.0
7     1.0
8     4.0
9     8.0
dtype: float64

In [161]:
s.rank(method='max')

0     2.0
1     6.0
2    10.0
3     5.0
4     3.0
5    10.0
6     7.0
7     1.0
8     4.0
9     8.0
dtype: float64

### 고성능 연산(eval, query)

In [162]:
nrows, ncols = 10000, 100
df1, df2, df3, df4 = (pd.DataFrame(np.random.rand(nrows, ncols)) for i in range(4))

In [163]:
%timeit df1 + df2 + df3 + df4

7.61 ms ± 405 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [164]:
%timeit pd.eval('df1 + df2 + df3 + df4')

4.27 ms ± 93.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [165]:
%timeit df1 * -df2 / (-df3 * df4)

11.8 ms ± 75.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [166]:
%timeit pd.eval('df1 * -df2 / (-df3 * df4)')

4.49 ms ± 164 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [167]:
%timeit (df1 < df2) & (df2 <= df3) & (df3 !=df4)

5.85 ms ± 538 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [168]:
%timeit pd.eval('(df1 < df2) & (df2 <= df3) & (df3 !=df4)')

4.26 ms ± 92.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [169]:
df = pd.DataFrame(np.random.rand(1000000,5), columns = ['a','b','c','d','e'])
df.head()

Unnamed: 0,a,b,c,d,e
0,0.324513,0.9813,0.146694,0.740812,0.879683
1,0.032261,0.536478,0.200518,0.678565,0.309724
2,0.272849,0.547912,0.076977,0.917584,0.46663
3,0.694479,0.21403,0.218744,0.111508,0.07075
4,0.66946,0.6211,0.483179,0.432387,0.844562


In [170]:
%timeit df['a'] + df['b'] / df['c'] - df['d'] * df['e']

15.6 ms ± 1.81 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [171]:
%timeit pd.eval('df.a + df.b / df.c - df.d * df.e')

5.72 ms ± 89.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [172]:
%timeit df.eval('a + b / c - d * e')

10.7 ms ± 211 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [173]:
df.eval('r = a + b / c - d * e',inplace=True)
df.head()

Unnamed: 0,a,b,c,d,e,r
0,0.324513,0.9813,0.146694,0.740812,0.879683,6.362257
1,0.032261,0.536478,0.200518,0.678565,0.309724,2.49755
2,0.272849,0.547912,0.076977,0.917584,0.46663,6.962552
3,0.694479,0.21403,0.218744,0.111508,0.07075,1.665043
4,0.66946,0.6211,0.483179,0.432387,0.844562,1.589729


In [175]:
col_mean = df.mean(1)
df['a'] + col_mean

0         1.897057
1         0.741444
2         1.813599
3         1.190238
4         1.442863
            ...   
999995    1.316988
999996    3.845501
999997    0.769654
999998    1.094536
999999    0.800619
Length: 1000000, dtype: float64

In [176]:
df.eval('a + @col_mean')

0         1.897057
1         0.741444
2         1.813599
3         1.190238
4         1.442863
            ...   
999995    1.316988
999996    3.845501
999997    0.769654
999998    1.094536
999999    0.800619
Length: 1000000, dtype: float64

In [177]:
df[(df.a < 0.5 ) & (df.b < 0.5) & (df.c > 0.5)]

Unnamed: 0,a,b,c,d,e,r
7,0.000203,0.275657,0.511746,0.777802,0.847280,-0.120153
10,0.441214,0.180447,0.844174,0.768968,0.188667,0.509890
30,0.309171,0.376721,0.926108,0.104057,0.868509,0.625575
31,0.036658,0.325236,0.592668,0.509157,0.983854,0.084487
41,0.119798,0.377044,0.514189,0.713841,0.594458,0.428728
...,...,...,...,...,...,...
999935,0.016681,0.123358,0.788095,0.646211,0.196306,0.046352
999947,0.055110,0.338223,0.912682,0.390614,0.411103,0.265109
999962,0.134500,0.170152,0.526387,0.265751,0.950757,0.205080
999988,0.291257,0.140974,0.522645,0.761549,0.737715,-0.000817


In [178]:
pd.eval('df[(df.a < 0.5 ) & (df.b < 0.5) & (df.c > 0.5)]')

Unnamed: 0,a,b,c,d,e,r
7,0.000203,0.275657,0.511746,0.777802,0.847280,-0.120153
10,0.441214,0.180447,0.844174,0.768968,0.188667,0.509890
30,0.309171,0.376721,0.926108,0.104057,0.868509,0.625575
31,0.036658,0.325236,0.592668,0.509157,0.983854,0.084487
41,0.119798,0.377044,0.514189,0.713841,0.594458,0.428728
...,...,...,...,...,...,...
999935,0.016681,0.123358,0.788095,0.646211,0.196306,0.046352
999947,0.055110,0.338223,0.912682,0.390614,0.411103,0.265109
999962,0.134500,0.170152,0.526387,0.265751,0.950757,0.205080
999988,0.291257,0.140974,0.522645,0.761549,0.737715,-0.000817


In [180]:
df.query('(a < 0.5) and (b < 0.5) and (c > 0.5)')

Unnamed: 0,a,b,c,d,e,r
7,0.000203,0.275657,0.511746,0.777802,0.847280,-0.120153
10,0.441214,0.180447,0.844174,0.768968,0.188667,0.509890
30,0.309171,0.376721,0.926108,0.104057,0.868509,0.625575
31,0.036658,0.325236,0.592668,0.509157,0.983854,0.084487
41,0.119798,0.377044,0.514189,0.713841,0.594458,0.428728
...,...,...,...,...,...,...
999935,0.016681,0.123358,0.788095,0.646211,0.196306,0.046352
999947,0.055110,0.338223,0.912682,0.390614,0.411103,0.265109
999962,0.134500,0.170152,0.526387,0.265751,0.950757,0.205080
999988,0.291257,0.140974,0.522645,0.761549,0.737715,-0.000817


In [181]:
col_mean = df['d'].mean()
df[(df.a < col_mean) & (df.b < col_mean)]

Unnamed: 0,a,b,c,d,e,r
6,0.373876,0.455643,0.118243,0.446538,0.064860,4.198354
7,0.000203,0.275657,0.511746,0.777802,0.847280,-0.120153
10,0.441214,0.180447,0.844174,0.768968,0.188667,0.509890
11,0.040775,0.037103,0.390413,0.164027,0.152431,0.110806
21,0.492143,0.184041,0.037820,0.483285,0.741904,4.999783
...,...,...,...,...,...,...
999989,0.451339,0.012319,0.119308,0.865681,0.214441,0.368952
999990,0.404735,0.039465,0.000832,0.670600,0.979463,47.159333
999995,0.219086,0.355704,0.064821,0.195660,0.056682,5.695455
999997,0.289613,0.241620,0.191625,0.520742,0.179722,1.456923


In [182]:
df.query('a < @col_mean and b < @col_mean')

Unnamed: 0,a,b,c,d,e,r
6,0.373876,0.455643,0.118243,0.446538,0.064860,4.198354
7,0.000203,0.275657,0.511746,0.777802,0.847280,-0.120153
10,0.441214,0.180447,0.844174,0.768968,0.188667,0.509890
11,0.040775,0.037103,0.390413,0.164027,0.152431,0.110806
21,0.492143,0.184041,0.037820,0.483285,0.741904,4.999783
...,...,...,...,...,...,...
999989,0.451339,0.012319,0.119308,0.865681,0.214441,0.368952
999990,0.404735,0.039465,0.000832,0.670600,0.979463,47.159333
999995,0.219086,0.355704,0.064821,0.195660,0.056682,5.695455
999997,0.289613,0.241620,0.191625,0.520742,0.179722,1.456923


## 데이터 결합

### Concat() / Append()

In [183]:
s1 = pd.Series(['a','b'], index=[1,2])
s2 = pd.Series(['c','d'], index=[3,4])
pd.concat([s1, s2])

1    a
2    b
3    c
4    d
dtype: object

In [184]:
def create_df(cols, idx):
    data = {c : [str(c.lower()) + str(i) for i in idx] for c in cols}
    return pd.DataFrame(data, idx)

In [187]:
df1 = create_df ('ab', [1,2])
df1

Unnamed: 0,a,b
1,a1,b1
2,a2,b2


In [188]:
df2 = create_df('ab',[3,4])
df2

Unnamed: 0,a,b
3,a3,b3
4,a4,b4


In [189]:
pd.concat([df1,df2])

Unnamed: 0,a,b
1,a1,b1
2,a2,b2
3,a3,b3
4,a4,b4


In [190]:
df3 = create_df('ab',[0,1])
df3

Unnamed: 0,a,b
0,a0,b0
1,a1,b1


In [191]:
df4 = create_df('cd',[0,1])
df4

Unnamed: 0,c,d
0,c0,d0
1,c1,d1


In [192]:
pd.concat([df3,df4])

Unnamed: 0,a,b,c,d
0,a0,b0,,
1,a1,b1,,
0,,,c0,d0
1,,,c1,d1


In [199]:
pd.concat([df3,df4], axis=1)

Unnamed: 0,a,b,c,d
0,a0,b0,c0,d0
1,a1,b1,c1,d1


In [193]:
# pd.concat([df1, df3], verify_integrity=True)

In [200]:
pd.concat([df1,df3])

Unnamed: 0,a,b
1,a1,b1
2,a2,b2
0,a0,b0
1,a1,b1


In [194]:
pd.concat([df1,df3], ignore_index=True)

Unnamed: 0,a,b
0,a1,b1
1,a2,b2
2,a0,b0
3,a1,b1


In [195]:
pd.concat([df1,df3], keys=['X','Y'])

Unnamed: 0,Unnamed: 1,a,b
X,1,a1,b1
X,2,a2,b2
Y,0,a0,b0
Y,1,a1,b1


In [196]:
df5 = create_df('abc',[1,2])
df6 = create_df('bcd', [3,4])
pd.concat([df5,df6])

Unnamed: 0,a,b,c,d
1,a1,b1,c1,
2,a2,b2,c2,
3,,b3,c3,d3
4,,b4,c4,d4


In [197]:
pd.concat([df5,df6], join='inner')

Unnamed: 0,b,c
1,b1,c1
2,b2,c2
3,b3,c3
4,b4,c4


In [198]:
df5.append(df6)

Unnamed: 0,a,b,c,d
1,a1,b1,c1,
2,a2,b2,c2,
3,,b3,c3,d3
4,,b4,c4,d4


### 병합과 조인

In [201]:
df1 = pd.DataFrame({'학생':['홍길동','이순신','임꺽정','김유신'],
                    '학과': ['경영','교육','컴퓨터','통계']})
df1

Unnamed: 0,학생,학과
0,홍길동,경영
1,이순신,교육
2,임꺽정,컴퓨터
3,김유신,통계


In [202]:
df2 = pd.DataFrame({'학생':['홍길동','이순신','임꺽정','김유신'],
                   '입학년도':[2012,2016,2019,2020]})
df2

Unnamed: 0,학생,입학년도
0,홍길동,2012
1,이순신,2016
2,임꺽정,2019
3,김유신,2020


In [203]:
df3 = pd.merge(df1, df2)
df3

Unnamed: 0,학생,학과,입학년도
0,홍길동,경영,2012
1,이순신,교육,2016
2,임꺽정,컴퓨터,2019
3,김유신,통계,2020


In [204]:
df4 = pd.DataFrame({'학과':['경영','교육','컴퓨터','통계'],
                   '학과장': ['황희','장영실','안창호','정약용']})
df4

Unnamed: 0,학과,학과장
0,경영,황희
1,교육,장영실
2,컴퓨터,안창호
3,통계,정약용


In [205]:
pd.merge(df3,df4)

Unnamed: 0,학생,학과,입학년도,학과장
0,홍길동,경영,2012,황희
1,이순신,교육,2016,장영실
2,임꺽정,컴퓨터,2019,안창호
3,김유신,통계,2020,정약용


In [207]:
df5 = pd.DataFrame({'학과':['경영','교육','교육','컴퓨터','컴퓨터','통계'],
                   '과목':['경영개론','기초수학','물리학','프로그래밍','운영체제','확률론']})
df5

Unnamed: 0,학과,과목
0,경영,경영개론
1,교육,기초수학
2,교육,물리학
3,컴퓨터,프로그래밍
4,컴퓨터,운영체제
5,통계,확률론


In [208]:
pd.merge(df1,df5)

Unnamed: 0,학생,학과,과목
0,홍길동,경영,경영개론
1,이순신,교육,기초수학
2,이순신,교육,물리학
3,임꺽정,컴퓨터,프로그래밍
4,임꺽정,컴퓨터,운영체제
5,김유신,통계,확률론


In [209]:
pd.merge(df1,df2, on='학생')

Unnamed: 0,학생,학과,입학년도
0,홍길동,경영,2012
1,이순신,교육,2016
2,임꺽정,컴퓨터,2019
3,김유신,통계,2020


In [210]:
df6 = pd.DataFrame({'이름':['홍길동','이순신','임꺽정','김유신'],
                   '성적':['a','a+','b','a+']})
df6

Unnamed: 0,이름,성적
0,홍길동,a
1,이순신,a+
2,임꺽정,b
3,김유신,a+


In [211]:
pd.merge(df1, df6, left_on='학생', right_on='이름')

Unnamed: 0,학생,학과,이름,성적
0,홍길동,경영,홍길동,a
1,이순신,교육,이순신,a+
2,임꺽정,컴퓨터,임꺽정,b
3,김유신,통계,김유신,a+


In [214]:
pd.merge(df1, df6, left_on='학생', right_on='이름').drop('이름',axis=1)

Unnamed: 0,학생,학과,성적
0,홍길동,경영,a
1,이순신,교육,a+
2,임꺽정,컴퓨터,b
3,김유신,통계,a+


In [215]:
mdf1 = df1.set_index('학생')
mdf2 = df2.set_index('학생')

In [217]:
mdf1

Unnamed: 0_level_0,학과
학생,Unnamed: 1_level_1
홍길동,경영
이순신,교육
임꺽정,컴퓨터
김유신,통계


In [218]:
mdf2

Unnamed: 0_level_0,입학년도
학생,Unnamed: 1_level_1
홍길동,2012
이순신,2016
임꺽정,2019
김유신,2020


In [219]:
pd.merge(mdf1,mdf2,left_index=True, right_index=True)

Unnamed: 0_level_0,학과,입학년도
학생,Unnamed: 1_level_1,Unnamed: 2_level_1
홍길동,경영,2012
이순신,교육,2016
임꺽정,컴퓨터,2019
김유신,통계,2020


In [220]:
mdf1.join(mdf2)

Unnamed: 0_level_0,학과,입학년도
학생,Unnamed: 1_level_1,Unnamed: 2_level_1
홍길동,경영,2012
이순신,교육,2016
임꺽정,컴퓨터,2019
김유신,통계,2020


In [221]:
pd.merge(mdf1,df6, left_index=True, right_on='이름')

Unnamed: 0,학과,이름,성적
0,경영,홍길동,a
1,교육,이순신,a+
2,컴퓨터,임꺽정,b
3,통계,김유신,a+


In [222]:
df7 = pd.DataFrame({'이름':['홍길동','이순신','임꺽정'],
                   '주문음식':['햄버거','피자','짜장면']})
df7

Unnamed: 0,이름,주문음식
0,홍길동,햄버거
1,이순신,피자
2,임꺽정,짜장면


In [223]:
df8 = pd.DataFrame({'이름':['홍길동','이순신','김유신'],
                   '주문음료':['콜라','사이다','커피']})
df8

Unnamed: 0,이름,주문음료
0,홍길동,콜라
1,이순신,사이다
2,김유신,커피


In [224]:
pd.merge(df7,df8)

Unnamed: 0,이름,주문음식,주문음료
0,홍길동,햄버거,콜라
1,이순신,피자,사이다


In [225]:
pd.merge(df7,df8, how='inner')

Unnamed: 0,이름,주문음식,주문음료
0,홍길동,햄버거,콜라
1,이순신,피자,사이다


In [226]:
pd.merge(df7,df8, how='outer')

Unnamed: 0,이름,주문음식,주문음료
0,홍길동,햄버거,콜라
1,이순신,피자,사이다
2,임꺽정,짜장면,
3,김유신,,커피


In [227]:
pd.merge(df7, df8, how='left')

Unnamed: 0,이름,주문음식,주문음료
0,홍길동,햄버거,콜라
1,이순신,피자,사이다
2,임꺽정,짜장면,


In [228]:
pd.merge(df7, df8, how='right')

Unnamed: 0,이름,주문음식,주문음료
0,홍길동,햄버거,콜라
1,이순신,피자,사이다
2,김유신,,커피


In [229]:
df9 = pd.DataFrame({'이름':['홍길동','이순신','임꺽정','김유신'],
                   '순위': [3,4,2,1]})
df9

Unnamed: 0,이름,순위
0,홍길동,3
1,이순신,4
2,임꺽정,2
3,김유신,1


In [230]:
df10 = pd.DataFrame({'이름':['홍길동','이순신','임꺽정','김유신'],
                   '순위': [1,2,3,4]})
df10

Unnamed: 0,이름,순위
0,홍길동,1
1,이순신,2
2,임꺽정,3
3,김유신,4


In [231]:
pd.merge(df9, df10, on='이름')

Unnamed: 0,이름,순위_x,순위_y
0,홍길동,3,1
1,이순신,4,2
2,임꺽정,2,3
3,김유신,1,4


In [232]:
pd.merge(df9, df10, on='이름', suffixes=['_인기','_성적'])

Unnamed: 0,이름,순위_인기,순위_성적
0,홍길동,3,1
1,이순신,4,2
2,임꺽정,2,3
3,김유신,1,4


## 데이터 집계와 그룹 연산

#### 집계 연산(Aggregation)


In [233]:
df = pd.DataFrame([[1,1.2,np.nan],
                  [2.4,5.5,4.2],
                  [np.nan,np.nan,np.nan],
                  [0.44,-3.1,-4.1]],
                 index=[1,2,3,4],
                 columns=['a','b','c'])
df

Unnamed: 0,a,b,c
1,1.0,1.2,
2,2.4,5.5,4.2
3,,,
4,0.44,-3.1,-4.1


In [234]:
df.head(2)

Unnamed: 0,a,b,c
1,1.0,1.2,
2,2.4,5.5,4.2


In [235]:
df.tail(2)

Unnamed: 0,a,b,c
3,,,
4,0.44,-3.1,-4.1


In [236]:
df.describe()

Unnamed: 0,a,b,c
count,3.0,3.0,2.0
mean,1.28,1.2,0.05
std,1.009554,4.3,5.868986
min,0.44,-3.1,-4.1
25%,0.72,-0.95,-2.025
50%,1.0,1.2,0.05
75%,1.7,3.35,2.125
max,2.4,5.5,4.2


In [237]:
print(df)
print(np.argmin(df), np.argmax(df))

      a    b    c
1  1.00  1.2  NaN
2  2.40  5.5  4.2
3   NaN  NaN  NaN
4  0.44 -3.1 -4.1
2 2


In [238]:
print(df)
print(df.idxmin())
print(df.idxmax())

      a    b    c
1  1.00  1.2  NaN
2  2.40  5.5  4.2
3   NaN  NaN  NaN
4  0.44 -3.1 -4.1
a    4
b    4
c    4
dtype: int64
a    2
b    2
c    2
dtype: int64


In [239]:
print(df)
print(df.std())
print(df.var())

      a    b    c
1  1.00  1.2  NaN
2  2.40  5.5  4.2
3   NaN  NaN  NaN
4  0.44 -3.1 -4.1
a    1.009554
b    4.300000
c    5.868986
dtype: float64
a     1.0192
b    18.4900
c    34.4450
dtype: float64


In [240]:
print(df)
print(df.skew())
print(df.kurt())

      a    b    c
1  1.00  1.2  NaN
2  2.40  5.5  4.2
3   NaN  NaN  NaN
4  0.44 -3.1 -4.1
a    1.15207
b    0.00000
c        NaN
dtype: float64
a   NaN
b   NaN
c   NaN
dtype: float64


In [241]:
print(df)
print(df.sum())
print(df.cumsum())

      a    b    c
1  1.00  1.2  NaN
2  2.40  5.5  4.2
3   NaN  NaN  NaN
4  0.44 -3.1 -4.1
a    3.84
b    3.60
c    0.10
dtype: float64
      a    b    c
1  1.00  1.2  NaN
2  3.40  6.7  4.2
3   NaN  NaN  NaN
4  3.84  3.6  0.1


In [242]:
print(df)
print(df.prod())
print(df.cumprod())

      a    b    c
1  1.00  1.2  NaN
2  2.40  5.5  4.2
3   NaN  NaN  NaN
4  0.44 -3.1 -4.1
a     1.056
b   -20.460
c   -17.220
dtype: float64
       a      b      c
1  1.000   1.20    NaN
2  2.400   6.60   4.20
3    NaN    NaN    NaN
4  1.056 -20.46 -17.22


In [243]:
df.diff()

Unnamed: 0,a,b,c
1,,,
2,1.4,4.3,
3,,,
4,,,


In [244]:
df.corr()

Unnamed: 0,a,b,c
a,1.0,0.970725,1.0
b,0.970725,1.0,1.0
c,1.0,1.0,1.0


In [245]:
df.corrwith(df.b)

a    0.970725
b    1.000000
c    1.000000
dtype: float64

In [246]:
df.cov()

Unnamed: 0,a,b,c
a,1.0192,4.214,8.134
b,4.214,18.49,35.69
c,8.134,35.69,34.445


In [247]:
df['b'].unique()

array([ 1.2,  5.5,  nan, -3.1])

In [248]:
df['a'].value_counts()

1.00    1
2.40    1
0.44    1
Name: a, dtype: int64

### GroupBy 연산

In [249]:
df = pd.DataFrame({'c1':['a','a','b','b','c','d','b'],
                  'c2':['A','B','B','A','D','C','C'],
                  'c3': np.random.randint(7),
                  'c4': np.random.random(7)})
df

Unnamed: 0,c1,c2,c3,c4
0,a,A,1,0.900004
1,a,B,1,0.541
2,b,B,1,0.175652
3,b,A,1,0.765344
4,c,D,1,0.405133
5,d,C,1,0.694604
6,b,C,1,0.307875


In [250]:
df.dtypes

c1     object
c2     object
c3      int64
c4    float64
dtype: object

In [251]:
df['c3'].groupby(df['c1']).mean()

c1
a    1.0
b    1.0
c    1.0
d    1.0
Name: c3, dtype: float64

In [253]:
df['c4'].groupby(df['c2']).std()

c2
A    0.095219
B    0.258340
C    0.273458
D         NaN
Name: c4, dtype: float64

In [254]:
df['c4'].groupby([df['c1'], df['c2']]).mean().unstack()

c2,A,B,C,D
c1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,0.900004,0.541,,
b,0.765344,0.175652,0.307875,
c,,,,0.405133
d,,,0.694604,


In [255]:
df.groupby('c1').mean()

Unnamed: 0_level_0,c3,c4
c1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.0,0.720502
b,1.0,0.41629
c,1.0,0.405133
d,1.0,0.694604


In [256]:
df.groupby(['c1','c2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,c3,c4
c1,c2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,A,1.0,0.900004
a,B,1.0,0.541
b,A,1.0,0.765344
b,B,1.0,0.175652
b,C,1.0,0.307875
c,D,1.0,0.405133
d,C,1.0,0.694604


In [257]:
df.groupby(['c1','c2']).size()

c1  c2
a   A     1
    B     1
b   A     1
    B     1
    C     1
c   D     1
d   C     1
dtype: int64

In [258]:
for c1, group in df.groupby('c1'):
    print(c1)
    print(group)

a
  c1 c2  c3        c4
0  a  A   1  0.900004
1  a  B   1  0.541000
b
  c1 c2  c3        c4
2  b  B   1  0.175652
3  b  A   1  0.765344
6  b  C   1  0.307875
c
  c1 c2  c3        c4
4  c  D   1  0.405133
d
  c1 c2  c3        c4
5  d  C   1  0.694604


In [260]:
for (c1,c2), group in df.groupby(['c1','c2']):
    print((c1,c2))
    print(group)

('a', 'A')
  c1 c2  c3        c4
0  a  A   1  0.900004
('a', 'B')
  c1 c2  c3     c4
1  a  B   1  0.541
('b', 'A')
  c1 c2  c3        c4
3  b  A   1  0.765344
('b', 'B')
  c1 c2  c3        c4
2  b  B   1  0.175652
('b', 'C')
  c1 c2  c3        c4
6  b  C   1  0.307875
('c', 'D')
  c1 c2  c3        c4
4  c  D   1  0.405133
('d', 'C')
  c1 c2  c3        c4
5  d  C   1  0.694604


In [261]:
df.groupby(['c1','c2'])[['c4']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,c4
c1,c2,Unnamed: 2_level_1
a,A,0.900004
a,B,0.541
b,A,0.765344
b,B,0.175652
b,C,0.307875
c,D,0.405133
d,C,0.694604


In [262]:
df.groupby('c1')['c3'].quantile()

c1
a    1.0
b    1.0
c    1.0
d    1.0
Name: c3, dtype: float64

In [263]:
df.groupby('c1')['c3'].count()

c1
a    2
b    3
c    1
d    1
Name: c3, dtype: int64

In [264]:
df.groupby('c1')['c4'].median()

c1
a    0.720502
b    0.307875
c    0.405133
d    0.694604
Name: c4, dtype: float64

In [265]:
df.groupby('c1')['c3'].std()

c1
a    0.0
b    0.0
c    NaN
d    NaN
Name: c3, dtype: float64

In [266]:
df.groupby(['c1','c2'])['c4'].agg(['mean','min','max'])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,min,max
c1,c2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,A,0.900004,0.900004,0.900004
a,B,0.541,0.541,0.541
b,A,0.765344,0.765344,0.765344
b,B,0.175652,0.175652,0.175652
b,C,0.307875,0.307875,0.307875
c,D,0.405133,0.405133,0.405133
d,C,0.694604,0.694604,0.694604


In [267]:
df.groupby(['c1','c2'], as_index=False)['c4'].mean()

Unnamed: 0,c1,c2,c4
0,a,A,0.900004
1,a,B,0.541
2,b,A,0.765344
3,b,B,0.175652
4,b,C,0.307875
5,c,D,0.405133
6,d,C,0.694604


In [268]:
df.groupby(['c1','c2'], group_keys=False)['c4'].mean()

c1  c2
a   A     0.900004
    B     0.541000
b   A     0.765344
    B     0.175652
    C     0.307875
c   D     0.405133
d   C     0.694604
Name: c4, dtype: float64

In [269]:
def top(df, n=3, columns='c1'):
    return df.sort_values(by=columns)[-n:]

top(df,n=5)

Unnamed: 0,c1,c2,c3,c4
2,b,B,1,0.175652
3,b,A,1,0.765344
6,b,C,1,0.307875
4,c,D,1,0.405133
5,d,C,1,0.694604


In [270]:
df.groupby('c1').apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,c1,c2,c3,c4
c1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,0,a,A,1,0.900004
a,1,a,B,1,0.541
b,2,b,B,1,0.175652
b,3,b,A,1,0.765344
b,6,b,C,1,0.307875
c,4,c,D,1,0.405133
d,5,d,C,1,0.694604


### 피벗 테이블(Pivot Table)


In [279]:
df

Unnamed: 0,c1,c2,c3,c4
0,a,A,1,0.900004
1,a,B,1,0.541
2,b,B,1,0.175652
3,b,A,1,0.765344
4,c,D,1,0.405133
5,d,C,1,0.694604
6,b,C,1,0.307875


In [271]:
df.pivot_table(['c3','c4'],
              index=['c1'],
              columns=['c2'])

Unnamed: 0_level_0,c3,c3,c3,c3,c4,c4,c4,c4
c2,A,B,C,D,A,B,C,D
c1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
a,1.0,1.0,,,0.900004,0.541,,
b,1.0,1.0,1.0,,0.765344,0.175652,0.307875,
c,,,,1.0,,,,0.405133
d,,,1.0,,,,0.694604,


In [272]:
df.pivot_table(['c3','c4'],
              index=['c1'],
              columns=['c2'],
              margins=True)

Unnamed: 0_level_0,c3,c3,c3,c3,c3,c4,c4,c4,c4,c4
c2,A,B,C,D,All,A,B,C,D,All
c1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
a,1.0,1.0,,,1.0,0.900004,0.541,,,0.720502
b,1.0,1.0,1.0,,1.0,0.765344,0.175652,0.307875,,0.41629
c,,,,1.0,1.0,,,,0.405133,0.405133
d,,,1.0,,1.0,,,0.694604,,0.694604
All,1.0,1.0,1.0,1.0,1.0,0.832674,0.358326,0.50124,0.405133,0.541373


In [273]:
df.pivot_table(['c3','c4'],
              index=['c1'],
              columns=['c2'],
              margins=True,
              aggfunc=sum)



Unnamed: 0_level_0,c3,c3,c3,c3,c3,c4,c4,c4,c4,c4
c2,A,B,C,D,All,A,B,C,D,All
c1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
a,1.0,1.0,,,2,0.900004,0.541,,,1.441004
b,1.0,1.0,1.0,,3,0.765344,0.175652,0.307875,,1.248871
c,,,,1.0,1,,,,0.405133,0.405133
d,,,1.0,,1,,,0.694604,,0.694604
All,2.0,2.0,2.0,1.0,7,1.665348,0.716652,1.00248,0.405133,3.789613


In [274]:
df.pivot_table(['c3','c4'],
              index=['c1'],
              columns=['c2'],
              margins=True,
              aggfunc=sum,
              fill_value=0)

Unnamed: 0_level_0,c3,c3,c3,c3,c3,c4,c4,c4,c4,c4
c2,A,B,C,D,All,A,B,C,D,All
c1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
a,1,1,0,0,2,0.900004,0.541,0.0,0.0,1.441004
b,1,1,1,0,3,0.765344,0.175652,0.307875,0.0,1.248871
c,0,0,0,1,1,0.0,0.0,0.0,0.405133,0.405133
d,0,0,1,0,1,0.0,0.0,0.694604,0.0,0.694604
All,2,2,2,1,7,1.665348,0.716652,1.00248,0.405133,3.789613


In [276]:
pd.crosstab(df.c1,df.c2)

c2,A,B,C,D
c1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,1,1,0,0
b,1,1,1,0
c,0,0,0,1
d,0,0,1,0


In [278]:
pd.crosstab(df.c1, df.c2, values=df.c3, aggfunc=sum, margins=True)

c2,A,B,C,D,All
c1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,1.0,1.0,,,2
b,1.0,1.0,1.0,,3
c,,,,1.0,1
d,,,1.0,,1
All,2.0,2.0,2.0,1.0,7


### 범주형(Categorical) 데이터


## 문자열 연산

#### 문자열 연산자

#### 기타 연산자


#### 정규표현식


## 시계열 처리

#### 시계열 데이터 구조


### 시계열 기본

### 주기와 오프셋


### 시프트(Shift)

### 시간대 처리

* 국제표준시(Coordinated Universal Time, UTC)를 기준으로 떨어진 거리만큼 오프셋으로 시간대 처리
* 전 세계의 시간대 정보를 모아놓은 올슨 데이터베이스를 활용한 라이브러리인 `pytz` 사용

### 기간과 기간 연산

### 리샘플링(Resampling)

* 리샘플링(Resampling): 시계열의 빈도 변환
* 다운샘플링(Down sampling): 상위 빈도 데이터를 하위 빈도 데이터로 집계
* 업샘플링(Up sampling): 하위 빈도 데이터를 상위 빈도 데이터로 집계

### 무빙 윈도우(Moving Window)

## 데이터 읽기 및 저장


### 텍스트 파일 읽기/쓰기

### 이진 데이터 파일 읽기/쓰기

## 데이터 정제

### 누락값 처리

* 대부분의 실제 데이터들은 정제되지 않고 누락값들이 존재
* 서로 다른 데이터들은 다른 형태의 결측을 가짐
* 결측 데이터는 `null`, `NaN`, `NA`로 표기

#### None: 파이썬 누락 데이터

#### NaN: 누락된 수치 데이터

#### Null 값 처리


### 중복 제거

### 값 치환

## 참고문헌

* Pandas 사이트: https://pandas.pydata.org/
* Jake VanderPlas, "Python Data Science Handbook", O'Reilly
* Wes Mckinney, "Python for Data Analysis", O'Reilly