In [1]:
import numpy as np
import pandas as pd

In [2]:
# Make every bare expression in a cell produce output, not only the last one;
# later cells rely on this to show several results from a single cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [26]:
import warnings
warnings.filterwarnings('ignore')

# pandas 데이터 재구조화(reshaping)

- 피벗팅(pivoting)
- 스태킹(stacking)과 언스태킹(unstacking)
- 멜팅(melting)과 와이드투롱(wide_to_long)
- 교차표(crosstab)
- explode

## 2. 스태킹(stacking)과 언스태킹(unstacking)

: 피벗팅과 유사하지만 계층형 인덱스의 특정 수준도 회전이 가능함

- 스태킹(stacking) : column labels과 그 값을 row index와 값으로 회전시킴
- 언스태킹(unstacking) : row index와 그 값을 column labels과 값으로 회전시킴

![image.png](https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&fname=https%3A%2F%2Ft1.daumcdn.net%2Fcfile%2Ftistory%2F99BBDC48601405E621)

- 출처 : https://rfriend.tistory.com/276

### **1. 스태킹(stacking)**

![image.png](https://pandas.pydata.org/docs/_images/reshaping_stack.png)

- **DataFrame.stack(level=-1, dropna=True)**
    - level : int, str, list, default=-1
        - 스태킹을 적용하는 컬럼 레벨
        - 기본값은 마지막 레벨 : 스태킹 결과는 항상 인덱스의 마지막 레벨로 이동
    - dropna : bool, default True
        - 스태킹 결과 결측치 처리 여부, 기본값은 True로 모든 값이 결측치인 행을 제외
    - future_stack : bool
        - True이면 새로운 스태킹 구현을 사용 (이 경우 dropna는 지정할 수 없음)

- https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.stack.html

#### 예제1. single level columns를 갖는 데이터

In [5]:
# Single-level columns: one row per animal, one column per measurement.
df = pd.DataFrame(
    {'weight': [10, 12], 'height': [11, 13]},
    index=['cat', 'dog'],
)
df

Unnamed: 0,weight,height
cat,10,11
dog,12,13


- stack() : 컬럼이 인덱스 마지막 레벨로 변경 -> 시리즈 데이터로 변환

In [6]:
# Rotate the column labels into the innermost row-index level; with a single
# column level nothing is left as columns, so the result is a Series.
# future_stack=True selects the new implementation, consistent with the other
# stack() examples in this notebook (the result is the same for this frame).
df.stack(future_stack=True)

cat  weight    10
     height    11
dog  weight    12
     height    13
dtype: int64

#### 예제2. multi-level columns을 갖는 데이터1

In [7]:
# Two-level columns that share one top-level label:
# ('weight', 'kg') and ('weight', 'pounds').
mulicol = pd.MultiIndex.from_arrays([['weight', 'weight'],
                                     ['kg', 'pounds']])
df2 = pd.DataFrame(data=[[10, 11], [12, 13]],
                   columns=mulicol,
                   index=['cat', 'dog'])
df2

Unnamed: 0_level_0,weight,weight
Unnamed: 0_level_1,kg,pounds
cat,10,11
dog,12,13


- stack() : 컬럼의 마지막 레벨이 인덱스의 마지막 레벨로 이동

In [9]:
# Move the innermost column level ('kg' / 'pounds') into the row index,
# using the new stack implementation; 'weight' stays as the only column.
df2.stack(future_stack=True)

Unnamed: 0,Unnamed: 1,weight
cat,kg,10
cat,pounds,11
dog,kg,12
dog,pounds,13


#### 예제3. multi-level columns을 갖는 데이터2

In [11]:
# Two-level columns with different top-level labels:
# ('weight', 'kg') and ('height', 'm').
mulicol2 = pd.MultiIndex.from_arrays([['weight', 'height'],
                                      ['kg', 'm']])
# Data stays a row-major nested list so each column keeps its own inferred dtype.
df3 = pd.DataFrame(data=[[10, 1.1], [12, 1.3]],
                   columns=mulicol2,
                   index=['cat', 'dog'])
df3

Unnamed: 0_level_0,weight,height
Unnamed: 0_level_1,kg,m
cat,10,1.1
dog,12,1.3


- stack(level=-1)

In [14]:
# Three spellings of the same call: level defaults to -1 (the innermost
# column level), given implicitly, positionally, and by keyword.
# Labels that do not pair up (weight/m, height/kg) show up as NaN.
df3.stack(future_stack=True)
df3.stack(-1, future_stack=True)
df3.stack(level=-1, future_stack=True)

Unnamed: 0,Unnamed: 1,weight,height
cat,kg,10.0,
cat,m,,1.1
dog,kg,12.0,
dog,m,,1.3


Unnamed: 0,Unnamed: 1,weight,height
cat,kg,10.0,
cat,m,,1.1
dog,kg,12.0,
dog,m,,1.3


Unnamed: 0,Unnamed: 1,weight,height
cat,kg,10.0,
cat,m,,1.1
dog,kg,12.0,
dog,m,,1.3


- stack(0) : 컬럼의 첫번째 레벨이 인덱스의 마지막 레벨로 이동

In [16]:
# Show the original frame for comparison, then stack the outer column level
# (level 0: 'weight' / 'height'); the inner level ('kg' / 'm') stays as columns.
df3
df3.stack(level=0, future_stack=True)

Unnamed: 0_level_0,weight,height
Unnamed: 0_level_1,kg,m
cat,10,1.1
dog,12,1.3


Unnamed: 0,Unnamed: 1,kg,m
cat,weight,10.0,
cat,height,,1.1
dog,weight,12.0,
dog,height,,1.3


- stack([0,1]) : 컬럼의 두 레벨이 인덱스의 마지막 두 레벨로 이동

In [17]:
# Stack both column levels at once; no column level remains, so the result
# is a Series indexed by (animal, measurement, unit).
df3.stack(level=[0,1], future_stack=True)

cat  weight  kg    10.0
     height  m      1.1
dog  weight  kg    12.0
     height  m      1.3
dtype: float64

#### 예제4. multi-level columns을 갖는 데이터3 : 결측치를 포함하는 경우

In [18]:
# Same column layout as the previous example, but cat's weight is missing.
mulicol2 = pd.MultiIndex.from_arrays([['weight', 'height'],
                                      ['kg', 'm']])
# None in a numeric column is displayed as NaN in the resulting frame.
df4 = pd.DataFrame(data=[[None, 1.1], [12, 1.3]],
                   columns=mulicol2,
                   index=['cat', 'dog'])
df4

Unnamed: 0_level_0,weight,height
Unnamed: 0_level_1,kg,m
cat,,1.1
dog,12.0,1.3


- stack()

In [27]:
# Same frame, two implementations side by side:
# - future_stack=True keeps the (cat, kg) row even though every value in it is NaN;
# - future_stack=False (legacy) drops that all-NaN row.
df4.stack(future_stack=True)
df4.stack(future_stack=False)

Unnamed: 0,Unnamed: 1,weight,height
cat,kg,,
cat,m,,1.1
dog,kg,12.0,
dog,m,,1.3


Unnamed: 0,Unnamed: 1,weight,height
cat,m,,1.1
dog,kg,12.0,
dog,m,,1.3


- stack(dropna=True|False) : 모든 값이 결측치인 행의 제외 여부 (기존 구현, 즉 future_stack=False에서만 지정 가능)

In [28]:
# dropna=True drops rows whose stacked values are all NaN (here: cat/kg).
# dropna is only accepted by the legacy implementation, so select it
# explicitly instead of relying on the version-dependent future_stack default.
df4.stack(dropna=True, future_stack=False)

Unnamed: 0,Unnamed: 1,weight,height
cat,m,,1.1
dog,kg,12.0,
dog,m,,1.3


In [30]:
# dropna=False keeps the all-NaN cat/kg row in the stacked result.
# dropna is only accepted by the legacy implementation, so select it
# explicitly instead of relying on the version-dependent future_stack default.
df4.stack(dropna=False, future_stack=False)

Unnamed: 0,Unnamed: 1,weight,height
cat,kg,,
cat,m,,1.1
dog,kg,12.0,
dog,m,,1.3


### **2. 언스태킹(unstacking)**

![image.png](https://pandas.pydata.org/docs/_images/reshaping_unstack.png)

- **DataFrame.unstack(level=-1, fill_value=None, sort=True)**
    - level : int, str, list, default=-1
        - 언스태킹을 적용하는 인덱스 레벨
        - 기본값은 마지막 레벨 : 언스태킹 결과는 항상 컬럼의 마지막 레벨로 이동
    - fill_value : int, str or dict
        - 언스태킹 결과 생기는 결측치(NaN)를 대체할 값
    - sort : bool, default True
        - 결과 멀티인덱스 컬럼의 레벨을 정렬할지 여부

- https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.unstack.html

#### 예제5. 시리즈 데이터

In [31]:
# Two-level index covering every (outer, inner) pair: one/a, one/b, two/a, two/b.
idx = pd.MultiIndex.from_product([['one', 'two'], ['a', 'b']])
# Float values 1.0 .. 4.0, one per index entry.
s = pd.Series([1.0, 2.0, 3.0, 4.0], index=idx)
s

one  a    1.0
     b    2.0
two  a    3.0
     b    4.0
dtype: float64

- unstack() : 마지막레벨로 unstacking

In [32]:
# Four equivalent ways to unstack the innermost index level ('a' / 'b'):
# the default, -1 positionally, -1 by keyword, and its absolute position 1.
s.unstack()
s.unstack(-1)
s.unstack(level=-1)
s.unstack(level=1)

Unnamed: 0,a,b
one,1.0,2.0
two,3.0,4.0


Unnamed: 0,a,b
one,1.0,2.0
two,3.0,4.0


Unnamed: 0,a,b
one,1.0,2.0
two,3.0,4.0


Unnamed: 0,a,b
one,1.0,2.0
two,3.0,4.0


- unstack(level=0)

In [33]:
# Show s again for comparison, then pivot the outer index level
# ('one' / 'two') into columns; 'a' / 'b' remain as the row index.
s
s.unstack(level=0)

one  a    1.0
     b    2.0
two  a    3.0
     b    4.0
dtype: float64

Unnamed: 0,one,two
a,1.0,3.0
b,2.0,4.0


#### 예제6. 행이 MultiIndex를 갖는 데이터

In [36]:
# Two parallel label lists: the outer labels repeat in pairs, the inner
# labels alternate 'one' / 'two'.
lists = ['bar bar baz baz foo foo qux qux'.split(),
         'one two'.split()*4]
lists
# Zip the two lists into a two-level row index named ('first', 'second').
idx = pd.MultiIndex.from_arrays(lists, names=['first', 'second'])
# Draw the sample values from a fixed-seed generator so the table (and every
# result derived from it) is identical after Restart & Run All.
df = pd.DataFrame(np.round(np.random.default_rng(0).standard_normal((8, 2)), 2),
                 index=idx, columns=['A','B'])
df

[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
 ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.13,0.02
bar,two,0.21,-1.75
baz,one,1.58,-0.07
baz,two,-0.67,-0.58
foo,one,-0.09,0.55
foo,two,0.54,0.55
qux,one,-0.65,0.39
qux,two,-0.58,0.44


- unstack() : index의 마지막레벨이 컬럼의 마지막 레벨로 이동

In [37]:
# Default level=-1: the innermost row level ('second') becomes the innermost
# column level, under the existing 'A' / 'B' columns.
df.unstack()

Unnamed: 0_level_0,A,A,B,B
second,one,two,one,two
first,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
bar,-0.13,0.21,0.02,-1.75
baz,1.58,-0.67,-0.07,-0.58
foo,-0.09,0.54,0.55,0.55
qux,-0.65,-0.58,0.39,0.44


- unstack(0) : index의 첫번째 레벨이 컬럼의 마지막 레벨로 이동

In [38]:
# Select the level by position: level 0 ('first') moves to the columns,
# leaving 'second' as the row index.
df.unstack(0)

Unnamed: 0_level_0,A,A,A,A,B,B,B,B
first,bar,baz,foo,qux,bar,baz,foo,qux
second,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
one,-0.13,1.58,-0.09,-0.65,0.02,-0.07,0.55,0.39
two,0.21,-0.67,0.54,-0.58,-1.75,-0.58,0.55,0.44


In [39]:
# Same operation as unstacking position 0, but the level is selected by name.
df.unstack(level='first')

Unnamed: 0_level_0,A,A,A,A,B,B,B,B
first,bar,baz,foo,qux,bar,baz,foo,qux
second,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
one,-0.13,1.58,-0.09,-0.65,0.02,-0.07,0.55,0.39
two,0.21,-0.67,0.54,-0.58,-1.75,-0.58,0.55,0.44


In [41]:
# Selecting 'second' by name gives the same result as the default unstack().
df.unstack('second')

Unnamed: 0_level_0,A,A,B,B
second,one,two,one,two
first,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
bar,-0.13,0.21,0.02,-1.75
baz,1.58,-0.67,-0.07,-0.58
foo,-0.09,0.54,0.55,0.55
qux,-0.65,-0.58,0.39,0.44


![image.png](https://pandas.pydata.org/docs/_images/reshaping_unstack_0.png)

- unstack(1) : index의 두번째 레벨이 컬럼의 마지막 레벨로 이동

![image.png](https://pandas.pydata.org/docs/_images/reshaping_unstack_1.png)

#### 예제7. 행과 열이 모두 MultiIndex를 갖는 데이터

In [45]:
# Row index: every (first, second) pair from the product of the two label sets.
idx = pd.MultiIndex.from_product(['bar baz foo qux'.split(), ('one', 'two')],
                                 names=['first', 'second'])
idx
# Column index: (category, animal) pairs in the order A/cat, B/dog, B/cat, A/dog,
# so the top-level labels are not grouped together.
cols = pd.MultiIndex.from_tuples([('A', 'cat'),('B', 'dog'),('B', 'cat'),('A', 'dog')],
                                 names=['category','animal'])
# Draw the sample values from a fixed-seed generator so the table (and every
# result derived from it) is identical after Restart & Run All.
df = pd.DataFrame(np.round(np.random.default_rng(1).standard_normal((8, 4)), 2),
                 index=idx, columns=cols)
df

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

Unnamed: 0_level_0,category,A,B,B,A
Unnamed: 0_level_1,animal,cat,dog,cat,dog
first,second,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
bar,one,-0.84,-0.11,0.73,0.19
bar,two,0.86,0.79,0.35,-2.44
baz,one,-0.62,-0.72,-1.58,0.59
baz,two,0.58,-0.89,0.7,0.63
foo,one,-0.22,0.35,-0.68,-0.78
foo,two,2.56,-0.99,0.97,0.27
qux,one,0.34,0.5,-0.4,-0.17
qux,two,0.99,0.89,-0.03,-1.11


In [48]:
# Selecting a top-level column label returns the sub-frame beneath it;
# the 'category' level is dropped and only 'animal' remains.
df['B']

Unnamed: 0_level_0,animal,dog,cat
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.11,0.73
bar,two,0.79,0.35
baz,one,-0.72,-1.58
baz,two,-0.89,0.7
foo,one,0.35,-0.68
foo,two,-0.99,0.97
qux,one,0.5,-0.4
qux,two,0.89,-0.03


In [49]:
# Scalar lookup: one tuple addresses the row (first, second), the other the
# column (category, animal).
df.loc[('bar','one'),('A','cat')]

np.float64(-0.84)

In [46]:
# With MultiIndex columns, the unstacked 'second' level is added as a new
# innermost column level beneath 'category' and 'animal'.
df.unstack()

category,A,A,B,B,B,B,A,A
animal,cat,cat,dog,dog,cat,cat,dog,dog
second,one,two,one,two,one,two,one,two
first,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
bar,-0.84,0.86,-0.11,0.79,0.73,0.35,0.19,-2.44
baz,-0.62,0.58,-0.72,-0.89,-1.58,0.7,0.59,0.63
foo,-0.22,2.56,0.35,-0.99,-0.68,0.97,-0.78,0.27
qux,0.34,0.99,0.5,0.89,-0.4,-0.03,-0.17,-1.11


In [52]:
# Re-display the full frame, then cut out a sparse positional subset:
# rows (bar, one), (bar, two), (foo, one), (qux, two) and the two 'B' columns.
df
new_df = (
    df.iloc[[0, 1, 4, 7]]
      .iloc[:, [1, 2]]
)

Unnamed: 0_level_0,category,A,B,B,A
Unnamed: 0_level_1,animal,cat,dog,cat,dog
first,second,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
bar,one,-0.84,-0.11,0.73,0.19
bar,two,0.86,0.79,0.35,-2.44
baz,one,-0.62,-0.72,-1.58,0.59
baz,two,0.58,-0.89,0.7,0.63
foo,one,-0.22,0.35,-0.68,-0.78
foo,two,2.56,-0.99,0.97,0.27
qux,one,0.34,0.5,-0.4,-0.17
qux,two,0.99,0.89,-0.03,-1.11


In [54]:
# new_df does not contain every (first, second) combination, so unstacking
# leaves NaN in the cells that have no source row.
new_df
new_df.unstack()

Unnamed: 0_level_0,category,B,B
Unnamed: 0_level_1,animal,dog,cat
first,second,Unnamed: 2_level_2,Unnamed: 3_level_2
bar,one,-0.11,0.73
bar,two,0.79,0.35
foo,one,0.35,-0.68
qux,two,0.89,-0.03


category,B,B,B,B
animal,dog,dog,cat,cat
second,one,two,one,two
first,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
bar,-0.11,0.79,0.73,0.35
foo,0.35,,-0.68,
qux,,0.89,,-0.03


- unstack( , **fill_value=**)

In [55]:
# Fill the cells created for missing (first, second) pairs with 0 instead of NaN.
new_df.unstack(fill_value=0)

category,B,B,B,B
animal,dog,dog,cat,cat
second,one,two,one,two
first,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
bar,-0.11,0.79,0.73,0.35
foo,0.35,0.0,-0.68,0.0
qux,0.0,0.89,0.0,-0.03


----