In [1]:
import numpy as np
import pandas as pd

In [2]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one;
# cells below rely on this to show several results at once.
InteractiveShell.ast_node_interactivity = 'all'

# pandas 데이터 재구조화(reshaping)

- 피벗팅(pivoting)
- 스태킹(stacking)과 언스태킹(unstacking)
- 멜팅(melting)과 와이드투롱(wide_to_long)
- 교차표(crosstab)
- explode

## 3. 멜팅(melting)

- 식별자 변수로 사용될 id를 기준으로 원래 데이터셋에 있던 여러 개의 컬럼 이름을 'variable' 컬럼에 위에서 아래로 길게 쌓아놓고, 'value' 컬럼에 id와 variable에 해당하는 값을 넣어주는 방식으로 데이터를 재구조화

![image.png](https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&fname=https%3A%2F%2Ft1.daumcdn.net%2Fcfile%2Ftistory%2F25177F4E5863D58A0C)

- 출처 : https://rfriend.tistory.com/278

### **melt()**

DataFrame.**melt(id_vars=None, value_vars=None, var_name=None, value_name='value'**, col_level=None, ignore_index=True)

- id_vars : identifier variables로 사용될 컬럼(들)
- value_vars : unpivot할 컬럼(들). 지정하지 않는 경우 id_vars에 지정하지 않은 모든 컬럼(들)이 됨
- var_name : 기본값은 'None'. 'variable' 컬럼의 이름, 지정하지 않는 경우 frame.columns.name 또는 'variable'
- value_name : 기본값은 'value'. 'value'컬럼의 이름
- col_level : 컬럼들이 멀티인덱스인 경우 melt할 레벨
- ignore_index :  기본값은 True. True인 경우 원래 인덱스 무시하나 False인 경우 원래 인덱스 유지함

https://pandas.pydata.org/docs/reference/api/pandas.melt.html

![image.png](https://pandas.pydata.org/pandas-docs/stable/_images/reshaping_melt.png)

#### 예제 데이터1

In [8]:
# Two-person sample frame: two name columns plus numeric height and weight.
df1 = pd.DataFrame(
    dict(
        first=['John', 'Mary'],
        last=['Doe', 'Bo'],
        height=[5.5, 6.0],
        weight=[130, 150],
    )
)
df1

Unnamed: 0,first,last,height,weight
0,John,Doe,5.5,130
1,Mary,Bo,6.0,150


- melt() : id_vars를 지정하지 않으면 모든 컬럼이 value_vars로 사용됨

In [9]:
# No id_vars given: every column is unpivoted into (variable, value) rows.
df1.melt()

Unnamed: 0,variable,value
0,first,John
1,first,Mary
2,last,Doe
3,last,Bo
4,height,5.5
5,height,6.0
6,weight,130
7,weight,150


- melt(id_vars=[ ])

In [10]:
# Keep 'first' and 'last' as identifiers; the remaining columns (height, weight) are unpivoted.
df1.melt(id_vars=['first','last'])

Unnamed: 0,first,last,variable,value
0,John,Doe,height,5.5
1,Mary,Bo,height,6.0
2,John,Doe,weight,130.0
3,Mary,Bo,weight,150.0


In [11]:
# Only 'first' is an identifier, so 'last' is unpivoted together with height and weight.
df1.melt(id_vars=['first'])

Unnamed: 0,first,variable,value
0,John,last,Doe
1,Mary,last,Bo
2,John,height,5.5
3,Mary,height,6.0
4,John,weight,130
5,Mary,weight,150


- melt(id_vars=[ ], **var_name=**)

In [12]:
# var_name renames the default 'variable' column to 'body'.
df1.melt(id_vars=['first','last'], var_name='body')

Unnamed: 0,first,last,body,value
0,John,Doe,height,5.5
1,Mary,Bo,height,6.0
2,John,Doe,weight,130.0
3,Mary,Bo,weight,150.0


- melt(id_vars=[ ], var_name= , **value_name=** )

In [13]:
# value_name renames the default 'value' column to 'data'.
df1.melt(id_vars=['first','last'], value_name='data')

Unnamed: 0,first,last,variable,data
0,John,Doe,height,5.5
1,Mary,Bo,height,6.0
2,John,Doe,weight,130.0
3,Mary,Bo,weight,150.0


In [14]:
# Rename both generated columns: 'variable' -> 'body' and 'value' -> 'data'.
df1.melt(id_vars=['first','last'], var_name='body', value_name='data')

Unnamed: 0,first,last,body,data
0,John,Doe,height,5.5
1,Mary,Bo,height,6.0
2,John,Doe,weight,130.0
3,Mary,Bo,weight,150.0


- melt(id_vars=[ ], **value_vars=** )

In [16]:
# value_vars limits unpivoting to 'height'; 'last' and 'weight' do not appear in the result.
df1.melt(id_vars=['first'], value_vars=['height'])

Unnamed: 0,first,variable,value
0,John,height,5.5
1,Mary,height,6.0


In [18]:
# Unpivot height and weight, identified by 'first' only ('last' is left out).
df1.melt(id_vars=['first'], value_vars=['height','weight'])

Unnamed: 0,first,variable,value
0,John,height,5.5
1,Mary,height,6.0
2,John,weight,130.0
3,Mary,weight,150.0


In [19]:
# Same result as melt(id_vars=['first','last']): height and weight are the only non-id columns.
df1.melt(id_vars=['first','last'], value_vars=['height','weight'])

Unnamed: 0,first,last,variable,value
0,John,Doe,height,5.5
1,Mary,Bo,height,6.0
2,John,Doe,weight,130.0
3,Mary,Bo,weight,150.0


In [20]:
# Identify rows by 'last' only; 'first' is left out of the result.
df1.melt(id_vars=['last'], value_vars=['height','weight'])

Unnamed: 0,last,variable,value
0,Doe,height,5.5
1,Bo,height,6.0
2,Doe,weight,130.0
3,Bo,weight,150.0


In [21]:
# Combine id_vars, value_vars and custom names for the two generated columns.
df1.melt(id_vars=['last'], value_vars=['height','weight'], var_name='body', value_name='data')

Unnamed: 0,last,body,data
0,Doe,height,5.5
1,Bo,height,6.0
2,Doe,weight,130.0
3,Bo,weight,150.0


#### 예제 데이터2

In [22]:
# Two-level row index: ('Person', 'A') and ('Person', 'B').
index = pd.MultiIndex.from_arrays([['Person', 'Person'], ['A', 'B']])

# Same columns as df1, but labelled with the MultiIndex built above.
df2 = pd.DataFrame(
    dict(
        first=['John', 'Mary'],
        last=['Doe', 'Bo'],
        height=[5.5, 6.0],
        weight=[130, 150],
    ),
    index=index,
)
df2

Unnamed: 0,Unnamed: 1,first,last,height,weight
Person,A,John,Doe,5.5,130
Person,B,Mary,Bo,6.0,150


In [24]:
# With the default ignore_index=True the MultiIndex is discarded and rows are renumbered from 0.
df2.melt()

Unnamed: 0,variable,value
0,first,John
1,first,Mary
2,last,Doe
3,last,Bo
4,height,5.5
5,height,6.0
6,weight,130
7,weight,150


In [25]:
# id_vars behaves the same on a MultiIndex-ed frame; the original index is still dropped.
df2.melt(id_vars=['first','last'])

Unnamed: 0,first,last,variable,value
0,John,Doe,height,5.5
1,Mary,Bo,height,6.0
2,John,Doe,weight,130.0
3,Mary,Bo,weight,150.0


- melt(id_vars=[ ], **ignore_index=True | False** )

In [26]:
# Spelling out ignore_index=True gives the same result as df2.melt(): the index is reset.
df2.melt(ignore_index=True)

Unnamed: 0,variable,value
0,first,John
1,first,Mary
2,last,Doe
3,last,Bo
4,height,5.5
5,height,6.0
6,weight,130
7,weight,150


In [27]:
# Identifiers kept as columns, original index replaced by 0..n-1.
df2.melt(id_vars=['first','last'], ignore_index=True)

Unnamed: 0,first,last,variable,value
0,John,Doe,height,5.5
1,Mary,Bo,height,6.0
2,John,Doe,weight,130.0
3,Mary,Bo,weight,150.0


In [28]:
# ignore_index=False keeps the original ('Person', 'A'/'B') labels, repeated once per melted variable.
df2.melt(id_vars=['first','last'], ignore_index=False)

Unnamed: 0,Unnamed: 1,first,last,variable,value
Person,A,John,Doe,height,5.5
Person,B,Mary,Bo,height,6.0
Person,A,John,Doe,weight,130.0
Person,B,Mary,Bo,weight,150.0


In [29]:
# All columns are unpivoted while every row keeps its original MultiIndex label.
df2.melt(ignore_index=False)

Unnamed: 0,Unnamed: 1,variable,value
Person,A,first,John
Person,B,first,Mary
Person,A,last,Doe
Person,B,last,Bo
Person,A,height,5.5
Person,B,height,6.0
Person,A,weight,130
Person,B,weight,150


In [30]:
# id_vars, value_vars and ignore_index=False combined: index labels are preserved, 'first' is left out.
df2.melt(id_vars=['last'], value_vars=['height','weight'], ignore_index=False)

Unnamed: 0,Unnamed: 1,last,variable,value
Person,A,Doe,height,5.5
Person,B,Bo,height,6.0
Person,A,Doe,weight,130.0
Person,B,Bo,weight,150.0


## 4. Wide-to-long

- melt()와 유사함
- 컬럼 매칭을 위한 사용자 조작이 가능

![image.png](https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&fname=https%3A%2F%2Ft1.daumcdn.net%2Fcfile%2Ftistory%2F23592A3C58662F7831)

- 출처 : https://rfriend.tistory.com/279

### **wide_to_long()**

pandas.wide_to_long(df, stubnames, i, j, sep='', suffix='\\d+')

- df : dataframe
- stubnames : stub 이름(들), wide 포맷 변수들은 stub 이름으로 시작할 것으로 가정함
- i : id 변수(들)로 사용될 컬럼(들)
- j : sub-observation 변수 이름. long 포맷에서 suffix 값이 들어갈 컬럼(인덱스 레벨)의 이름
- sep : wide포맷에서 변수 이름 구분을 위해 사용하는 문자
- suffix : wide 포맷 컬럼 이름에서 suffix로 인식할 부분을 나타내는 정규표현식. 기본값 '\\d+'는 숫자로 된 suffix만 인식함
  
https://pandas.pydata.org/docs/reference/api/pandas.wide_to_long.html#pandas.wide_to_long

#### 예제 데이터1

In [34]:
# Fix the global NumPy seed so the random 'X' column is reproducible.
np.random.seed(123)

# Every column is a {row_label: value} mapping keyed 0..2.
# 'A<year>' / 'B<year>' are the wide columns; 'X' carries no year suffix.
df = pd.DataFrame(
    {
        'A1970': dict(enumerate(['a', 'b', 'c'])),
        'A1980': dict(enumerate(['d', 'e', 'f'])),
        'B1970': dict(enumerate([2.5, 1.2, 0.7])),
        'B1980': dict(enumerate([3.2, 1.3, 0.1])),
        'X': dict(enumerate(np.random.randn(3))),
    }
)
# Expose the row labels as an explicit identifier column.
df['id'] = df.index
df

Unnamed: 0,A1970,A1980,B1970,B1980,X,id
0,a,d,2.5,3.2,-1.085631,0
1,b,e,1.2,1.3,0.997345,1
2,c,f,0.7,0.1,0.282978,2


In [38]:
# Columns starting with stub 'A' or 'B' are stacked; their numeric suffix (1970/1980)
# becomes the 'year' index level. 'X' matches no stub and is repeated for each year.
result = pd.wide_to_long(df, ['A','B'], i='id', j='year')
result

Unnamed: 0_level_0,Unnamed: 1_level_0,X,A,B
id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1970,-1.085631,a,2.5
1,1970,0.997345,b,1.2
2,1970,0.282978,c,0.7
0,1980,-1.085631,d,3.2
1,1980,0.997345,e,1.3
2,1980,0.282978,f,0.1


In [39]:
# The long frame is indexed by an (id, year) MultiIndex.
result.index

MultiIndex([(0, 1970),
            (1, 1970),
            (2, 1970),
            (0, 1980),
            (1, 1980),
            (2, 1980)],
           names=['id', 'year'])

#### 예제 데이터2

In [48]:
# Three families with three children each; ht1/ht2 are two height measurements per child.
df = pd.DataFrame(
    dict(
        fam_id=[1] * 3 + [2] * 3 + [3] * 3,
        birth=[1, 2, 3] * 3,
        ht1=[2.8, 2.9, 2.2, 2.0, 1.8, 1.9, 2.2, 2.3, 2.1],
        ht2=[3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9],
    )
)
df

Unnamed: 0,fam_id,birth,ht1,ht2
0,1,1,2.8,3.4
1,1,2,2.9,3.8
2,1,3,2.2,2.9
3,2,1,2.0,3.2
4,2,2,1.8,2.8
5,2,3,1.9,2.4
6,3,1,2.2,3.3
7,3,2,2.3,3.4
8,3,3,2.1,2.9


In [51]:
# ht1/ht2 share the stub 'ht'; the suffix (1 or 2) becomes the 'age' level
# beneath the (fam_id, birth) identifier levels.
result = pd.wide_to_long(df, stubnames='ht', i=['fam_id', 'birth'], j='age')
result

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ht
fam_id,birth,age,Unnamed: 3_level_1
1,1,1,2.8
1,1,2,3.4
1,2,1,2.9
1,2,2,3.8
1,3,1,2.2
1,3,2,2.9
2,1,1,2.0
2,1,2,3.2
2,2,1,1.8
2,2,2,2.8


In [57]:
# Move the innermost index level ('age') back into the columns, giving ('ht', 1) and ('ht', 2).
u = result.unstack()
u

Unnamed: 0_level_0,Unnamed: 1_level_0,ht,ht
Unnamed: 0_level_1,age,1,2
fam_id,birth,Unnamed: 2_level_2,Unnamed: 3_level_2
1,1,2.8,3.4
1,2,2.9,3.8
1,3,2.2,2.9
2,1,2.0,3.2
2,2,1.8,2.8
2,3,1.9,2.4
3,1,2.2,3.3
3,2,2.3,3.4
3,3,2.1,2.9


In [60]:
# Flatten the two-level columns by joining each (stub, age) tuple into one label, e.g. ('ht', 1) -> 'ht1'.
u.columns = u.columns.map('{0[0]}{0[1]}'.format)
u

Unnamed: 0_level_0,Unnamed: 1_level_0,ht1,ht2
fam_id,birth,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,2.8,3.4
1,2,2.9,3.8
1,3,2.2,2.9
2,1,2.0,3.2
2,2,1.8,2.8
2,3,1.9,2.4
3,1,2.2,3.3
3,2,2.3,3.4
3,3,2.1,2.9


In [61]:
# Turn the (fam_id, birth) index levels back into columns, recovering the original wide layout.
u.reset_index()

Unnamed: 0,fam_id,birth,ht1,ht2
0,1,1,2.8,3.4
1,1,2,2.9,3.8
2,1,3,2.2,2.9
3,2,1,2.0,3.2
4,2,2,1.8,2.8
5,2,3,1.9,2.4
6,3,1,2.2,3.3
7,3,2,2.3,3.4
8,3,3,2.1,2.9


#### 예제 데이터3

In [62]:
# Seed NumPy's global RNG so the random columns are reproducible.
np.random.seed(3)
# Column labels follow '<stub>-<year>'. The random draws are consumed in the order the
# dict entries are written, so reordering the keys would change the values.
# NOTE(review): 'weekley' looks like a misspelling of 'weekly'; left unchanged because
# the stubnames in the following cell and the recorded outputs use the same labels.
df=pd.DataFrame({'A(weekley)-2010': np.random.rand(3),
                 'A(weekley)-2011': np.random.rand(3),
                 'B(weekley)-2010': np.random.rand(3),
                 'B(weekley)-2011': np.random.rand(3),
                'X': np.random.randint(3, size=3)
                })
# Expose the row labels as an explicit identifier column.
df['id'] = df.index
df

Unnamed: 0,A(weekley)-2010,A(weekley)-2011,B(weekley)-2010,B(weekley)-2011,X,id
0,0.550798,0.510828,0.125585,0.44081,0,0
1,0.708148,0.892947,0.207243,0.029876,2,1
2,0.290905,0.896293,0.051467,0.456833,1,2


In [65]:
# sep='-' declares that each stub and its year suffix are joined by a hyphen.
pd.wide_to_long(df, stubnames=['A(weekley)','B(weekley)'], i='id', j='year', sep='-')

Unnamed: 0_level_0,Unnamed: 1_level_0,X,A(weekley),B(weekley)
id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2010,0,0.550798,0.125585
1,2010,2,0.708148,0.207243
2,2010,1,0.290905,0.051467
0,2011,0,0.510828,0.44081
1,2011,2,0.892947,0.029876
2,2011,1,0.896293,0.456833


#### 예제 데이터4. suffix가 정수가 아닌 경우

In [91]:
# One row per student; the score columns are named '<subject>_score'.
df = pd.DataFrame(
    [
        [1, 'Alice', 90, 80],
        [2, 'Bob', 70, 100],
    ],
    columns=['id', 'name', 'math_score', 'eng_score'],
)
df

Unnamed: 0,id,name,math_score,eng_score
0,1,Alice,90,80
1,2,Bob,70,100


In [92]:
# suffix is a regex for the text after '<stub><sep>'; the literal pattern 'score' matches,
# so 'score' becomes the only value of the new index level.
# NOTE(review): the level is named 'subject' but holds 'score' while the subjects stay as
# columns — confirm this is the intended illustration.
pd.wide_to_long(df, stubnames=['math', 'eng'], i='id', j='subject', sep='_', suffix='score')

Unnamed: 0_level_0,Unnamed: 1_level_0,name,math,eng
id,subject,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,score,Alice,90,80
2,score,Bob,70,100


In [96]:
# Nine rows (3 ids x 3 births); the score columns carry word suffixes 'one' / 'two'.
df = pd.DataFrame(
    dict(
        id=[group for group in (1, 2, 3) for _ in range(3)],
        birth=[order for _ in range(3) for order in (1, 2, 3)],
        score_one=[2.8, 3.1, 4.2, 3.1, 1.8, 1.9, 2.2, 2.3, 2.1],
        score_two=[3.4, 5.1, 4.3, 4.5, 4.7, 4.8, 4.2, 3.3, 3.9],
    )
)
df

Unnamed: 0,id,birth,score_one,score_two
0,1,1,2.8,3.4
1,1,2,3.1,5.1
2,1,3,4.2,4.3
3,2,1,3.1,4.5
4,2,2,1.8,4.7
5,2,3,1.9,4.8
6,3,1,2.2,4.2
7,3,2,2.3,3.3
8,3,3,2.1,3.9


In [94]:
pd.wide_to_long(df, stubnames='score', i=['id', 'birth'], j='nth', sep='_', suffix=r'\w+')
# \w matches one word character, i.e. [0-9a-zA-Z_] (digits, letters and underscore),
# so r'\w+' accepts the non-numeric suffixes 'one' and 'two'.

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,score
id,birth,nth,Unnamed: 3_level_1
1,1,one,2.8
1,1,two,3.4
1,2,one,3.1
1,2,two,5.1
1,3,one,4.2
1,3,two,4.3
2,1,one,3.1
2,1,two,4.5
2,2,one,1.8
2,2,two,4.7


pd.wide_to_long(df, stubnames, i, j, sep, suffix)
- df: 데이터프레임
- stubnames : 열이름의 공통 접두사(stub), 여러개 지정 가능(리스트로)
- i : 각 행을 고유하게 식별하는 열 (보통 id, index)
- j : stub 뒤에 따라 붙는 값을 새로운 열로 바꾸는 '길쭉한 인덱스 컬럼 이름'
- sep : stub과 그 뒤쪽을 구분하는 구분자 ('_', '-', '',...)
- suffix : stub 뒤에 붙은 부분의 정규표현식 패턴(r'\w+' : 영문자·숫자·밑줄(`[0-9a-zA-Z_]`)로 이루어진 단어, '.+' : 임의의 문자열, r'\d+' : 숫자)

----

## 5. 교차표 crosstab

- 2개 이상의 요인을 위한 교차표(cross tabulation) 계산

![image.png](https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&fname=https%3A%2F%2Ft1.daumcdn.net%2Fcfile%2Ftistory%2F274D52335866360E16)

### **crosstab()**

pandas.crosstab(index, columns, values=None, rownames=None, colnames=None, aggfunc=None, margins=False, margins_name='All', dropna=True, normalize=False)

- index : 행
- columns : 열
- values : factor에 의해 집계할 값들. 지정하는 경우 aggfunc도 함께 지정해야 함
- aggfunc : 계산 함수
- margins : 행/열 소계
- dropna : 모두 결측치를 갖는 컬럼은 포함하지 않음

https://pandas.pydata.org/docs/reference/api/pandas.crosstab.html

#### 예제 데이터1. 일차원 데이터들을 이용한 교차표

In [99]:
# Three parallel 11-element label arrays (object dtype) used as crosstab factors.
a = np.array('foo foo foo foo bar bar bar bar foo foo foo'.split(), dtype=object)
b = np.array('one one one two one two one two two two one'.split(), dtype=object)
c = np.array(
    'dull dull shiny dull dull shiny dull shiny dull shiny shiny'.split(),
    dtype=object,
)
a
b
c

array(['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', 'foo',
       'foo', 'foo'], dtype=object)

array(['one', 'one', 'one', 'two', 'one', 'two', 'one', 'two', 'two',
       'two', 'one'], dtype=object)

array(['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'dull', 'shiny',
       'dull', 'shiny', 'shiny'], dtype=object)

- pd.**crosstab**()

In [100]:
# Count co-occurrences of a (rows) against the pair (b, c) (two-level columns);
# unnamed inputs get the default axis names row_0 / col_0 / col_1.
pd.crosstab(a, [b,c])

col_0,one,one,two,two
col_1,dull,shiny,dull,shiny
row_0,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
bar,2,0,0,2
foo,2,2,2,1


- pd.crosstab(**index=, columns=**)

In [103]:
# Same table, with the row and column factors passed by keyword.
pd.crosstab(index=a, columns=[b,c])

col_0,one,one,two,two
col_1,dull,shiny,dull,shiny
row_0,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
bar,2,0,0,2
foo,2,2,2,1


- pd.crosstab(, **rownames=, colnames=**, )

In [104]:
# rownames / colnames replace the default row_0 / col_0 / col_1 axis names.
pd.crosstab(index=a, columns=[b,c], rownames=['a'], colnames=['b','c'])

b,one,one,two,two
c,dull,shiny,dull,shiny
a,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
bar,2,0,0,2
foo,2,2,2,1


#### 예제 데이터2. 범주형(Categorical) 데이터를 이용한 교차표 생성

In [108]:
# Categoricals whose declared categories include values that never occur ('c' and 'f').
A = pd.Categorical(list('abba'), categories=list('abc'))
B = pd.Categorical(list('dded'), categories=list('def'))
A
B
type(A)
# Integer position of each element within the categories list.
A.codes

['a', 'b', 'b', 'a']
Categories (3, object): ['a', 'b', 'c']

['d', 'd', 'e', 'd']
Categories (3, object): ['d', 'e', 'f']

pandas.core.arrays.categorical.Categorical

array([0, 1, 1, 0], dtype=int8)

- pd.crosstab()

In [109]:
# By default, categories that never occur ('c' and 'f') are left out of the table.
pd.crosstab(A,B)

col_0,d,e
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
a,2,0
b,1,1


- pd.crosstab(, **dropna=False**)

In [110]:
# dropna=False keeps the unused categories, shown as all-zero rows/columns.
pd.crosstab(A,B, dropna=False)

col_0,d,e,f
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,2,0,0
b,1,1,0
c,0,0,0


#### 예제 데이터3. 데이터 프레임을 사용한 교차표 생성

In [112]:
# Small frame with two integer factors (A, B) and a value column C containing one NaN.
df = pd.DataFrame(
    dict(
        A=[1, 2, 2, 2, 2],
        B=[3, 3, 4, 4, 4],
        C=[1, 1, np.nan, 1, 1],
    )
)
df

Unnamed: 0,A,B,C
0,1,3,1.0
1,2,3,1.0
2,2,4,
3,2,4,1.0
4,2,4,1.0


- pd.crosstab()

In [114]:
# Positional and keyword forms of the same call; both tables are echoed and are identical.
# Axis names are taken from the Series names (A, B).
pd.crosstab(df.A, df.B)
pd.crosstab(index=df.A, columns=df.B)

B,3,4
A,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,0
2,1,3


B,3,4
A,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,0
2,1,3


In [115]:
# The row where C is NaN is not counted: A=2 shows 3 although A=2 occurs four times.
pd.crosstab(df.A, df.C)

C,1.0
A,Unnamed: 1_level_1
1,1
2,3


- pd.crosstab(, **normalize=True**)

In [116]:
# normalize=True divides every count by the grand total (5 rows), so the cells sum to 1.
pd.crosstab(index=df.A, columns=df.B, normalize=True)

B,3,4
A,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.2,0.0
2,0.2,0.6


- pd.crosstab(**margins=**)

In [117]:
# margins=True appends an 'All' row and column holding the row/column totals.
pd.crosstab(index=df.A, columns=df.B, normalize=True, margins=True)

B,3,4,All
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.2,0.0,0.2
2,0.2,0.6,0.8
All,0.4,0.6,1.0


- pd.crosstab(**values= , aggfunc=**)

In [121]:
# values + aggfunc: sum C within each (A, B) cell instead of counting rows;
# the (A=1, B=4) combination has no rows and shows as NaN.
pd.crosstab(index=df.A, columns=df.B, values=df.C, aggfunc='sum')

B,3,4
A,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.0,
2,1.0,2.0


In [122]:
# Same layout, aggregating C with the mean instead of the sum.
pd.crosstab(index=df.A, columns=df.B, values=df.C, aggfunc='mean')

B,3,4
A,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.0,
2,1.0,1.0


In [124]:
# With margins=True the 'All' row and column hold the same aggregate (sum of C) over each row/column.
pd.crosstab(index=df.A, columns=df.B, values=df.C, aggfunc='sum', margins=True)

B,3,4,All
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1.0,,1.0
2,1.0,2.0,3.0
All,2.0,2.0,4.0


## 6. explode

- 데이터프레임의 컬럼 내에 리스트와 같은 값들을 갖는 경우 컬럼의 리스트 요소들을 행으로 분리
- 리스트와 같은 요소를 시리즈의 값으로 갖는 경우 행으로 분리

### **explode()**
- Series.explode(ignore_index=False)
- DataFrame.explode(column, ignore_index=False)
- https://pandas.pydata.org/docs/reference/api/pandas.Series.explode.html
- https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.explode.html

#### 예제 데이터1. 컬럼의 값으로 리스트를 갖는 데이터프레임

In [125]:
# One label per row, and a two-item list of actions for each label.
keys = [f'panda{n}' for n in range(1, 4)]
values = [pair.split() for pair in ('eats shoots', 'shoots levels', 'eats leaves')]
# The 'values' column holds Python lists, which explode() can later split into rows.
df = pd.DataFrame(dict(keys=keys, values=values))
df

Unnamed: 0,keys,values
0,panda1,"[eats, shoots]"
1,panda2,"[shoots, levels]"
2,panda3,"[eats, leaves]"


- 리스트를 값으로 갖는 컬럼을 여러 컬럼으로 분할하여 구성

In [130]:
# Spread each two-item list in 'values' across two new columns.
# Building one frame from the raw lists avoids .apply(pd.Series), which constructs a
# separate Series for every row and is far slower on large frames.
# index=df.index keeps the new columns aligned with the existing rows.
df[['acts1', 'acts2']] = pd.DataFrame(df['values'].tolist(), index=df.index)
df
# Remove the helper columns again so the explode() example works on the original two columns.
df = df.drop(columns=['acts1', 'acts2'])

Unnamed: 0,keys,values,acts1,acts2
0,panda1,"[eats, shoots]",eats,shoots
1,panda2,"[shoots, levels]",shoots,levels
2,panda3,"[eats, leaves]",eats,leaves


- explode() 적용

In [131]:
# Each list element gets its own row; 'keys' and the original index label are repeated.
df.explode('values')

Unnamed: 0,keys,values
0,panda1,eats
0,panda1,shoots
1,panda2,shoots
1,panda2,levels
2,panda3,eats
2,panda3,leaves


#### 예제 데이터2. 시리즈의 행 요소가 리스트를 포함하는 경우

In [132]:
# Object Series mixing lists, a plain string and an empty list.
s = pd.Series(
    data=[[1, 2, 3], 'foo', [], list('ab')],
)
s

0    [1, 2, 3]
1          foo
2           []
3       [a, b]
dtype: object

In [134]:
# Lists expand to one row per element (index label repeated); the scalar 'foo' is kept
# unchanged and the empty list becomes a single NaN row.
s.explode()

0      1
0      2
0      3
1    foo
2    NaN
3      a
3      b
dtype: object

#### 예제 데이터3. 콤마로 구분된 문자열을 값으로 갖는 데이터프레임

In [135]:
# 'var1' holds comma-separated strings (not lists); 'var2' is a plain integer column.
df = pd.DataFrame(
    {
        'var1': ['a,b,c', 'd,e,f'],
        'var2': [1, 2],
    }
)
df

Unnamed: 0,var1,var2
0,"a,b,c",1
1,"d,e,f",2


In [138]:
# explode() leaves the frame unchanged here: 'a,b,c' is a single string, not a list-like.
df.explode(column='var1')

Unnamed: 0,var1,var2
0,"a,b,c",1
1,"d,e,f",2


**참고. df.assign()**
- 데이터프레임에 새로운 컬럼을 할당
- https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.assign.html

In [141]:
# Turn each comma-separated string into a list so that explode() has something to split.
df['var1'] = df['var1'].str.split(',')
df
# One row per list element; 'var2' and the index label are repeated.
df.explode('var1')

Unnamed: 0,var1,var2
0,"[a, b, c]",1
1,"[d, e, f]",2


Unnamed: 0,var1,var2
0,a,1
0,b,1
0,c,1
1,d,2
1,e,2
1,f,2


In [145]:
# Rebuild the comma-separated sample frame.
df = pd.DataFrame(
    {
        'var1': ['a,b,c', 'd,e,f'],
        'var2': [1, 2],
    }
)
df
# assign() returns a new frame and leaves df itself untouched:
# reusing an existing name replaces that column in the copy ...
df.assign(var1=df['var1'].str.split(','))
# ... while a new name appends a column.
df.assign(var3=df['var1'].str.split(','))

Unnamed: 0,var1,var2
0,"a,b,c",1
1,"d,e,f",2


Unnamed: 0,var1,var2
0,"[a, b, c]",1
1,"[d, e, f]",2


Unnamed: 0,var1,var2,var3
0,"a,b,c",1,"[a, b, c]"
1,"d,e,f",2,"[d, e, f]"


In [147]:
# Celsius temperatures for two cities, indexed by city name.
df = pd.DataFrame(
    [17.0, 21.0],
    index=['Seoul', 'Suwon'],
    columns=['temp_C'],
)
df

Unnamed: 0,temp_C
Seoul,17.0
Suwon,21.0


In [149]:
# assign() accepts a callable, which receives the DataFrame; here it derives the
# Fahrenheit column from temp_C (F = C * 9 / 5 + 32).
df = df.assign(temp_f=lambda x:x.temp_C * 9 / 5 + 32)
df

Unnamed: 0,temp_C,temp_f
Seoul,17.0,62.6
Suwon,21.0,69.8


----