# Concatenation

In [27]:
import pandas as pd
import numpy as np

In [10]:
# First sample Series: two integers labelled 'a' and 'b'.
s1 = pd.Series(data=[0, 1], index=list("ab"))

In [11]:
# Second sample Series: labels 'c'..'e', disjoint from s1's labels.
s2 = pd.Series(data=[2, 3, 4], index=list("cde"))

In [12]:
# Third sample Series: labels 'f' and 'g'.
s3 = pd.Series(data=[5, 6], index=list("fg"))

In [13]:
# Stack the three Series end to end along the row axis (the default axis).
pd.concat([s1, s2, s3], axis=0)

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [15]:
# Put each Series in its own column; labels a Series does not have become NaN.
pd.concat([s1, s2, s3], axis="columns")

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [20]:
# Scale s1 by 5, then append s3 underneath it.
s4 = pd.concat([s1.mul(5), s3])
s4

a    0
b    5
f    5
g    6
dtype: int64

In [22]:
# Column-wise concat aligns on labels: s1 has no 'f'/'g', so those cells are NaN.
pd.concat([s1, s4], axis="columns")

Unnamed: 0,0,1
a,0.0,0
b,1.0,5
f,,5
g,,6


In [24]:
# With axis="columns", `keys` supplies the resulting column names.
pd.concat([s1, s2, s3], axis="columns", keys=["one", "two", "three"])

Unnamed: 0,one,two,three
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [39]:
# 3x2 frame holding 0..5, rows labelled a/b/c.
df1 = pd.DataFrame(
    np.arange(6).reshape(3, 2),
    index=list("abc"),
    columns=["one", "two"],
)
df1

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


In [43]:
# 2x2 frame holding 5..8; its rows ('a', 'c') are a subset of df1's rows.
df2 = pd.DataFrame(
    np.arange(5, 9).reshape(2, 2),
    index=["a", "c"],
    columns=["three", "four"],
)
df2

Unnamed: 0,three,four
a,5,6
c,7,8


In [68]:
# Side-by-side concat aligned on row labels; df2 has no row 'b', so it is NaN there.
pd.concat([df1, df2], axis="columns")

Unnamed: 0,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


Ignoring index: `ignore_index=True` 를 지정하면 각 블록의 기존 인덱스를 버리고, 결합된 결과에 0부터 순차적으로 증가하는 새 인덱스를 부여한다. 지정하지 않으면 아래 첫 번째 예처럼 원래 인덱스가 그대로 유지되어 중복될 수 있다.

In [73]:
# Draw the sample values from a seeded, cell-local generator so that the frames
# (and the outputs shown below them) are identical on every Restart & Run All.
rng = np.random.RandomState(73)

# df4 has no 'c' column and lists its columns in a different order; concat
# aligns on column labels and fills the missing 'c' values with NaN.
df3 = pd.DataFrame(rng.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df4 = pd.DataFrame(rng.randn(2, 3), columns=['b', 'd', 'a'])
pd.concat([df3, df4])

Unnamed: 0,a,b,c,d
0,-0.210145,1.405218,0.303763,-0.839368
1,0.724568,-1.065694,0.45687,1.54634
2,-0.571511,-0.598863,-0.037409,-0.37187
0,-0.109672,-0.394926,,-0.633887
1,-0.970624,2.336003,,-0.714115


In [75]:
# Drop the original row labels and renumber the combined rows 0..n-1.
pd.concat([df3, df4], axis=0, ignore_index=True)

Unnamed: 0,a,b,c,d
0,-0.210145,1.405218,0.303763,-0.839368
1,0.724568,-1.065694,0.45687,1.54634
2,-0.571511,-0.598863,-0.037409,-0.37187
3,-0.109672,-0.394926,,-0.633887
4,-0.970624,2.336003,,-0.714115


계층적 인덱싱(멀티 인덱싱, 멀티 컬럼)

In [107]:
# Seeded, cell-local generator: the ten values are the same on every run.
rng = np.random.RandomState(107)

# Passing two parallel label lists as `index` builds a two-level MultiIndex.
# Note that the pair ('c', 2) appears twice in the labels.
s = pd.Series(
    rng.randn(10),
    index=[
        ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d"],
        [1, 2, 3, 1, 2, 3, 1, 2, 2, 1],
    ],
)
s

a  1   -0.260055
   2    0.056474
   3    1.217846
b  1    0.146498
   2   -0.581311
   3    1.021084
c  1   -0.194897
   2   -0.900915
   2    1.307024
d  1    2.532742
dtype: float64

In [108]:
# Inspect the MultiIndex built from the two parallel label lists.
s.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 2, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 0]])

In [109]:
# Select by an outer-level label; the result is indexed by the inner level only.
s["b"]

1    0.146498
2   -0.581311
3    1.021084
dtype: float64

In [110]:
# Label slice on the outer level; both end labels are included.
s["b":"c"]

b  1    0.146498
   2   -0.581311
   3    1.021084
c  1   -0.194897
   2   -0.900915
   2    1.307024
dtype: float64

In [113]:
# Select with a full (outer, inner) key tuple.
s[("b",3)]

b  3    1.021084
dtype: float64

In [115]:
# Select every entry whose inner-level label is 2, across all outer labels.
s[:,2]

a    0.056474
b   -0.581311
c   -0.900915
c    1.307024
dtype: float64

In [170]:
# NOTE: this rebinds `s2`, which earlier in the notebook named a Series.
# A table with hierarchical rows *and* hierarchical columns has to be a
# pd.DataFrame; pd.Series cannot take a `columns` argument.
s2 = pd.DataFrame(
    np.arange(12).reshape(4, -1),
    index=pd.MultiIndex.from_arrays([["a", "a", "b", "b"], [1, 2, 1, 2]]),
    columns=pd.MultiIndex.from_arrays(
        [["Seoul", "Seoul", "Busan"], ["Green", "Red", "Green"]]
    ),
)
s2

Unnamed: 0_level_0,Unnamed: 1_level_0,Seoul,Seoul,Busan
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [171]:
# Name the two levels of the row index (this mutates s2's index in place).
s2.index.names=["key1","key2"]
s2

Unnamed: 0_level_0,Unnamed: 1_level_0,Seoul,Seoul,Busan
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [172]:
# Name the two levels of the column index (this mutates s2's columns in place).
s2.columns.names=["city","color"]
s2

Unnamed: 0_level_0,city,Seoul,Seoul,Busan
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [173]:
# Select every column under the top-level column label "Seoul".
s2["Seoul"]

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [174]:
# A full (city, color) column key selects a single column as a Series.
s2["Seoul","Green"]

key1  key2
a     1       0
      2       3
b     1       6
      2       9
Name: (Seoul, Green), dtype: int32

In [175]:
# s2["a"] raises a KeyError: plain [] on a DataFrame looks the key up among the
# column labels, and "a" is a row-index label. Row labels must go through .loc.
s2.loc["a"]

city,Seoul,Seoul,Busan
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,0,1,2
2,3,4,5


In [176]:
# Full row key (key1="a", key2=1): that row comes back as a Series indexed by (city, color).
s2.loc[("a",1)]

city   color
Seoul  Green    0
       Red      1
Busan  Green    2
Name: (a, 1), dtype: int32

In [177]:
# Rows with key1 == "b", restricted to the single column (Seoul, Red).
s2.loc["b",("Seoul","Red")]

key2
1     7
2    10
Name: (Seoul, Red), dtype: int32

In [178]:
# Row (b, 2), all columns under city "Busan".
# ("Busan") is just the string "Busan": parentheses alone do not make a tuple.
s2.loc[("b",2),("Busan")]

color
Green    11
Name: (b, 2), dtype: int32

In [179]:
# NOTE(review): this repeats the previous cell's expression unchanged.
# Row (b, 2), all columns under city "Busan".
s2.loc[("b",2),("Busan")]

color
Green    11
Name: (b, 2), dtype: int32

In [183]:
# A full row key plus a full column key selects a single scalar value.
s2.loc[("b",1),("Busan","Green")]

8

axis(축방향) 와 level(계층) 의 관점에서

In [186]:
s2.sort_index(axis = 0, level =0) # Sort rows by level 0 of the row index, i.e. key1.

Unnamed: 0_level_0,city,Seoul,Seoul,Busan
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [188]:
s2.sort_index(axis = 0, level =1) # Sort rows by level 1 of the row index, i.e. key2.

Unnamed: 0_level_0,city,Seoul,Seoul,Busan
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [190]:
s2.sort_index(axis = 0, level ="key1") # A level can also be given by name instead of position.

Unnamed: 0_level_0,city,Seoul,Seoul,Busan
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [192]:
s2.sort_index(axis = 1, level =0) # Sort columns by level 0 of the column index (city).

Unnamed: 0_level_0,city,Busan,Seoul,Seoul
Unnamed: 0_level_1,color,Green,Green,Red
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,2,0,1
a,2,5,3,4
b,1,8,6,7
b,2,11,9,10


In [195]:
s2.sort_index(axis = 1, level =1) # Sort columns by level 1 of the column index (color).

Unnamed: 0_level_0,city,Busan,Seoul,Seoul
Unnamed: 0_level_1,color,Green,Green,Red
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,2,0,1
a,2,5,3,4
b,1,8,6,7
b,2,11,9,10


In [197]:
# Sort columns by the column level named "color" (level given by name).
s2.sort_index(axis = 1, level ="color")

Unnamed: 0_level_0,city,Busan,Seoul,Seoul
Unnamed: 0_level_1,color,Green,Green,Red
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,2,0,1
a,2,5,3,4
b,1,8,6,7
b,2,11,9,10


In [199]:
s2.sort_values(by=("Busan","Green")) # Sort rows by the values in the (Busan, Green) column.

Unnamed: 0_level_0,city,Seoul,Seoul,Busan
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


산술계산 .sum(axis = 0, level=0)

In [201]:
# The `level` keyword of DataFrame.sum was deprecated in pandas 1.3 and removed
# in 2.0; group on the index level explicitly instead.
# Sum the rows that share the same key1 (row-index level 0).
# sort=False keeps the groups in order of first appearance.
s2.groupby(level=0, sort=False).sum()

city,Seoul,Seoul,Busan
color,Green,Red,Green
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
a,3,5,7
b,15,17,19


In [244]:
# The `level` keyword of DataFrame.sum was deprecated in pandas 1.3 and removed
# in 2.0; group on the index level explicitly instead.
# Sum the rows that share the same key2 (row-index level 1).
# sort=False keeps the groups in order of first appearance.
s2.groupby(level=1, sort=False).sum()

city,Seoul,Seoul,Busan
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [245]:
# The `level` keyword of DataFrame.sum was deprecated in pandas 1.3 and removed
# in 2.0. To sum the columns that share a color, transpose so the column levels
# become row levels, group on "color", sum, and transpose back.
# sort=False keeps the colors in order of first appearance.
s2.T.groupby(level="color", sort=False).sum().T

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


In [246]:
# The `level` keyword of DataFrame.sum was deprecated in pandas 1.3 and removed
# in 2.0, so both reductions are written as explicit groupby calls.
# Step 1: sum the columns that share a color (transpose, group, transpose back).
# Step 2: sum the rows that share the same key2 (row-index level 1).
# sort=False keeps the groups in order of first appearance.
(
    s2.T.groupby(level="color", sort=False).sum().T
    .groupby(level=1, sort=False).sum()
)

color,Green,Red
key2,Unnamed: 1_level_1,Unnamed: 2_level_1
1,16,8
2,28,14


In [247]:
# Flat frame whose 'c' and 'd' columns will serve as index levels further down.
# NOTE: this rebinds `df3`, which earlier held random data.
df3 = pd.DataFrame({
    'a': range(7),
    'b': range(7, 0, -1),
    'c': ['one'] * 3 + ['two'] * 4,
    'd': [0, 1, 2, 0, 1, 2, 3],
})
df3

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [248]:
# Promote columns 'c' and 'd' to a two-level row index; with drop=True (the
# default) they are removed from the regular columns.
df4 = df3.set_index(keys=["c", "d"], drop=True)
df4

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [249]:
# drop=False uses the given columns as the row index but also keeps them as
# regular columns.
df3.set_index(["c","d"], drop = False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [253]:
# 2x3 frame whose row axis is named "city" and column axis is named "number".
df5 = pd.DataFrame(
    np.arange(6).reshape(2, 3),
    index=pd.Index(["Seoul", "Busan"], name="city"),
    columns=pd.Index(["one", "two", "three"], name="number"),
)

In [264]:
# stack() moves the column labels into the innermost row-index level; with
# single-level columns the result is a Series indexed by (city, number).
df6=df5.stack()
df6

city   number
Seoul  one       0
       two       1
       three     2
Busan  one       3
       two       4
       three     5
dtype: int32

In [266]:
# Move the "city" row level to the column side. df5 has only that one row
# level, so the result is a Series indexed by (number, city).
df5.unstack(level = "city")

number  city 
one     Seoul    0
        Busan    3
two     Seoul    1
        Busan    4
three   Seoul    2
        Busan    5
dtype: int32

In [268]:
# A level is addressed either by integer position or by its name. The string
# "1" is neither: df5's only row level is named "city", and recent pandas
# versions reject an unknown level name with a KeyError. Use the integer
# position of that level instead.
df5.unstack(level=0)

number  city 
one     Seoul    0
        Busan    3
two     Seoul    1
        Busan    4
three   Seoul    2
        Busan    5
dtype: int32

number  one  two  three
city                   
Seoul     0    1      2
Busan     3    4      5
