# 4.5 데이터프레임 인덱스 조작

# 데이터프레임 인덱스 설정 및 제거

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Build a sample frame: one label column (C1) followed by three columns of random values
np.random.seed(0)  # fixed seed so the values are reproducible
labels = list('ABCDE')
values = np.round(np.random.rand(3, 5), 2)
# Stacking the labels on top of the 3x5 values and transposing gives 5 rows x 4 columns
stacked = np.vstack([labels, values]).T
df1 = pd.DataFrame(stacked, columns=["C1", "C2", "C3", "C4"])
df1

Unnamed: 0,C1,C2,C3,C4
0,A,0.55,0.65,0.79
1,B,0.72,0.44,0.53
2,C,0.6,0.89,0.57
3,D,0.54,0.96,0.93
4,E,0.42,0.38,0.07


In [3]:
# set_index: use an existing column ("C1") as the row index
df2 = df1.set_index("C1")
df2

Unnamed: 0_level_0,C2,C3,C4
C1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.55,0.65,0.79
B,0.72,0.44,0.53
C,0.6,0.89,0.57
D,0.54,0.96,0.93
E,0.42,0.38,0.07


In [4]:
# Setting a new index replaces the current one: the previous index (C1) is discarded
df2.set_index("C2")

Unnamed: 0_level_0,C3,C4
C2,Unnamed: 1_level_1,Unnamed: 2_level_1
0.55,0.65,0.79
0.72,0.44,0.53
0.6,0.89,0.57
0.54,0.96,0.93
0.42,0.38,0.07


In [5]:
# reset_index: turn the index back into an ordinary data column;
# the row index becomes the default integer index
df2.reset_index()

Unnamed: 0,C1,C2,C3,C4
0,A,0.55,0.65,0.79
1,B,0.72,0.44,0.53
2,C,0.6,0.89,0.57
3,D,0.54,0.96,0.93
4,E,0.42,0.38,0.07


In [6]:
# With drop=True the old index is discarded instead of being restored as a column
df2.reset_index(drop=True)

Unnamed: 0,C2,C3,C4
0,0.55,0.65,0.79
1,0.72,0.44,0.53
2,0.6,0.89,0.57
3,0.54,0.96,0.93
4,0.42,0.38,0.07


# 연습 문제 4.5.1

5명의 학생의 국어, 영어, 수학 점수를 나타내는 데이터프레임을 다음과 같이 만든다.

1. 학생 이름을 나타내는 열을 포함시키지 않고 데이터프레임 `df_score1`을 생성한 후, `df_score1.index` 속성에 학생 이름을 나타내는 열을 지정하여 인덱스를 지정한다. `reset_index` 명령으로 이 인덱스 열을 일반 데이터 열로 바꾸어 데이터프레임 `df_score2`를 만든다.
2. 학생 이름을 나타내는 열을 일반 데이터 열로 포함하는 데이터프레임 `df_score2`에 `set_index` 명령을 적용하여 다시 학생 이름을 나타내는 열을 인덱스로 변경한다.

In [7]:
# Create the score table: 5 students x 3 subjects, integer scores from 0 to 100
np.random.seed(93)  # fixed seed so the scores are reproducible
subjects = ["국어", "수학", "영어"]
scores = np.random.randint(0, 101, size=(5, 3))
df_score1 = pd.DataFrame(scores, columns=subjects)
df_score1

Unnamed: 0,국어,수학,영어
0,37,91,4
1,24,79,97
2,85,48,77
3,52,52,51
4,21,49,74


In [8]:
# Assign the student names as the row index and name that index "학생" in one step
df_score1.index = pd.Index(["학생1", "학생2", "학생3", "학생4", "학생5"],
                           name="학생")
df_score1

Unnamed: 0_level_0,국어,수학,영어
학생,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
학생1,37,91,4
학생2,24,79,97
학생3,85,48,77
학생4,52,52,51
학생5,21,49,74


In [9]:
# 1. reset_index: move the "학생" index back into an ordinary column
df_score2 = df_score1.reset_index()
df_score2

Unnamed: 0,학생,국어,수학,영어
0,학생1,37,91,4
1,학생2,24,79,97
2,학생3,85,48,77
3,학생4,52,52,51
4,학생5,21,49,74


In [10]:
# 2. set_index: make the "학생" column the row index again
#    (the result is only displayed; df_score2 itself is not reassigned)
df_score2.set_index("학생")

Unnamed: 0_level_0,국어,수학,영어
학생,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
학생1,37,91,4
학생2,24,79,97
학생3,85,48,77
학생4,52,52,51
학생5,21,49,74


# 다중 인덱스

In [11]:
# Multi-level column index: passing a list of label lists creates one level per list
np.random.seed(0)  # fixed seed so the values are reproducible
upper_labels = ["A", "A", "B", "B"]
lower_labels = ["C1", "C2", "C1", "C2"]
samples = np.random.randn(5, 4)
df3 = pd.DataFrame(np.round(samples, 2), columns=[upper_labels, lower_labels])
df3

Unnamed: 0_level_0,A,A,B,B
Unnamed: 0_level_1,C1,C2,C1,C2
0,1.76,0.4,0.98,2.24
1,1.87,-0.98,0.95,-0.15
2,-0.1,0.41,0.14,1.45
3,0.76,0.12,0.44,0.33
4,1.49,-0.21,0.31,-0.85


In [12]:
# Name the two levels of the column index
df3.columns.names = ["Cidx1", "Cidx2"]
df3

Cidx1,A,A,B,B
Cidx2,C1,C2,C1,C2
0,1.76,0.4,0.98,2.24
1,1.87,-0.98,0.95,-0.15
2,-0.1,0.41,0.14,1.45
3,0.76,0.12,0.44,0.33
4,1.49,-0.21,0.31,-0.85


In [13]:
# Multi-level index on both axes: two row levels and two column levels
np.random.seed(0)  # fixed seed so the values are reproducible
row_labels = [["M"] * 3 + ["F"] * 3,
              ["id_1", "id_2", "id_3"] * 2]
col_labels = [["A", "A", "B", "B"],
              ["C", "D", "C", "D"]]
df4 = pd.DataFrame(np.round(np.random.randn(6, 4), 2),
                   index=row_labels, columns=col_labels)
# Name the levels so that later cells can refer to them by name
df4.index.names = ["Ridx1", "Ridx2"]
df4.columns.names = ["Cidx1", "Cidx2"]
df4

Unnamed: 0_level_0,Cidx1,A,A,B,B
Unnamed: 0_level_1,Cidx2,C,D,C,D
Ridx1,Ridx2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,id_1,1.76,0.4,0.98,2.24
M,id_2,1.87,-0.98,0.95,-0.15
M,id_3,-0.1,0.41,0.14,1.45
F,id_1,0.76,0.12,0.44,0.33
F,id_2,1.49,-0.21,0.31,-0.85
F,id_3,-2.55,0.65,0.86,-0.74


# 행 인덱스와 열 인덱스 교환

In [14]:
# stack(<column level name>): move a column index level into the row index
df4.stack("Cidx1")

Unnamed: 0_level_0,Unnamed: 1_level_0,Cidx2,C,D
Ridx1,Ridx2,Cidx1,Unnamed: 3_level_1,Unnamed: 4_level_1
M,id_1,A,1.76,0.4
M,id_1,B,0.98,2.24
M,id_2,A,1.87,-0.98
M,id_2,B,0.95,-0.15
M,id_3,A,-0.1,0.41
M,id_3,B,0.14,1.45
F,id_1,A,0.76,0.12
F,id_1,B,0.44,0.33
F,id_2,A,1.49,-0.21
F,id_2,B,0.31,-0.85


In [15]:
# The level can also be given as an integer position (1 is the second column level, Cidx2)
df4.stack(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Cidx1,A,B
Ridx1,Ridx2,Cidx2,Unnamed: 3_level_1,Unnamed: 4_level_1
M,id_1,C,1.76,0.98
M,id_1,D,0.4,2.24
M,id_2,C,1.87,0.95
M,id_2,D,-0.98,-0.15
M,id_3,C,-0.1,0.14
M,id_3,D,0.41,1.45
F,id_1,C,0.76,0.44
F,id_1,D,0.12,0.33
F,id_2,C,1.49,0.31
F,id_2,D,-0.21,-0.85


In [16]:
# unstack(<row level name>): move a row index level into the column index
df4.unstack("Ridx2")

Cidx1,A,A,A,A,A,A,B,B,B,B,B,B
Cidx2,C,C,C,D,D,D,C,C,C,D,D,D
Ridx2,id_1,id_2,id_3,id_1,id_2,id_3,id_1,id_2,id_3,id_1,id_2,id_3
Ridx1,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
F,0.76,1.49,-2.55,0.12,-0.21,0.65,0.44,0.31,0.86,0.33,-0.85,-0.74
M,1.76,1.87,-0.1,0.4,-0.98,0.41,0.98,0.95,0.14,2.24,-0.15,1.45


In [17]:
# As with stack, the level can be given as an integer position (0 is Ridx1)
df4.unstack(0)

Cidx1,A,A,A,A,B,B,B,B
Cidx2,C,C,D,D,C,C,D,D
Ridx1,F,M,F,M,F,M,F,M
Ridx2,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
id_1,0.76,1.76,0.12,0.4,0.44,0.98,0.33,2.24
id_2,1.49,1.87,-0.21,-0.98,0.31,0.95,-0.85,-0.15
id_3,-2.55,-0.1,0.65,0.41,0.86,0.14,-0.74,1.45


# 다중 인덱스가 있는 경우의 인덱싱

In [18]:
# Show df4 again as the starting point for the indexing examples below
df4

Unnamed: 0_level_0,Cidx1,A,A,B,B
Unnamed: 0_level_1,Cidx2,C,D,C,D
Ridx1,Ridx2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,id_1,1.76,0.4,0.98,2.24
M,id_2,1.87,-0.98,0.95,-0.15
M,id_3,-0.1,0.41,0.14,1.45
F,id_1,0.76,0.12,0.44,0.33
F,id_2,1.49,-0.21,0.31,-0.85
F,id_3,-2.55,0.65,0.86,-0.74


In [19]:
# A column under a multi-level column index is selected with a tuple, one label per level
df4[ ("A", "C") ]

Ridx1  Ridx2
M      id_1     1.76
       id_2     1.87
       id_3    -0.10
F      id_1     0.76
       id_2     1.49
       id_3    -2.55
Name: (A, C), dtype: float64

In [20]:
# loc also takes a tuple for a multi-level row label
df4.loc[ ("M","id_1"), :]

Cidx1  Cidx2
A      C        1.76
       D        0.40
B      C        0.98
       D        2.24
Name: (M, id_1), dtype: float64

In [21]:
# Update a single value in place, addressing both the row and the column by tuple
df4.loc[ ("M","id_1"), ("A", "C")] = 100
df4

Unnamed: 0_level_0,Cidx1,A,A,B,B
Unnamed: 0_level_1,Cidx2,C,D,C,D
Ridx1,Ridx2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,id_1,100.0,0.4,0.98,2.24
M,id_2,1.87,-0.98,0.95,-0.15
M,id_3,-0.1,0.41,0.14,1.45
F,id_1,0.76,0.12,0.44,0.33
F,id_2,1.49,-0.21,0.31,-0.85
F,id_3,-2.55,0.65,0.86,-0.74


In [22]:
# iloc does not accept multi-index label tuples; it uses integer positions as usual
df4.iloc[0:, 0:2]

Unnamed: 0_level_0,Cidx1,A,A
Unnamed: 0_level_1,Cidx2,C,D
Ridx1,Ridx2,Unnamed: 2_level_2,Unnamed: 3_level_2
M,id_1,100.0,0.4
M,id_2,1.87,-0.98
M,id_3,-0.1,0.41
F,id_1,0.76,0.12
F,id_2,1.49,-0.21
F,id_3,-2.55,0.65


In [23]:
# A single label is interpreted as a label of the top (outermost) level,
# so passing a second-level label on its own raises an error
df4["A"]

Unnamed: 0_level_0,Cidx2,C,D
Ridx1,Ridx2,Unnamed: 2_level_1,Unnamed: 3_level_1
M,id_1,100.0,0.4
M,id_2,1.87,-0.98
M,id_3,-0.1,0.41
F,id_1,0.76,0.12
F,id_2,1.49,-0.21
F,id_3,-2.55,0.65


In [24]:
# With loc, too, a single label is matched against the top level of the row index
df4.loc["M"]

Cidx1,A,A,B,B
Cidx2,C,D,C,D
Ridx2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
id_1,100.0,0.4,0.98,2.24
id_2,1.87,-0.98,0.95,-0.15
id_3,-0.1,0.41,0.14,1.45


In [25]:
# Add a row to a multi-indexed frame: ("All", "All") holds the column sums
df4.loc[("All", "All"), :] = df4.sum()
df4

Unnamed: 0_level_0,Cidx1,A,A,B,B
Unnamed: 0_level_1,Cidx2,C,D,C,D
Ridx1,Ridx2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,id_1,100.0,0.4,0.98,2.24
M,id_2,1.87,-0.98,0.95,-0.15
M,id_3,-0.1,0.41,0.14,1.45
F,id_1,0.76,0.12,0.44,0.33
F,id_2,1.49,-0.21,0.31,-0.85
F,id_3,-2.55,0.65,0.86,-0.74
All,All,101.47,0.39,3.68,2.28


In [26]:
# To select every label of a given level, use slice(None)
# in place of ":" inside the tuple
df4.loc[("M", slice(None)), ("A",slice(None))]

Unnamed: 0_level_0,Cidx1,A,A
Unnamed: 0_level_1,Cidx2,C,D
Ridx1,Ridx2,Unnamed: 2_level_2,Unnamed: 3_level_2
M,id_1,100.0,0.4
M,id_2,1.87,-0.98
M,id_3,-0.1,0.41


# 다중 인덱스의 인덱스 순서 교환

In [27]:
# swaplevel(i, j, axis)
# i, j: the two index levels to exchange; axis defaults to 0, i.e. the row index here
# Swap the two row index levels
df5 = df4.swaplevel("Ridx1", "Ridx2")
df5

Unnamed: 0_level_0,Cidx1,A,A,B,B
Unnamed: 0_level_1,Cidx2,C,D,C,D
Ridx2,Ridx1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
id_1,M,100.0,0.4,0.98,2.24
id_2,M,1.87,-0.98,0.95,-0.15
id_3,M,-0.1,0.41,0.14,1.45
id_1,F,0.76,0.12,0.44,0.33
id_2,F,1.49,-0.21,0.31,-0.85
id_3,F,-2.55,0.65,0.86,-0.74
All,All,101.47,0.39,3.68,2.28


In [28]:
# Swap the two column index levels (axis=1)
df6 = df4.swaplevel(0,1, axis=1)
df6

Unnamed: 0_level_0,Cidx2,C,D,C,D
Unnamed: 0_level_1,Cidx1,A,A,B,B
Ridx1,Ridx2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,id_1,100.0,0.4,0.98,2.24
M,id_2,1.87,-0.98,0.95,-0.15
M,id_3,-0.1,0.41,0.14,1.45
F,id_1,0.76,0.12,0.44,0.33
F,id_2,1.49,-0.21,0.31,-0.85
F,id_3,-2.55,0.65,0.86,-0.74
All,All,101.47,0.39,3.68,2.28


# 다중 인덱스가 있는 경우의 정렬

In [29]:
# Sort a multi-index by the given level (level=0 is the outermost level)
# axis defaults to 0, i.e. the row index here
df5.sort_index(level=0)

Unnamed: 0_level_0,Cidx1,A,A,B,B
Unnamed: 0_level_1,Cidx2,C,D,C,D
Ridx2,Ridx1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
All,All,101.47,0.39,3.68,2.28
id_1,F,0.76,0.12,0.44,0.33
id_1,M,100.0,0.4,0.98,2.24
id_2,F,1.49,-0.21,0.31,-0.85
id_2,M,1.87,-0.98,0.95,-0.15
id_3,F,-2.55,0.65,0.86,-0.74
id_3,M,-0.1,0.41,0.14,1.45


In [30]:
# Sort by the column index instead (axis=1)
df6.sort_index(axis=1, level=0)

Unnamed: 0_level_0,Cidx2,C,C,D,D
Unnamed: 0_level_1,Cidx1,A,B,A,B
Ridx1,Ridx2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,id_1,100.0,0.98,0.4,2.24
M,id_2,1.87,0.95,-0.98,-0.15
M,id_3,-0.1,0.14,0.41,1.45
F,id_1,0.76,0.44,0.12,0.33
F,id_2,1.49,0.31,-0.21,-0.85
F,id_3,-2.55,0.86,0.65,-0.74
All,All,101.47,3.68,0.39,2.28


# 연습 문제 4.5.2

A반 학생 5명과 B반 학생 5명의 국어, 영어, 수학 점수를 나타내는 데이터프레임을 다음과 같이 만든다.

1. "반", "번호", "국어", "영어", "수학" 을 열로 가지는 데이터프레임 `df_score3`을 만든다.

2. `df_score3`을 변형하여 1차 행 인덱스로 "반"을, 2차 행 인덱스로 "번호"를 가지는 데이터프레임 `df_score4`를 만든다.

3. 데이터 프레임 `df_score4`에 각 학생의 평균을 나타내는 열을 오른쪽에 추가한다.

4. `df_score3`을 변형하여 행 인덱스로 "번호"를, 1차 열 인덱스로 "국어", "영어", "수학"을, 2차 열 인덱스로 "반"을 가지는 데이터프레임 `df_score5`을 만든다.

5. 데이터 프레임 `df_score5`에 각 반별 각 과목의 평균을 나타내는 행을 아래에 추가한다.

In [31]:
# Build the raw score table: 10 students, class "A" for the first five and "B" for the rest
np.random.seed(93)  # fixed seed so the scores are reproducible

a = []
for i in range(10):
    # The class label is the only thing that differs between the two halves
    class_name = "A" if i <= 4 else "B"
    # One draw of three scores (0-100) per student, in the same order as before
    a.extend([class_name, i + 1, *np.random.randint(0, 101, size=3)])

# NOTE(review): student numbers run 1-10 across both classes rather than
# restarting at 1 for class B — confirm this is intended.
# Mixing str and int values makes np.array store every cell as a string
data = np.array(a).reshape(10,5)
data

array([['A', '1', '37', '91', '4'],
       ['A', '2', '24', '79', '97'],
       ['A', '3', '85', '48', '77'],
       ['A', '4', '52', '52', '51'],
       ['A', '5', '21', '49', '74'],
       ['B', '6', '33', '8', '71'],
       ['B', '7', '80', '88', '9'],
       ['B', '8', '55', '11', '5'],
       ['B', '9', '44', '43', '46'],
       ['B', '10', '15', '32', '91']], dtype='<U2')

In [32]:
# 1. Create df_score3 from the raw array
df_score3 = pd.DataFrame(data, columns=["반", "번호", "국어", "영어", "수학"])

# Every cell of `data` is a string, so convert the numeric columns to int
numeric_cols = ["번호", "국어", "영어", "수학"]
df_score3[numeric_cols] = df_score3[numeric_cols].astype(int)

df_score3

Unnamed: 0,반,번호,국어,영어,수학
0,A,1,37,91,4
1,A,2,24,79,97
2,A,3,85,48,77
3,A,4,52,52,51
4,A,5,21,49,74
5,B,6,33,8,71
6,B,7,80,88,9
7,B,8,55,11,5
8,B,9,44,43,46
9,B,10,15,32,91


In [33]:
# 2. Reshape df_score3 into df_score4, with "반" as the first row index level
#    and "번호" as the second
df_score4 = df_score3.set_index(["반","번호"])
df_score4

Unnamed: 0_level_0,Unnamed: 1_level_0,국어,영어,수학
반,번호,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,1,37,91,4
A,2,24,79,97
A,3,85,48,77
A,4,52,52,51
A,5,21,49,74
B,6,33,8,71
B,7,80,88,9
B,8,55,11,5
B,9,44,43,46
B,10,15,32,91


In [34]:
# 3. Append each student's mean score as a new column on the right of df_score4
per_student_mean = df_score4.mean(axis=1)  # axis=1: average across the subject columns
df_score4["mean_score"] = per_student_mean.round(2)
df_score4

Unnamed: 0_level_0,Unnamed: 1_level_0,국어,영어,수학,mean_score
반,번호,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,1,37,91,4,44.0
A,2,24,79,97,66.67
A,3,85,48,77,70.0
A,4,52,52,51,51.67
A,5,21,49,74,48.0
B,6,33,8,71,37.33
B,7,80,88,9,59.0
B,8,55,11,5,23.67
B,9,44,43,46,44.33
B,10,15,32,91,46.0


In [35]:
# 4. Reshape df_score3 into df_score5: "번호" as the row index, the subjects
# ("국어", "영어", "수학") as the first column level and "반" as the second.
# NOTE(review): 번호 values do not repeat across 반 in df_score3, so every row
# ends up with NaN for one of the two classes.

temp = df_score3.set_index(["번호","반"])
df_score5 =temp.unstack("반").sort_index(level=0)
df_score5

Unnamed: 0_level_0,국어,국어,영어,영어,수학,수학
반,A,B,A,B,A,B
번호,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,37.0,,91.0,,4.0,
2,24.0,,79.0,,97.0,
3,85.0,,48.0,,77.0,
4,52.0,,52.0,,51.0,
5,21.0,,49.0,,74.0,
6,,33.0,,8.0,,71.0
7,,80.0,,88.0,,9.0
8,,55.0,,11.0,,5.0
9,,44.0,,43.0,,46.0
10,,15.0,,32.0,,91.0


In [36]:
# 5. Append a "평균" row at the bottom of df_score5 holding the mean of each (subject, 반) column
df_score5.loc["평균",:] = df_score5.mean()
df_score5

Unnamed: 0_level_0,국어,국어,영어,영어,수학,수학
반,A,B,A,B,A,B
번호,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,37.0,,91.0,,4.0,
2,24.0,,79.0,,97.0,
3,85.0,,48.0,,77.0,
4,52.0,,52.0,,51.0,
5,21.0,,49.0,,74.0,
6,,33.0,,8.0,,71.0
7,,80.0,,88.0,,9.0
8,,55.0,,11.0,,5.0
9,,44.0,,43.0,,46.0
10,,15.0,,32.0,,91.0
