<a href="https://colab.research.google.com/github/Sjleerodls/Data_Analysis/blob/main/lab_da/da11_index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [81]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn

# Series 인덱스

## 단일 계층 인덱스

In [82]:
# Build a Series of five random floats without passing an explicit index.
s = pd.Series(np.random.rand(5))
s   #> When no index is given at construction, a RangeIndex is created automatically.

Unnamed: 0,0
0,0.481721
1,0.258744
2,0.616524
3,0.368401
4,0.195237


In [83]:
s.index     # the index, i.e. the row labels

RangeIndex(start=0, stop=5, step=1)

In [84]:
s.values    # the values, returned as an np.ndarray

array([0.4817212 , 0.2587441 , 0.61652422, 0.36840085, 0.19523725])

In [85]:
# Build a Series of five random floats labelled with explicit string keys.
s = pd.Series(
    np.random.rand(5),
    index=["a", "b", "c", "d", "e5a"],
)
s   #> The Index object is built from the labels given to the constructor.

Unnamed: 0,0
a,0.920494
b,0.637004
c,0.974038
d,0.239505
e5a,0.676146


In [86]:
s.index     # an Index holding the five string labels

Index(['a', 'b', 'c', 'd', 'e5a'], dtype='object')

In [87]:
s.index.nlevels     # Index.nlevels property: the number of levels in the index.

1

## 계층적 인덱스(hierarchical index), Multi-level index

In [88]:
# Passing a list of two label lists as the index produces a two-level MultiIndex.
s = pd.Series(
    np.random.randn(6),
    index=[
        ["m", "m", "f", "f", "u", "u"],
        [1, 2, 3, 1, 2, 3],
    ],
)
s

Unnamed: 0,Unnamed: 1,0
m,1,0.312622
m,2,0.012422
f,3,-0.936883
f,1,0.24864
u,2,-0.234954
u,3,-1.477066


In [89]:
s.values    # the values as an ndarray

array([ 0.31262181,  0.01242185, -0.93688346,  0.24863956, -0.23495372,
       -1.47706584])

In [90]:
s.index     #> MultiIndex: an array whose elements are tuples

MultiIndex([('m', 1),
            ('m', 2),
            ('f', 3),
            ('f', 1),
            ('u', 2),
            ('u', 3)],
           )

In [91]:
s.index.nlevels     # two levels

2

## Indexing, Slicing

nlevels(인덱스의 계층 개수)가 2 이상인 MultiIndex를 사용해서 loc 속성을 이용할 때,
* 첫번째 레벨의 인덱스만 가지고 indexing, slicing을 할 수 있음.
* 두번째 이상의 레벨의 인덱스만 가지고는 indexing, slicing을 할 수 없음.
* 튜플 형태의 인덱스로는 indexing, slicing이 가능함.

In [92]:
# Indexing with a first-level label only.
s.loc['m']  # alternative spelling: s['m']

Unnamed: 0,0
1,0.312622
2,0.012422


In [93]:
# Slicing on the unsorted MultiIndex.
# s.loc['m' : 'f']    #> raises UnsortedIndexError

In [94]:
# s.loc[1]    #> raises KeyError (1 is a second-level label, not a first-level one)
s.loc[('m', 1)]     # a tuple key covering both levels is accepted

np.float64(0.3126218064938325)

In [95]:
# A list of tuple keys selects several rows at once.
s.loc[[('m', 1), ('f', 1)]]

Unnamed: 0,Unnamed: 1,0
m,1,0.312622
f,1,0.24864


In [96]:
# s.loc[('m', 1) : ('f', 3)]  #> raises UnsortedIndexError

* `pd.Series.sort_values()` : 값들을 정렬
* `pd.Series.sort_index()` : 인덱스들을 정렬.

In [97]:
s   # the Series with its index still in the original, unsorted order

Unnamed: 0,Unnamed: 1,0
m,1,0.312622
m,2,0.012422
f,3,-0.936883
f,1,0.24864
u,2,-0.234954
u,3,-1.477066


In [98]:
# Keep a copy of the Series ordered by its index labels.
s_idx_sort = s.sort_index()
s_idx_sort

Unnamed: 0,Unnamed: 1,0
f,1,0.24864
f,3,-0.936883
m,1,0.312622
m,2,0.012422
u,2,-0.234954
u,3,-1.477066


In [99]:
s_idx_sort.loc['f':'m']     #> label slicing works once the index has been sorted.

Unnamed: 0,Unnamed: 1,0
f,1,0.24864
f,3,-0.936883
m,1,0.312622
m,2,0.012422


In [100]:
# Slice bounds may be tuples; the upper bound ('m', 10) is not an existing key.
s_idx_sort.loc[('f', 3):('m', 10)]

Unnamed: 0,Unnamed: 1,0
f,3,-0.936883
m,1,0.312622
m,2,0.012422


## index swapping
인덱스 레벨들의 순서를 서로 맞바꾸는 것.

In [101]:
# Swap index levels 0 and 1, then sort by the swapped index.
# Alternative spelling: s.swaplevel()
s_swap = s.swaplevel(0, 1).sort_index()
s_swap

Unnamed: 0,Unnamed: 1,0
1,f,0.24864
1,m,0.312622
2,m,0.012422
2,u,-0.234954
3,f,-0.936883
3,u,-1.477066


In [102]:
# Label-based slice on the first level, which is now the integer level.
s_swap.loc[1:2]

Unnamed: 0,Unnamed: 1,0
1,f,0.24864
1,m,0.312622
2,m,0.012422
2,u,-0.234954


In [103]:
# A Series whose index has three levels (nlevels == 3).
s = pd.Series(
    np.random.rand(6),
    index=[
        np.arange(1, 7),
        ["A", "A", "A", "B", "B", "B"],
        ["aa", "bb", "aa", "bb", "aa", "bb"],
    ],
)
s

Unnamed: 0,Unnamed: 1,Unnamed: 2,0
1,A,aa,0.782452
2,A,bb,0.024879
3,A,aa,0.811503
4,B,bb,0.99159
5,B,aa,0.708418
6,B,bb,0.057049


In [104]:
s.index     # MultiIndex of 3-tuples

MultiIndex([(1, 'A', 'aa'),
            (2, 'A', 'bb'),
            (3, 'A', 'aa'),
            (4, 'B', 'bb'),
            (5, 'B', 'aa'),
            (6, 'B', 'bb')],
           )

In [105]:
s.index.nlevels # 3: each index entry is a 3-tuple, one element per level

3

In [106]:
s.swaplevel()       # defaults i=-2, j=-1: swaps the last level with the second-to-last level.

Unnamed: 0,Unnamed: 1,Unnamed: 2,0
1,aa,A,0.782452
2,bb,A,0.024879
3,aa,A,0.811503
4,bb,B,0.99159
5,aa,B,0.708418
6,bb,B,0.057049


In [107]:
s.swaplevel(i=0, j=1)   # swap the first two levels

Unnamed: 0,Unnamed: 1,Unnamed: 2,0
A,1,aa,0.782452
A,2,bb,0.024879
A,3,aa,0.811503
B,4,bb,0.99159
B,5,aa,0.708418
B,6,bb,0.057049


In [110]:
# After the swap the letter level comes first, so 'B' can be used as a key.
s.swaplevel(0, 1).loc['B']

Unnamed: 0,Unnamed: 1,0
4,bb,0.99159
5,aa,0.708418
6,bb,0.057049


# DataFrame 계층적 인덱스

In [118]:
# A 6x3 DataFrame of random floats with a two-level row index.
df = pd.DataFrame(
    np.random.rand(6, 3),
    index=[
        ["Fri", "Fri", "Sat", "Sat", "Sun", "Sun"],
        ["Lunch", "Dinner"] * 3,
    ],
    columns=["a", "b", "c"],
)
df

Unnamed: 0,Unnamed: 1,a,b,c
Fri,Lunch,0.763056,0.782051,0.940255
Fri,Dinner,0.154356,0.141106,0.138781
Sat,Lunch,0.905081,0.355048,0.852519
Sat,Dinner,0.999221,0.867615,0.124914
Sun,Lunch,0.575017,0.980516,0.692963
Sun,Dinner,0.859022,0.810956,0.109323


In [120]:
df.values   #> a 2-D ndarray made of the DataFrame's values

array([[0.7630558 , 0.78205079, 0.94025458],
       [0.15435569, 0.14110642, 0.13878128],
       [0.9050807 , 0.35504847, 0.85251896],
       [0.99922137, 0.86761477, 0.12491444],
       [0.57501683, 0.98051647, 0.69296281],
       [0.85902206, 0.81095616, 0.1093229 ]])

In [122]:
df.index    #> a MultiIndex

MultiIndex([('Fri',  'Lunch'),
            ('Fri', 'Dinner'),
            ('Sat',  'Lunch'),
            ('Sat', 'Dinner'),
            ('Sun',  'Lunch'),
            ('Sun', 'Dinner')],
           )

In [124]:
df.index.nlevels    # number of levels in the row index

2

In [126]:
df.loc['Fri']   #> indexing with a first-level label

Unnamed: 0,a,b,c
Lunch,0.763056,0.782051,0.940255
Dinner,0.154356,0.141106,0.138781


In [128]:
df.loc['Fri' : 'Sat']   #> slicing with first-level labels

Unnamed: 0,Unnamed: 1,a,b,c
Fri,Lunch,0.763056,0.782051,0.940255
Fri,Dinner,0.154356,0.141106,0.138781
Sat,Lunch,0.905081,0.355048,0.852519
Sat,Dinner,0.999221,0.867615,0.124914


In [132]:
# df.loc['Lunch']     #> raises KeyError - a second-level label alone cannot be used for indexing.

In [134]:
df  # the original frame, shown again before swapping its index levels

Unnamed: 0,Unnamed: 1,a,b,c
Fri,Lunch,0.763056,0.782051,0.940255
Fri,Dinner,0.154356,0.141106,0.138781
Sat,Lunch,0.905081,0.355048,0.852519
Sat,Dinner,0.999221,0.867615,0.124914
Sun,Lunch,0.575017,0.980516,0.692963
Sun,Dinner,0.859022,0.810956,0.109323


In [136]:
# Swap the two row-index levels so the meal label becomes the first level.
df_swap = df.swaplevel()
df_swap

Unnamed: 0,Unnamed: 1,a,b,c
Lunch,Fri,0.763056,0.782051,0.940255
Dinner,Fri,0.154356,0.141106,0.138781
Lunch,Sat,0.905081,0.355048,0.852519
Dinner,Sat,0.999221,0.867615,0.124914
Lunch,Sun,0.575017,0.980516,0.692963
Dinner,Sun,0.859022,0.810956,0.109323


In [137]:
df_swap.loc['Lunch']    # 'Lunch' is now a first-level label, so it can be used as a key

Unnamed: 0,a,b,c
Fri,0.763056,0.782051,0.940255
Sat,0.905081,0.355048,0.852519
Sun,0.575017,0.980516,0.692963


# DataFrame 컬럼 <--> Row 레이블

* `pd.DataFrame.set_index()` : 데이터프레임의 컬럼(들)을 인덱스(row 레이블)로 변환한 데이터프레임을 리턴.
* `pd.DataFrame.reset_index()` : 데이터프레임의 인덱스(들)을 컬럼으로 변환한 데이터프레임을 리턴.

## `reset_index`

In [139]:
df  # the frame with its two-level row index, before any reset

Unnamed: 0,Unnamed: 1,a,b,c
Fri,Lunch,0.763056,0.782051,0.940255
Fri,Dinner,0.154356,0.141106,0.138781
Sat,Lunch,0.905081,0.355048,0.852519
Sat,Dinner,0.999221,0.867615,0.124914
Sun,Lunch,0.575017,0.980516,0.692963
Sun,Dinner,0.859022,0.810956,0.109323


In [141]:
df.reset_index()    #> level=None is the default: every index level is converted to a column.

Unnamed: 0,level_0,level_1,a,b,c
0,Fri,Lunch,0.763056,0.782051,0.940255
1,Fri,Dinner,0.154356,0.141106,0.138781
2,Sat,Lunch,0.905081,0.355048,0.852519
3,Sat,Dinner,0.999221,0.867615,0.124914
4,Sun,Lunch,0.575017,0.980516,0.692963
5,Sun,Dinner,0.859022,0.810956,0.109323


In [143]:
df.reset_index(names=['day', 'time'])   #> set the column names used for the converted index levels.

Unnamed: 0,day,time,a,b,c
0,Fri,Lunch,0.763056,0.782051,0.940255
1,Fri,Dinner,0.154356,0.141106,0.138781
2,Sat,Lunch,0.905081,0.355048,0.852519
3,Sat,Dinner,0.999221,0.867615,0.124914
4,Sun,Lunch,0.575017,0.980516,0.692963
5,Sun,Dinner,0.859022,0.810956,0.109323


In [145]:
df.reset_index(level=0)     # convert only index level 0 to a column.

Unnamed: 0,level_0,a,b,c
Lunch,Fri,0.763056,0.782051,0.940255
Dinner,Fri,0.154356,0.141106,0.138781
Lunch,Sat,0.905081,0.355048,0.852519
Dinner,Sat,0.999221,0.867615,0.124914
Lunch,Sun,0.575017,0.980516,0.692963
Dinner,Sun,0.859022,0.810956,0.109323


In [147]:
df.reset_index(level=1)     # convert only index level 1 to a column.

Unnamed: 0,level_1,a,b,c
Fri,Lunch,0.763056,0.782051,0.940255
Fri,Dinner,0.154356,0.141106,0.138781
Sat,Lunch,0.905081,0.355048,0.852519
Sat,Dinner,0.999221,0.867615,0.124914
Sun,Lunch,0.575017,0.980516,0.692963
Sun,Dinner,0.859022,0.810956,0.109323


In [149]:
# A list of levels may be given; here both levels are converted to columns.
df.reset_index(level=[1, 0])

Unnamed: 0,level_0,level_1,a,b,c
0,Fri,Lunch,0.763056,0.782051,0.940255
1,Fri,Dinner,0.154356,0.141106,0.138781
2,Sat,Lunch,0.905081,0.355048,0.852519
3,Sat,Dinner,0.999221,0.867615,0.124914
4,Sun,Lunch,0.575017,0.980516,0.692963
5,Sun,Dinner,0.859022,0.810956,0.109323


## `set_index`

In [152]:
# Toy exam table: two classes of five students each, with random scores per subject.
exam = pd.DataFrame({
    "class": [1] * 5 + [2] * 5,
    "id": np.arange(1, 11),
    "math": np.random.randint(101, size=10),
    "science": np.random.randint(101, size=10),
})
exam

Unnamed: 0,class,id,math,science
0,1,1,66,26
1,1,2,96,65
2,1,3,38,87
3,1,4,86,12
4,1,5,82,14
5,2,6,28,31
6,2,7,26,33
7,2,8,85,72
8,2,9,99,13
9,2,10,4,89


In [155]:
# Show only the rows where class == 1, using a boolean mask.
exam.loc[exam['class'] == 1]

Unnamed: 0,class,id,math,science
0,1,1,66,26
1,1,2,96,65
2,1,3,38,87
3,1,4,86,12
4,1,5,82,14


In [160]:
# Use the 'class' column as the row index.
exam_class = exam.set_index('class')
exam_class

Unnamed: 0_level_0,id,math,science
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,66,26
1,2,96,65
1,3,38,87
1,4,86,12
1,5,82,14
2,6,28,31
2,7,26,33
2,8,85,72
2,9,99,13
2,10,4,89


In [162]:
exam_class.loc[1]   # rows whose 'class' index label is 1

Unnamed: 0_level_0,id,math,science
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,66,26
1,2,96,65
1,3,38,87
1,4,86,12
1,5,82,14


In [165]:
# Use both 'class' and 'id' as a two-level row index.
exam_class_id = exam.set_index(['class', 'id'])
exam_class_id

Unnamed: 0_level_0,Unnamed: 1_level_0,math,science
class,id,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,66,26
1,2,96,65
1,3,38,87
1,4,86,12
1,5,82,14
2,6,28,31
2,7,26,33
2,8,85,72
2,9,99,13
2,10,4,89


In [168]:
exam_class_id.reset_index(level='class')
#> When the index levels have names, reset_index's level parameter accepts a name (or a list of names).

Unnamed: 0_level_0,class,math,science
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,66,26
2,1,96,65
3,1,38,87
4,1,86,12
5,1,82,14
6,2,28,31
7,2,26,33
8,2,85,72
9,2,99,13
10,2,4,89


`reset_index` 활용 예: `groupby` 집계 결과의 인덱스를 컬럼으로 변환.

In [170]:
exam    # the original exam table, with its default integer index

Unnamed: 0,class,id,math,science
0,1,1,66,26
1,1,2,96,65
2,1,3,38,87
3,1,4,86,12
4,1,5,82,14
5,2,6,28,31
6,2,7,26,33
7,2,8,85,72
8,2,9,99,13
9,2,10,4,89


In [178]:
# Per-class mean of each subject in the exam DataFrame.
exam_by_class = (
    exam
    .groupby(by=["class"])[["math", "science"]]
    .mean()
)
exam_by_class

Unnamed: 0_level_0,math,science
class,Unnamed: 1_level_1,Unnamed: 2_level_1
1,73.6,40.8
2,48.4,47.6


In [179]:
exam_by_class.reset_index()     # turn the 'class' group key back into a regular column

Unnamed: 0,class,math,science
0,1,73.6,40.8
1,2,48.4,47.6
