# 데이터 병합과 정렬

In [1]:
import numpy as pnp
import pandas as pd

In [43]:
# Build the sample data used by the merge / sort examples.

# Student roster: ids "1".."10" (as strings) paired with a name.
students = [
    {"id": str(number), "name": name}
    for number, name in enumerate(
        ["John", "Kim", "Jung", "Park", "Hwang",
         "Chu", "Shul", "Lee", "Bok", "Yun"],
        start=1,
    )
]

# Math scores, keyed by "id". Scores are kept as strings, and the rows are
# deliberately not in the same order as the roster.
math_datas = [
    {"id": student_id, "math": score}
    for student_id, score in [
        ("1", "70"), ("10", "100"), ("2", "80"), ("3", "70"), ("4", "80"),
        ("5", "40"), ("6", "60"), ("7", "70"), ("8", "80"), ("9", "90"),
    ]
]

# Science scores. The key column is named "no" here instead of "id".
science_datas = [
    {"no": student_no, "science": score}
    for student_no, score in [
        ("10", "100"), ("1", "70"), ("2", "80"), ("6", "60"), ("7", "70"),
        ("3", "70"), ("8", "80"), ("9", "90"), ("4", "80"), ("5", "40"),
    ]
]

# Academic-warning log: one row per warning, three rows each for
# students "3", "6" and "9".
warning_datas = [
    {"id": student_id, "count": 1}
    for student_id in ("3", "6", "9")
    for _ in range(3)
]

# One DataFrame per record list.
df_student, df_math, df_science, df_warning = (
    pd.DataFrame(records)
    for records in (students, math_datas, science_datas, warning_datas)
)

In [44]:
# Attach the math scores to the roster. Both frames hold one row per
# student and share only the "id" column, which serves as the join key.
student_score = pd.merge(df_student, df_math, on="id")
student_score

Unnamed: 0,id,name,math
0,1,John,70
1,2,Kim,80
2,3,Jung,70
3,4,Park,80
4,5,Hwang,40
5,6,Chu,60
6,7,Shul,70
7,8,Lee,80
8,9,Bok,90
9,10,Yun,100


In [45]:
# Name the join keys explicitly: the science frame calls its key "no"
# while the score table calls it "id". The redundant "no" column is
# dropped once the rows have been matched.
student_score = student_score.merge(
    df_science, left_on="id", right_on="no"
).drop(columns="no")
student_score

Unnamed: 0,id,name,math,science
0,1,John,70,70
1,2,Kim,80,80
2,3,Jung,70,70
3,4,Park,80,80
4,5,Hwang,40,40
5,6,Chu,60,60
6,7,Shul,70,70
7,8,Lee,80,80
8,9,Bok,90,90
9,10,Yun,100,100


In [46]:
# Asymmetric data: df_warning covers only some students and holds several
# rows per id. The default (inner) merge keeps only ids present in both
# frames and emits one row per matching warning record, so each matched
# student shows up repeated in the result.
student_score2 = student_score.merge(df_warning, on="id")
student_score2

Unnamed: 0,id,name,math,science,count
0,3,Jung,70,70,1
1,3,Jung,70,70,1
2,3,Jung,70,70,1
3,6,Chu,60,60,1
4,6,Chu,60,60,1
5,6,Chu,60,60,1
6,9,Bok,90,90,1
7,9,Bok,90,90,1
8,9,Bok,90,90,1


In [52]:
# Total the academic warnings per student: pick the "count" column of each
# "id" group, sum it, then turn the "id" index back into a regular column.
warn_cnt = df_warning.groupby("id")["count"].sum().reset_index()
warn_cnt

Unnamed: 0,id,count
0,3,3
1,6,3
2,9,3


In [55]:
# Same per-student warning total, this time through agg("sum").
# as_index=False keeps "id" as a regular column in the result.
warn_cnt = df_warning.groupby("id", as_index=False).agg("sum")
warn_cnt

Unnamed: 0,id,count
0,3,3
1,6,3
2,9,3


In [65]:
# Outer-merge the per-student warning totals onto the score table.
# Students without any warning record have no match in warn_cnt, so their
# "count" is NaN after the merge (which is also why the column is float).
result_outer = pd.merge(student_score, warn_cnt, how="outer")

# Replace the missing counts with 0 and assign the column back.
# Calling fillna(..., inplace=True) on result_outer['count'] operates on a
# selected column (chained assignment): pandas 2.x warns about it and, with
# Copy-on-Write enabled, the parent DataFrame is left unchanged.
result_outer["count"] = result_outer["count"].fillna(0)
result_outer

Unnamed: 0,id,name,math,science,count
0,1,John,70,70,0.0
1,2,Kim,80,80,0.0
2,3,Jung,70,70,3.0
3,4,Park,80,80,0.0
4,5,Hwang,40,40,0.0
5,6,Chu,60,60,3.0
6,7,Shul,70,70,0.0
7,8,Lee,80,80,0.0
8,9,Bok,90,90,3.0
9,10,Yun,100,100,0.0


In [73]:
# Convert the "count" column of result_outer to an integer dtype.
# The dtypes lookup below is not the last expression of the cell, so its
# value is simply discarded.
result_outer.dtypes
result_outer["count"] = result_outer["count"].astype(int)
result_outer

Unnamed: 0,id,name,math,science,count
0,1,John,70,70,0
1,2,Kim,80,80,0
2,3,Jung,70,70,3
3,4,Park,80,80,0
4,5,Hwang,40,40,0
5,6,Chu,60,60,3
6,7,Shul,70,70,0
7,8,Lee,80,80,0
8,9,Bok,90,90,3
9,10,Yun,100,100,0


In [66]:
# Inner merge: only students that also appear in warn_cnt are kept.
result_inner = student_score.merge(warn_cnt, on="id", how="inner")
result_inner

Unnamed: 0,id,name,math,science,count
0,3,Jung,70,70,3
1,6,Chu,60,60,3
2,9,Bok,90,90,3


### 데이터 정렬
- sort_values() 함수 사용

In [79]:
# Sort by warning count (largest first), keep the first three rows and show
# only the id / name columns.
(
    result_outer
    .sort_values(by="count", ascending=False)
    .head(3)
    .loc[:, ["id", "name"]]
)

Unnamed: 0,id,name
2,3,Jung
5,6,Chu
8,9,Bok
