## 7. 合并与连接 merge & join

In [8]:
import pandas as pd
# Student roster: one row per student, keyed by student_id.
df_stu = pd.DataFrame(
    dict(
        student_id=[1, 2, 3, 4, 5],
        name=["Alice", "Bob", "Charlie", "David", "Eva"],
        age=[20, 21, 19, 22, 20],
    )
)
# Score sheet: ids 1, 3 and 4 also appear in the roster; 6 and 8 do not.
df_score = pd.DataFrame(
    dict(
        student_id=[1, 3, 4, 6, 8],
        math_score=[85, 90, 78, 88, 92],
        project_score=[88, 92, 80, 85, 95],
    )
)
# Print both frames as plain text, separated by a single blank line.
print(df_stu, df_score, sep="\n\n")

   student_id     name  age
0           1    Alice   20
1           2      Bob   21
2           3  Charlie   19
3           4    David   22
4           5      Eva   20

   student_id  math_score  project_score
0           1          85             88
1           3          90             92
2           4          78             80
3           6          88             85
4           8          92             95


#### 1. Merge 合并
方法参数：
1. 前2个参数表示要合并的 DataFrame
2. how：合并方式，可选值有 inner、outer、left、right、cross，默认为 inner
3. on：指定用于合并的列名，必须在两个 DataFrame 中都存在
4. left_on 和 right_on：分别指定左侧和右侧 DataFrame 用于合并的列名，适用于两个df的列名不相同的情况
5. suffixes：用于指定合并后重名列的后缀，默认值为 ('_x', '_y') ，表示左侧 DataFrame 的列名后缀为 _x，右侧 DataFrame 的列名后缀为 _y


##### 1.1 内连接，只保留两个 DataFrame 中都有的键值

In [9]:
# Inner join on student_id: keep only the ids present in both frames.
df_inner = df_stu.merge(df_score, on="student_id", how="inner")
df_inner

Unnamed: 0,student_id,name,age,math_score,project_score
0,1,Alice,20,85,88
1,3,Charlie,19,90,92
2,4,David,22,78,80


##### 1.2 外连接，保留两个 DataFrame 中所有的键值

In [10]:
# Outer join on student_id: keep every id from either frame; the side that
# has no matching row is filled with NaN.
df_outer = df_stu.merge(df_score, on="student_id", how="outer")
df_outer

Unnamed: 0,student_id,name,age,math_score,project_score
0,1,Alice,20.0,85.0,88.0
1,2,Bob,21.0,,
2,3,Charlie,19.0,90.0,92.0
3,4,David,22.0,78.0,80.0
4,5,Eva,20.0,,
5,6,,,88.0,85.0
6,8,,,92.0,95.0


##### 1.3 左/右连接，保留左/右侧 DataFrame 中所有的键值

In [11]:
# Left join on student_id: keep every row of df_stu; score columns are NaN
# for students that have no entry in df_score.
df_left = df_stu.merge(df_score, on="student_id", how="left")
df_left

Unnamed: 0,student_id,name,age,math_score,project_score
0,1,Alice,20,85.0,88.0
1,2,Bob,21,,
2,3,Charlie,19,90.0,92.0
3,4,David,22,78.0,80.0
4,5,Eva,20,,


In [13]:
# Right join on student_id: keep every row of df_score; name/age are NaN
# for ids that have no entry in df_stu.
df_right = df_stu.merge(df_score, on="student_id", how="right")
df_right

Unnamed: 0,student_id,name,age,math_score,project_score
0,1,Alice,20.0,85,88
1,3,Charlie,19.0,90,92
2,4,David,22.0,78,80
3,6,,,88,85
4,8,,,92,95


##### 1.4 left_on 和 right_on 参数，用于列名不相同的情况

In [14]:
# Two small frames whose join keys live under different column names.
df_1 = pd.DataFrame(dict(key1=[1, 2, 3], value1=["A", "B", "C"]))
df_2 = pd.DataFrame(dict(key2=[2, 3, 4], value2=["D", "E", "F"]))
# Outer join matching key1 (left) against key2 (right); both key columns
# are kept in the result.
df_1.merge(
    df_2,
    left_on="key1",
    right_on="key2",
    how="outer",
    suffixes=("_key1", "_key2"),
)

Unnamed: 0,key1,value1,key2,value2
0,1.0,A,,
1,2.0,B,2.0,D
2,3.0,C,3.0,E
3,,,4.0,F


##### 1.5 left_index 和 right_index 参数，用于基于索引合并

In [16]:
# Index both frames by student_id while keeping the column itself
# (drop=False), so each side still carries a student_id column and the
# merge below has to disambiguate them with suffixes.
df_stu_indexed = df_stu.set_index("student_id", drop=False)
df_score_indexed = df_score.set_index("student_id", drop=False)
# Outer join aligned on the two indexes rather than on a column.
df_stu_indexed.merge(
    df_score_indexed,
    left_index=True,
    right_index=True,
    how="outer",
    suffixes=("_stu", "_score"),
)

Unnamed: 0_level_0,student_id_stu,name,age,student_id_score,math_score,project_score
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1.0,Alice,20.0,1.0,85.0,88.0
2,2.0,Bob,21.0,,,
3,3.0,Charlie,19.0,3.0,90.0,92.0
4,4.0,David,22.0,4.0,78.0,80.0
5,5.0,Eva,20.0,,,
6,,,,6.0,88.0,85.0
8,,,,8.0,92.0,95.0


#### 2. Join 连接
Join 类似于 Merge 的简化版本，默认基于索引进行连接，方法调用者为左侧 DataFrame，传入的参数为右侧 DataFrame。

In [18]:
# Index-based outer join: the caller is the left frame, the argument is the
# right frame; lsuffix/rsuffix rename the student_id column both sides share.
df_stu_indexed.join(
    df_score_indexed,
    how="outer",
    lsuffix="_stu",
    rsuffix="_score",
)

Unnamed: 0_level_0,student_id_stu,name,age,student_id_score,math_score,project_score
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1.0,Alice,20.0,1.0,85.0,88.0
2,2.0,Bob,21.0,,,
3,3.0,Charlie,19.0,3.0,90.0,92.0
4,4.0,David,22.0,4.0,78.0,80.0
5,5.0,Eva,20.0,,,
6,,,,6.0,88.0,85.0
8,,,,8.0,92.0,95.0


##### 等价于

In [19]:
# The same index-aligned outer join as the join() call, spelled with merge.
df_stu_indexed.merge(
    df_score_indexed,
    left_index=True,
    right_index=True,
    how="outer",
    suffixes=("_stu", "_score"),
)

Unnamed: 0_level_0,student_id_stu,name,age,student_id_score,math_score,project_score
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1.0,Alice,20.0,1.0,85.0,88.0
2,2.0,Bob,21.0,,,
3,3.0,Charlie,19.0,3.0,90.0,92.0
4,4.0,David,22.0,4.0,78.0,80.0
5,5.0,Eva,20.0,,,
6,,,,6.0,88.0,85.0
8,,,,8.0,92.0,95.0
