In [63]:
import pandas as pd

---
## **11. Merging Functions**

#### **Merging** is like SQL joins – you combine rows from two dataframes based on a common column.
```python
pd.merge(
    left,                      # Left DataFrame
    right,                     # Right DataFrame
    how="inner",               # Type of join: 'left', 'right', 'outer', 'inner', 'cross'
    on=None,                   # Column(s) to join on (must be common in both)
    left_on=None,              # Column(s) from left DataFrame to join on
    right_on=None,             # Column(s) from right DataFrame to join on
    left_index=False,          # Use index from left DataFrame
    right_index=False,         # Use index from right DataFrame
    sort=False,                # Sort the result by join keys
    suffixes=("_x", "_y"),     # Suffix to apply to overlapping column names
    copy=True,                 # Always copy data (default True)
    indicator=False,           # Add column to indicate source of row (True / str)
    validate=None              # Check if merge is one-to-one, one-to-many etc.
)

```


In [64]:
# Employee table: the left-hand side of every merge example below.
df1 = pd.read_csv(filepath_or_buffer="datasets/employee.csv")
df1

Unnamed: 0,EmployeeID,Name,Age,DepartmentID,Salary
0,1001,Employee_1,50,201,71576
1,1002,Employee_2,36,203,64353
2,1003,Employee_3,29,201,42675
3,1004,Employee_4,42,203,77733
4,1005,Employee_5,40,202,69417
...,...,...,...,...,...
95,1096,Employee_96,58,204,54855
96,1097,Employee_97,56,201,32158
97,1098,Employee_98,43,204,68016
98,1099,Employee_99,48,201,32400


In [65]:
# Department lookup table: the right-hand side of the join examples.
df2 = pd.read_csv(filepath_or_buffer="datasets/departments1.csv")
df2

Unnamed: 0,DepartmentID,DepartmentName,Location
0,201,Department_1,Pune
1,202,Department_2,Mumbai
2,203,Department_3,Chennai
3,205,Department_4,Mumbai
4,206,Department_5,Mumbai
5,207,Department_6,Bangalore
6,208,Department_7,Chennai
7,209,Department_8,Chennai
8,210,Department_9,Chennai
9,211,Department_10,Mumbai


### **1. Different Types of Merges (Joins)**
#### **a) Inner Join – only matching rows**

In [29]:
# Inner join: keep only rows whose DepartmentID exists in both frames.
pd.merge(left=df1, right=df2, how="inner", on="DepartmentID")

Unnamed: 0,EmployeeID,Name,Age,DepartmentID,Salary,DepartmentName,Location
0,1001,Employee_1,50,201,71576,Department_1,Pune
1,1002,Employee_2,36,203,64353,Department_3,Chennai
2,1003,Employee_3,29,201,42675,Department_1,Pune
3,1004,Employee_4,42,203,77733,Department_3,Chennai
4,1005,Employee_5,40,202,69417,Department_2,Mumbai
...,...,...,...,...,...,...,...
72,1092,Employee_92,55,203,63191,Department_3,Chennai
73,1093,Employee_93,54,202,72357,Department_2,Mumbai
74,1097,Employee_97,56,201,32158,Department_1,Pune
75,1099,Employee_99,48,201,32400,Department_1,Pune


#### **b) Left Join – all rows from left, matching from right**

In [30]:
# Left join: every row of df1 is kept; department columns are NaN when
# df2 has no row for that DepartmentID.
pd.merge(left=df1, right=df2, how="left", on="DepartmentID")

Unnamed: 0,EmployeeID,Name,Age,DepartmentID,Salary,DepartmentName,Location
0,1001,Employee_1,50,201,71576,Department_1,Pune
1,1002,Employee_2,36,203,64353,Department_3,Chennai
2,1003,Employee_3,29,201,42675,Department_1,Pune
3,1004,Employee_4,42,203,77733,Department_3,Chennai
4,1005,Employee_5,40,202,69417,Department_2,Mumbai
...,...,...,...,...,...,...,...
95,1096,Employee_96,58,204,54855,,
96,1097,Employee_97,56,201,32158,Department_1,Pune
97,1098,Employee_98,43,204,68016,,
98,1099,Employee_99,48,201,32400,Department_1,Pune


#### **c) Right Join – all rows from right, matching from left**

In [31]:
# Right join: every row of df2 is kept; employee columns are NaN for
# departments that no row of df1 refers to.
pd.merge(left=df1, right=df2, how="right", on="DepartmentID")

Unnamed: 0,EmployeeID,Name,Age,DepartmentID,Salary,DepartmentName,Location
0,1001.0,Employee_1,50.0,201,71576.0,Department_1,Pune
1,1003.0,Employee_3,29.0,201,42675.0,Department_1,Pune
2,1007.0,Employee_7,32.0,201,48664.0,Department_1,Pune
3,1008.0,Employee_8,32.0,201,26636.0,Department_1,Pune
4,1015.0,Employee_15,45.0,201,72323.0,Department_1,Pune
...,...,...,...,...,...,...,...
79,,,,207,,Department_6,Bangalore
80,,,,208,,Department_7,Chennai
81,,,,209,,Department_8,Chennai
82,,,,210,,Department_9,Chennai


#### **d) Outer Join – all rows from both**

In [32]:
# Outer join: union of both frames; unmatched rows from either side are
# kept and padded with NaN.
pd.merge(left=df1, right=df2, how="outer", on="DepartmentID")

Unnamed: 0,EmployeeID,Name,Age,DepartmentID,Salary,DepartmentName,Location
0,1001.0,Employee_1,50.0,201,71576.0,Department_1,Pune
1,1003.0,Employee_3,29.0,201,42675.0,Department_1,Pune
2,1007.0,Employee_7,32.0,201,48664.0,Department_1,Pune
3,1008.0,Employee_8,32.0,201,26636.0,Department_1,Pune
4,1015.0,Employee_15,45.0,201,72323.0,Department_1,Pune
...,...,...,...,...,...,...,...
102,,,,207,,Department_6,Bangalore
103,,,,208,,Department_7,Chennai
104,,,,209,,Department_8,Chennai
105,,,,210,,Department_9,Chennai


In [33]:
# Rebind df2 to the second departments file; the next merge joins it to
# df1 through its "Dep_ID" column.
df2 = pd.read_csv(filepath_or_buffer="datasets/departments2.csv")

### **2. Merge on Different Column Names**

In [34]:
# The key is named differently on each side, so name each column
# explicitly. Both key columns are retained in the result.
pd.merge(
    left=df1,
    right=df2,
    left_on="DepartmentID",
    right_on="Dep_ID",
)

Unnamed: 0,EmployeeID,Name,Age,DepartmentID,Salary,Dep_ID,DepartmentName,Location
0,1001,Employee_1,50,201,71576,201,Department_1,Pune
1,1002,Employee_2,36,203,64353,203,Department_3,Chennai
2,1003,Employee_3,29,201,42675,201,Department_1,Pune
3,1004,Employee_4,42,203,77733,203,Department_3,Chennai
4,1005,Employee_5,40,202,69417,202,Department_2,Mumbai
...,...,...,...,...,...,...,...,...
72,1092,Employee_92,55,203,63191,203,Department_3,Chennai
73,1093,Employee_93,54,202,72357,202,Department_2,Mumbai
74,1097,Employee_97,56,201,32158,201,Department_1,Pune
75,1099,Employee_99,48,201,32400,201,Department_1,Pune


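### **3. Merge with Multiple Keys**

Pass a list of column names to `on`; rows are matched only when **every** listed key is equal. The example below uses a one-element list because the employee and department data have only one key column in common.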

In [37]:
# Reload the first departments file under its own name: df2 was rebound to
# departments2.csv earlier, and that table exposes its key as "Dep_ID", so
# joining it on "DepartmentID" would raise a KeyError on a top-to-bottom run.
departments = pd.read_csv("datasets/departments1.csv")

# `on` accepts a list of key columns; rows must match on every key listed.
pd.merge(df1, departments, on=["DepartmentID"])

Unnamed: 0,EmployeeID,Name,Age,DepartmentID,Salary,DepartmentName,Location
0,1001,Employee_1,50,201,71576,Department_1,Pune
1,1002,Employee_2,36,203,64353,Department_3,Chennai
2,1003,Employee_3,29,201,42675,Department_1,Pune
3,1004,Employee_4,42,203,77733,Department_3,Chennai
4,1005,Employee_5,40,202,69417,Department_2,Mumbai
...,...,...,...,...,...,...,...
72,1092,Employee_92,55,203,63191,Department_3,Chennai
73,1093,Employee_93,54,202,72357,Department_2,Mumbai
74,1097,Employee_97,56,201,32158,Department_1,Pune
75,1099,Employee_99,48,201,32400,Department_1,Pune


---
## **12. Concat Functions**
#### **Concat**: stacks or combines DataFrames either vertically (rows) or horizontally (columns).
```python
   pd.concat(objs, axis=0, join='outer', ignore_index=False)
   
   # objs: A list or tuple of DataFrames
   # axis=0: Stack vertically (default)
   # axis=1: Stack horizontally
   # join: 'outer' keeps all data (default), 'inner' keeps common columns/rows
   # ignore_index: If True, reset index in the result
```

In [44]:
# First frame for the concat examples (rebinds df1 from the merge section).
df1 = pd.read_csv(filepath_or_buffer="datasets/concat_df1.csv")

In [49]:
# Second frame for the concat examples (rebinds df2 from the merge section).
df2 = pd.read_csv(filepath_or_buffer="datasets/concat_df2.csv")

In [51]:
# Third frame: used for the horizontal and mismatched-column examples.
df3 = pd.read_csv(filepath_or_buffer="datasets/concat_df3.csv")
df3

Unnamed: 0,Salary
0,44560
1,29775
2,35137
3,38838
4,24365
5,23586
6,34013
7,46661
8,47329
9,35926


### **1. Vertical Concatenation (Default)**

In [46]:
# Default axis=0: rows of df2 are appended below df1, and each frame
# keeps its original index labels.
pd.concat(objs=[df1, df2])

Unnamed: 0,ID,Name,Age,Salary
0,1,Name_1,28.0,
1,2,Name_2,33.0,
2,3,Name_3,27.0,
3,4,Name_4,35.0,
4,5,Name_5,38.0,
5,6,Name_6,32.0,
6,7,Name_7,27.0,
7,8,Name_8,21.0,
8,9,Name_9,21.0,
9,10,Name_10,24.0,


### **2. Vertical + Reset Index**

In [47]:
# ignore_index=True discards the input indexes and renumbers the result
# 0..n-1.
pd.concat(objs=[df1, df2], ignore_index=True)

Unnamed: 0,ID,Name,Age,Salary
0,1,Name_1,28.0,
1,2,Name_2,33.0,
2,3,Name_3,27.0,
3,4,Name_4,35.0,
4,5,Name_5,38.0,
5,6,Name_6,32.0,
6,7,Name_7,27.0,
7,8,Name_8,21.0,
8,9,Name_9,21.0,
9,10,Name_10,24.0,


### **3. Horizontal Concatenation (Add Columns)**

In [52]:
# axis=1 places the frames side by side, aligning rows on the index.
pd.concat(objs=[df1, df3], axis=1)

Unnamed: 0,ID,Name,Age,Salary
0,1,Name_1,28,44560
1,2,Name_2,33,29775
2,3,Name_3,27,35137
3,4,Name_4,35,38838
4,5,Name_5,38,24365
5,6,Name_6,32,23586
6,7,Name_7,27,34013
7,8,Name_8,21,46661
8,9,Name_9,21,47329
9,10,Name_10,24,35926


### **4. Outer Join (Default)**

In [54]:
# join="outer" (the default, spelled out here) keeps the union of columns;
# a column missing from one frame is filled with NaN.
pd.concat(objs=[df1, df2], join="outer", ignore_index=True)

Unnamed: 0,ID,Name,Age,Salary
0,1,Name_1,28.0,
1,2,Name_2,33.0,
2,3,Name_3,27.0,
3,4,Name_4,35.0,
4,5,Name_5,38.0,
5,6,Name_6,32.0,
6,7,Name_7,27.0,
7,8,Name_8,21.0,
8,9,Name_9,21.0,
9,10,Name_10,24.0,


### **5. Inner Join (Only common columns)**

In [56]:
# join="inner" keeps only the columns present in every input frame.
pd.concat(objs=[df1, df2], join="inner", ignore_index=True)

Unnamed: 0,ID,Name
0,1,Name_1
1,2,Name_2
2,3,Name_3
3,4,Name_4
4,5,Name_5
5,6,Name_6
6,7,Name_7
7,8,Name_8
8,9,Name_9
9,10,Name_10


### **6. Concatenation with Mismatched Columns**

In [59]:
# The frames do not share the same columns; with the default outer join,
# every column is kept and the gaps are filled with NaN.
pd.concat(objs=[df1, df3], ignore_index=True)

Unnamed: 0,ID,Name,Age,Salary
0,1.0,Name_1,28.0,
1,2.0,Name_2,33.0,
2,3.0,Name_3,27.0,
3,4.0,Name_4,35.0,
4,5.0,Name_5,38.0,
5,6.0,Name_6,32.0,
6,7.0,Name_7,27.0,
7,8.0,Name_8,21.0,
8,9.0,Name_9,21.0,
9,10.0,Name_10,24.0,


### **7. With Keys (Multilevel Index)**

In [61]:
# keys labels each input frame, producing a two-level index whose outer
# level records which frame every row came from.
pd.concat(objs=[df1, df2], keys=["first", "second"])


Unnamed: 0,Unnamed: 1,ID,Name,Age,Salary
first,0,1,Name_1,28.0,
first,1,2,Name_2,33.0,
first,2,3,Name_3,27.0,
first,3,4,Name_4,35.0,
first,4,5,Name_5,38.0,
first,5,6,Name_6,32.0,
first,6,7,Name_7,27.0,
first,7,8,Name_8,21.0,
first,8,9,Name_9,21.0,
first,9,10,Name_10,24.0,
