# 🧩 Notebook 04: Merging, Joining & Concatenation

In this notebook, we explore how to:
- Merge datasets column-wise using `pd.merge()` and `safe_merge()`
- Concatenate datasets row-wise using `pd.concat()` and `safe_concat()`
- Handle key mismatches, suffixes, duplicates
- Apply post-merge operations like `.cumsum()` and `.rolling()`

In [1]:
# Notebook import setup
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

from scripts import utils_io

In [2]:
from pathlib import Path
from scripts import utils_io, agg_utils

ASSETS_DIR = Path("../assets")

# Load individual cleaned regional loan files
loan_east = utils_io.load_csv(ASSETS_DIR / "loan_final_east.csv")
loan_west = utils_io.load_csv(ASSETS_DIR / "loan_final_west.csv")
loan_south = utils_io.load_csv(ASSETS_DIR / "loan_final_south.csv")
loan_north = utils_io.load_csv(ASSETS_DIR / "loan_final_north.csv")

# Concatenate using safe_concat
all_loans = agg_utils.safe_concat(
    [loan_east, loan_west, loan_south, loan_north],
    axis=0,
    ignore_index=True,
    check_columns=True
)

# Save for reuse
utils_io.save_csv(all_loans, ASSETS_DIR / "loan_final_all_regions.csv")


✅ Concatenated 4 DataFrames — shape: (40000, 8)


In [3]:
# Imports & Dataset Loading (Code Cell)
import pandas as pd
import numpy as np

# Load datasets
superstore = utils_io.load_csv("../assets/superstore_final.csv")
loan = utils_io.load_csv("../assets/loan_final_all_regions.csv")
covid = utils_io.load_csv("../assets/covid_final.csv")
weather = utils_io.load_csv("../assets/weather_final.csv")

# Convert date columns
superstore["order_date"] = pd.to_datetime(superstore["order_date"])
covid["date"] = pd.to_datetime(covid["date"])
weather["date"] = pd.to_datetime(weather["date"])

## 🧾 Dataset Preview

Let’s preview column names and head of each dataset.

In [4]:
for name, df in {
    "Superstore": superstore,
    "Loan": loan,
    "COVID": covid,
    "Weather": weather
}.items():
    print(f"\n📊 {name} — shape: {df.shape}")
    print(df.columns.tolist())
    display(df.head(2))


📊 Superstore — shape: (10000, 14)
['order_id', 'customer_id', 'customer_name', 'segment', 'region', 'order_date', 'ship_date', 'category', 'sub_category', 'product_name', 'sales', 'quantity', 'discount', 'profit']


Unnamed: 0,order_id,customer_id,customer_name,segment,region,order_date,ship_date,category,sub_category,product_name,sales,quantity,discount,profit
0,ord-10000,cust-9476,mr. michael lopez,home office,central,2020-01-01,2020-01-03,furniture,bookcases,bookcases model 1,1292.63,5,0.3,22.94
1,ord-10001,cust-9162,robert liu,home office,south,2020-01-02,2020-01-04,furniture,bookcases,bookcases model 2,1947.16,2,0.3,205.52



📊 Loan — shape: (40000, 8)
['customer_id', 'customer_name', 'age', 'income', 'loan_amount', 'loan_purpose', 'approved', 'region']


Unnamed: 0,customer_id,customer_name,age,income,loan_amount,loan_purpose,approved,region
0,1001,james reid,55,96964,36890,education,yes,east
1,1002,howard mathis,61,59846,56517,education,yes,east



📊 COVID — shape: (10000, 6)
['date', 'country', 'variant', 'new_cases', 'new_deaths', 'hospitalized']


Unnamed: 0,date,country,variant,new_cases,new_deaths,hospitalized
0,2020-01-01,germany,omicron,512,10,613
1,2020-01-02,germany,xbb,524,11,1911



📊 Weather — shape: (10000, 4)
['date', 'temperature_c', 'humidity', 'condition']


Unnamed: 0,date,temperature_c,humidity,condition
0,2022-01-01,28,81,snow
1,2022-01-02,4,90,snow


In [5]:
# Prepare monthly aggregates
superstore_monthly = superstore.copy()
superstore_monthly["order_date"] = superstore_monthly["order_date"].dt.to_period("M").dt.to_timestamp()
superstore_monthly = superstore_monthly.groupby("order_date")[["sales", "profit"]].sum().reset_index()

covid_monthly = covid.copy()
covid_monthly["date"] = covid_monthly["date"].dt.to_period("M").dt.to_timestamp()
covid_monthly = covid_monthly.groupby("date")[["new_cases", "new_deaths"]].sum().reset_index()
covid_monthly = covid_monthly.rename(columns={"date": "order_date"})

## 🔗 Merge Datasets

### A. Joining Monthly Superstore Sales with COVID Trends

We use `safe_merge()` to join two datasets on month-level date.

In [6]:
sales_covid = agg_utils.safe_merge(
    df1=superstore_monthly,
    df2=covid_monthly,
    on="order_date",
    how="inner",
    parse_dates=True
)
sales_covid.head()

✅ Merged on ['order_date'] using 'inner' join — shape: (329, 5)


Unnamed: 0,order_date,sales,profit,new_cases,new_deaths
0,2020-01-01,36627.07,1490.9,15558,313
1,2020-02-01,26120.18,1775.53,14543,277
2,2020-03-01,33591.53,1297.55,15392,339
3,2020-04-01,24267.89,1186.19,15148,290
4,2020-05-01,30944.1,1753.54,15542,328


### Region-wise Merge (Superstore + Loan)

In [7]:
region_sales = superstore[["region", "sales"]].groupby("region").sum(numeric_only=True).reset_index()
region_loans = loan[["region", "loan_amount"]].groupby("region").sum(numeric_only=True).reset_index()

merged_region = agg_utils.safe_merge(region_sales, region_loans, on="region", how="inner")
utils_io.save_csv(merged_region, "../assets/merged_region_sales_loans.csv")

✅ Merged on ['region'] using 'inner' join — shape: (3, 3)


### Date-wise Merge (COVID + Weather)

In [8]:
covid_monthly = covid.copy()
covid_monthly["month"] = covid_monthly["date"].dt.to_period("M").dt.to_timestamp()
weather["month"] = weather["date"].dt.to_period("M").dt.to_timestamp()

covid_summary = covid_monthly.groupby("month")[["new_cases", "new_deaths"]].sum().reset_index()
weather_summary = weather.groupby("month")[["temperature_c", "humidity"]].mean().reset_index()

merged_trends = agg_utils.safe_merge(covid_summary, weather_summary, on="month", how="inner")
utils_io.save_csv(merged_trends, "../assets/agg_merged_weather_covid_monthly.csv")

✅ Merged on ['month'] using 'inner' join — shape: (305, 5)


## 💥 Merge with Suffix Conflict

In [9]:
df1 = superstore[["customer_id", "region"]].drop_duplicates()
df2 = loan[["customer_id", "loan_amount"]].copy()
df2["customer_id"] = df2["customer_id"].astype(str)

merged_conflict = agg_utils.safe_merge(
    df1=df1,
    df2=df2,
    on="customer_id",
    suffixes=("_sales", "_loan"),
    how="inner"
)

✅ Merged on ['customer_id'] using 'inner' join — shape: (0, 3)


## 📈 Post-Merge: Rolling Average and Cumulative Sum

### A. Cumulative and Rolling Window Analysis on Sales Data

In [10]:
sales_covid["cum_sales"] = sales_covid["sales"].cumsum()
sales_covid["rolling_profit"] = sales_covid["profit"].rolling(window=3).mean()
sales_covid[["order_date", "sales", "cum_sales", "profit", "rolling_profit"]].tail()

Unnamed: 0,order_date,sales,cum_sales,profit,rolling_profit
324,2047-01-01,31448.7,9947147.79,1422.22,1690.163333
325,2047-02-01,27719.81,9974867.6,1762.42,1639.266667
326,2047-03-01,35454.26,10010321.86,1495.51,1560.05
327,2047-04-01,29930.82,10040252.68,1679.17,1645.7
328,2047-05-01,17422.03,10057674.71,912.72,1362.466667


### B. Cumulative Monthly Sales

In [11]:
superstore["month"] = superstore["order_date"].dt.to_period("M").dt.to_timestamp()
monthly_sales = superstore.groupby("month")["sales"].sum().reset_index()
monthly_sales["cumulative_sales"] = monthly_sales["sales"].cumsum()

### C. Rolling 3-Month Average

In [12]:
monthly_sales["rolling_avg_3mo"] = monthly_sales["sales"].rolling(window=3).mean()
utils_io.save_csv(monthly_sales, "../assets/agg_monthly_sales.csv")

## 🧩 Concatenation: Loan Region-wise Files

In [13]:
# Simulate loading region-wise loan files
loan_north = loan[loan["region"] == "North"]
loan_south = loan[loan["region"] == "South"]

# Concatenate using safe_concat
loan_combined = agg_utils.safe_concat([loan_north, loan_south])
loan_combined.head()

✅ Concatenated 2 DataFrames — shape: (0, 8)


Unnamed: 0,customer_id,customer_name,age,income,loan_amount,loan_purpose,approved,region


## ⚠️ Merge Conflict: Overlapping Column Names

In [14]:
# Simulate conflict merge by reusing customer_id

# Ensure consistent data type for merge key
df1 = superstore[["customer_id", "region"]].drop_duplicates().copy()
df2 = loan[["customer_id", "loan_amount"]].copy()

df1["customer_id"] = df1["customer_id"].astype(str)
df2["customer_id"] = df2["customer_id"].astype(str)

merged_conflict = agg_utils.safe_merge(
    df1=df1,
    df2=df2,
    on="customer_id",
    suffixes=("_sales", "_loan"),
    how="inner"
)
merged_conflict.head()

✅ Merged on ['customer_id'] using 'inner' join — shape: (0, 3)


Unnamed: 0,customer_id,region,loan_amount


In [15]:
# Exporting outputs
utils_io.save_csv(sales_covid, "../assets/merged_sales_covid.csv")
utils_io.save_csv(loan_combined, "../assets/loan_all_combined.csv")

## ✅ Summary

In this notebook, we:
- Used `safe_merge()` for robust joining with key validation
- Performed `.cumsum()` and `.rolling()` post-merge
- Used `safe_concat()` to combine region-wise datasets
- Handled merge conflicts using suffixes
- Exported merged outputs for further use