#  🧩 Notebook 05: Time Series Analysis in pandas: Datetime Indexing, Resampling & Trends



In [1]:
# Notebook import setup
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

from scripts import utils_io

## 🧭 1. Notebook Setup

In [2]:
import pandas as pd
import numpy as np

from scripts import utils_io, agg_utils

# Load cleaned datasets
superstore = utils_io.load_csv("../assets/superstore_final.csv")
covid = utils_io.load_csv("../assets/covid_final.csv")
weather = utils_io.load_csv("../assets/weather_final.csv")

# Convert necessary columns to datetime
superstore["order_date"] = pd.to_datetime(superstore["order_date"])
covid["date"] = pd.to_datetime(covid["date"])
weather["date"] = pd.to_datetime(weather["date"])

# Optional: Set datetime index for superstore
superstore.set_index("order_date", inplace=True)

print("✅ Data loaded and datetime columns converted.")

✅ Data loaded and datetime columns converted.


📌 2. DateTime Column Exploration
> 🔍 This section explores the structure of datetime columns and extracts useful components (like month, day, weekday, etc.).

### 🔍 2.1 Explore datetime columns

In [3]:
# Check datetime index and dtypes
print(superstore.index.dtype)
print(covid["date"].dtype)
print(weather["date"].dtype)

# Display first few dates
print("🗓 Superstore order dates:")
display(superstore.index.to_series().head())

print("🦠 COVID dates:")
display(covid["date"].head())

print("🌦 Weather dates:")
display(weather["date"].head())

datetime64[ns]
datetime64[ns]
datetime64[ns]
🗓 Superstore order dates:


order_date
2020-01-01   2020-01-01
2020-01-02   2020-01-02
2020-01-03   2020-01-03
2020-01-04   2020-01-04
2020-01-05   2020-01-05
Name: order_date, dtype: datetime64[ns]

🦠 COVID dates:


0   2020-01-01
1   2020-01-02
2   2020-01-03
3   2020-01-04
4   2020-01-05
Name: date, dtype: datetime64[ns]

🌦 Weather dates:


0   2022-01-01
1   2022-01-02
2   2022-01-03
3   2022-01-04
4   2022-01-05
Name: date, dtype: datetime64[ns]

### 🧠 2.2 Extract features from datetime

In [4]:
# Reset index temporarily for feature extraction
superstore_reset = superstore.reset_index()

# Extract datetime parts
superstore_reset["year"] = superstore_reset["order_date"].dt.year
superstore_reset["month"] = superstore_reset["order_date"].dt.month
superstore_reset["day"] = superstore_reset["order_date"].dt.day
superstore_reset["weekday"] = superstore_reset["order_date"].dt.day_name()
superstore_reset["is_weekend"] = superstore_reset["weekday"].isin(["Saturday", "Sunday"])

superstore_reset[["order_date", "year", "month", "day", "weekday", "is_weekend"]].head()

Unnamed: 0,order_date,year,month,day,weekday,is_weekend
0,2020-01-01,2020,1,1,Wednesday,False
1,2020-01-02,2020,1,2,Thursday,False
2,2020-01-03,2020,1,3,Friday,False
3,2020-01-04,2020,1,4,Saturday,True
4,2020-01-05,2020,1,5,Sunday,True


## 📊 3. Time-based Aggregations and Trends
This section summarizes key business metrics over time (daily, monthly, etc.) to identify patterns or trends.

### 📈 3.1 Daily Sales & Profit Trend


In [5]:
daily_trend = (
    superstore
    .groupby("order_date")[["sales", "profit"]]
    .sum()
    .reset_index()
)

daily_trend.head()

Unnamed: 0,order_date,sales,profit
0,2020-01-01,1292.63,22.94
1,2020-01-02,1947.16,205.52
2,2020-01-03,1774.42,23.84
3,2020-01-04,591.01,64.85
4,2020-01-05,1969.55,58.99


### 📅 3.2 Monthly Aggregation

In [6]:
superstore["order_month"] = superstore.index.to_period("M")

monthly_sales = (
    superstore
    .groupby("order_month")[["sales", "profit"]]
    .sum()
    .reset_index()
)

monthly_sales["order_month"] = monthly_sales["order_month"].astype(str)
monthly_sales.head()

Unnamed: 0,order_month,sales,profit
0,2020-01,36627.07,1490.9
1,2020-02,26120.18,1775.53
2,2020-03,33591.53,1297.55
3,2020-04,24267.89,1186.19
4,2020-05,30944.1,1753.54


### 📌 3.3 COVID Monthly Aggregation

In [7]:
covid["month"] = covid["date"].dt.to_period("M")

covid_monthly = (
    covid
    .groupby("month")[["new_cases", "new_deaths"]]
    .sum()
    .reset_index()
)

covid_monthly["month"] = covid_monthly["month"].astype(str)
covid_monthly.head()

Unnamed: 0,month,new_cases,new_deaths
0,2020-01,15558,313
1,2020-02,14543,277
2,2020-03,15392,339
3,2020-04,15148,290
4,2020-05,15542,328


## 🔗 4. Time Series Merging & Rolling Metrics
This section combines datasets by date to understand how external events (e.g., COVID) influence business metrics.

### 🔁 4.1 Merge Monthly Sales with COVID Data

In [8]:
# Prepare date columns
superstore_monthly = monthly_sales.rename(columns={"order_month": "order_date"})
superstore_monthly["order_date"] = pd.to_datetime(superstore_monthly["order_date"])
covid_monthly["order_date"] = pd.to_datetime(covid_monthly["month"])

# Merge on month
sales_covid = agg_utils.safe_merge(
    df1=superstore_monthly,
    df2=covid_monthly,
    on="order_date",
    how="inner",
    parse_dates=True
)
sales_covid.head()

✅ Merged on ['order_date'] using 'inner' join — shape: (329, 6)


Unnamed: 0,order_date,sales,profit,month,new_cases,new_deaths
0,2020-01-01,36627.07,1490.9,2020-01,15558,313
1,2020-02-01,26120.18,1775.53,2020-02,14543,277
2,2020-03-01,33591.53,1297.55,2020-03,15392,339
3,2020-04-01,24267.89,1186.19,2020-04,15148,290
4,2020-05-01,30944.1,1753.54,2020-05,15542,328


### 📉 4.2 Rolling Averages & Cumulative Sales

In [9]:
# Rolling Profit and Cumulative Sales Trend
sales_covid["cum_sales"] = sales_covid["sales"].cumsum()
sales_covid["rolling_profit"] = sales_covid["profit"].rolling(window=3).mean()

sales_covid[["order_date", "sales", "cum_sales", "profit", "rolling_profit"]].tail()

Unnamed: 0,order_date,sales,cum_sales,profit,rolling_profit
324,2047-01-01,31448.7,9947147.79,1422.22,1690.163333
325,2047-02-01,27719.81,9974867.6,1762.42,1639.266667
326,2047-03-01,35454.26,10010321.86,1495.51,1560.05
327,2047-04-01,29930.82,10040252.68,1679.17,1645.7
328,2047-05-01,17422.03,10057674.71,912.72,1362.466667


## 🧩 5. Multi-key Merging Scenarios

This section helps simulate situations where datasets need to be merged on multiple keys, such as combining regional and temporal data.

### 🔍 5.1 Simulate Region-wise Monthly Superstore Summary

In [10]:
# Monthly Sales Summary by Region
superstore = utils_io.load_csv("../assets/superstore_final.csv")
superstore["order_date"] = pd.to_datetime(superstore["order_date"])
superstore["month"] = superstore["order_date"].dt.to_period("M").astype(str)

region_monthly_sales = (
    superstore
    .groupby(["region", "month"], as_index=False)
    .agg({"sales": "sum", "profit": "sum"})
    .rename(columns={"month": "order_month"})
)

region_monthly_sales.head()


Unnamed: 0,region,order_month,sales,profit
0,central,2020-01,8366.32,431.08
1,central,2020-02,6631.29,544.76
2,central,2020-03,6689.8,61.89
3,central,2020-04,269.01,4.47
4,central,2020-05,6252.0,271.01


### 🏥 5.2 Simulate Region-wise COVID Hospitalizations

In [11]:
# Add fake region assignments to covid data for demo
region_map = {
    "USA": "West",
    "India": "East",
    "Brazil": "South",
    "Germany": "North",
    "Canada": "Central"
}

covid["region"] = covid["country"].map(region_map)
covid["order_month"] = covid["date"].dt.to_period("M").astype(str)

region_covid = (
    covid
    .groupby(["region", "order_month"], as_index=False)
    .agg({"hospitalized": "mean"})
)

region_covid.head()

Unnamed: 0,region,order_month,hospitalized


### 🔗 5.3 Merge on Multiple Keys (region + order_month)

In [12]:
# Merge Monthly Sales and Hospitalizations on Region & Month
region_merged = agg_utils.safe_merge(
    df1=region_monthly_sales,
    df2=region_covid,
    on=["region", "order_month"],
    how="inner"
)

region_merged.sort_values(by=["region", "order_month"]).head()

✅ Merged on ['region', 'order_month'] using 'inner' join — shape: (0, 5)


Unnamed: 0,region,order_month,sales,profit,hospitalized


## 💡 6. Bonus Scenarios & Edge Case Handling
In this section, we’ll explore tricky merging situations and how to handle them safely.

### ❗ 6.1 Merge with Non-Overlapping Keys
This simulates when join keys do not match between two DataFrames, resulting in an empty merge or unexpected behavior.

In [13]:
# Simulate Failed Merge Due to Mismatched Keys
# Create dummy key conflict
loan = utils_io.load_csv("../assets/loan_final_all_regions.csv")
df_sales = superstore[["customer_id", "region"]].drop_duplicates().copy()
df_loan = loan[["customer_id", "loan_amount"]].copy()

# Force data type alignment for merge
df_loan["customer_id"] = df_loan["customer_id"].astype(str)

# Perform merge
merged_conflict = agg_utils.safe_merge(
    df1=df_sales,
    df2=df_loan,
    on="customer_id",
    suffixes=("_sales", "_loan"),
    how="inner"
)

merged_conflict.head()


✅ Merged on ['customer_id'] using 'inner' join — shape: (0, 3)


Unnamed: 0,customer_id,region,loan_amount


### 🔀 6.2 Column Name Conflicts & Suffix Resolution
Sometimes, both DataFrames have columns with the same name (but different values), requiring suffixes to distinguish them.

In [14]:
# Handle Conflicting Column Names Using Suffixes

# Extract and rename for conflict simulation
df1 = superstore[["customer_id", "sales"]].copy()
df2 = loan[["customer_id", "loan_amount"]].copy()
df2.rename(columns={"loan_amount": "sales"}, inplace=True)  # Intentional conflict

# 🔧 Convert merge key to same type
df1["customer_id"] = df1["customer_id"].astype(str)
df2["customer_id"] = df2["customer_id"].astype(str)

# ✅ Merge with suffixes to handle name conflict
merged_suffix = agg_utils.safe_merge(
    df1=df1,
    df2=df2,
    on="customer_id",
    suffixes=("_store", "_loan"),
    how="inner"
)

# Preview result
merged_suffix.head()

✅ Merged on ['customer_id'] using 'inner' join — shape: (0, 3)


Unnamed: 0,customer_id,sales_store,sales_loan


### 🧪 6.3 Defensive Merge with Type Coercion and Logging
Test the built-in type-checking & logging from safe_merge() by merging mismatched types and enabling verbose=True.

In [15]:
# Safe Merge with Automatic Type Alignment
df_loan["customer_id"] = df_loan["customer_id"].astype(str)

# ✅ Summary: Time Series Analysis in pandas

In this notebook, we explored a range of techniques for working with **temporal data** in pandas, including:

---

### 📆 1. Datetime Handling & Indexing
- Converted string-based date columns using `pd.to_datetime()`
- Set the datetime column as index for efficient time-based operations

---

### 🔎 2. Date Feature Extraction
- Extracted components like `.dt.year`, `.dt.month`, `.dt.day`, `.dt.weekday()`
- Created custom flags like `is_weekend` for temporal feature engineering

---

### 📊 3. Time-Based Aggregation
- Used `.groupby()` with `.dt.to_period("M")` for monthly summaries
- Compared with `.resample()` for continuous time intervals (e.g., monthly/weekly)

---

### 🔁 4. Rolling & Cumulative Metrics
- Computed rolling mean of profit using `.rolling(window=3).mean()`
- Generated cumulative sales with `.cumsum()` to track growth over time

---

### 🔗 5. Merging Time Series with External Data
- Merged **sales data** with **COVID trends** on monthly basis
- Combined **regional sales** with **regional hospitalizations**
- Demonstrated **multi-key merging** (`["region", "order_month"]`)

---

### 🚨 6. Defensive Programming for Merging
- Handled merge issues like:
  - Mismatched datatypes (`str` vs `int`)
  - Non-overlapping keys
  - Column name conflicts (resolved via suffixes)
- Used `agg_utils.safe_merge()` for robust, type-safe joins with logging

---

### 📈 Key Takeaway:
> Time series analysis in pandas isn't just about dates — it's about treating **time as a first-class citizen** for grouping, trend analysis, merging, and smoothing.

---