In [1]:
# SECTION 1: FULL EXTRACTION
import pandas as pd

# Read the whole CSV in one pass, parsing `last_updated` as datetimes.
df = pd.read_csv("custom_data.csv", parse_dates=["last_updated"])

# Preview the first ten records with the notebook's rich renderer.
display(df.head(10))

# Report the frame's dimensions; for a full extraction the extracted
# row count is simply the total number of rows.
n_rows, n_cols = df.shape
print(f"Rows: {n_rows}, Columns: {n_cols}")
print(f"Extracted {n_rows} rows fully.")


Unnamed: 0,id,customer,date,amount,last_updated
0,4366,Costco,2025-04-02,1430,2025-04-02 00:18:00
1,9751,Costco,2025-04-02,1097,2025-04-02 07:30:00
2,7433,BestBuy,2025-04-02,1024,2025-04-02 23:25:00
3,6765,eBay,2025-04-02,1415,2025-04-02 22:51:00
4,1321,Amazon,2025-04-02,959,2025-04-02 23:20:00
5,5634,Costco,2025-04-02,1604,2025-04-02 08:55:00
6,3280,eBay,2025-04-03,756,2025-04-03 19:28:00
7,4136,Target,2025-04-03,1835,2025-04-03 15:23:00
8,6208,eBay,2025-04-03,447,2025-04-03 00:34:00
9,1571,Walmart,2025-04-03,1367,2025-04-03 18:28:00


Rows: 274, Columns: 5
Extracted 274 rows fully.


In [2]:
# SECTION 2: INCREMENTAL EXTRACTION
from datetime import datetime

# Watermark file holding the ISO-8601 timestamp of the previous extraction,
# and the timestamp to assume when no usable watermark is available.
WATERMARK_PATH = 'last_extraction.txt'
DEFAULT_LAST_EXTRACTED = datetime(2025, 4, 1)

# Step 1: Load last extraction timestamp (simulate)
try:
    with open(WATERMARK_PATH, 'r') as f:
        last_extracted = datetime.fromisoformat(f.read().strip())
except FileNotFoundError:
    # No previous run recorded: start from the default timestamp.
    last_extracted = DEFAULT_LAST_EXTRACTED
except ValueError as err:
    # fromisoformat raises ValueError for an empty or malformed file. Fall
    # back to the default instead of aborting the cell, but say so, because
    # the fallback can re-extract rows that were already processed.
    print(f"Could not parse {WATERMARK_PATH} ({err}); "
          f"falling back to {DEFAULT_LAST_EXTRACTED.isoformat()}.")
    last_extracted = DEFAULT_LAST_EXTRACTED

# Step 2: Load dataset and keep only rows updated strictly after the
# watermark (rows whose last_updated equals the watermark are excluded).
df = pd.read_csv('custom_data.csv', parse_dates=['last_updated'])
df_incremental = df[df['last_updated'] > last_extracted]

# Step 3: Display and summarize
display(df_incremental.head(10))
print(f"Extracted {len(df_incremental)} rows incrementally since last check.")


Unnamed: 0,id,customer,date,amount,last_updated


Extracted 0 rows incrementally since last check.
