In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Simulate 60 days of sales data and persist it as the CSV the later cells read.
SEED = 42  # fixed so Restart & Run All regenerates exactly the same dataset
customers = ['Amazon', 'Walmart', 'Target', 'Costco', 'BestBuy', 'eBay']
start_date = datetime(2025, 4, 1)


def simulate_sales(n_days=60, seed=SEED):
    """Build synthetic sales records, 3 to 6 per simulated day.

    Args:
        n_days: Number of consecutive days to simulate.
        seed: Seed for the private random generator; the same seed always
            produces the same list of records.

    Returns:
        A list of dicts with the keys 'id', 'customer', 'date', 'amount'
        and 'last_updated' (the two date fields are ISO-8601 strings).
    """
    # Private generator: reproducible without touching the global `random` state.
    rng = random.Random(seed)
    records = []
    # Day offsets run 1..n_days, so the first simulated day is start_date + 1 day.
    for i in range(1, n_days + 1):
        date = start_date + timedelta(days=i)
        for _ in range(rng.randint(3, 6)):
            records.append({
                # Drawn independently per row, so ids are not guaranteed unique.
                'id': rng.randint(1000, 9999),
                'customer': rng.choice(customers),
                'date': date.date().isoformat(),
                'amount': rng.randint(100, 2000),
                # Same calendar day as 'date', at a random hour and minute.
                'last_updated': (date + timedelta(hours=rng.randint(0, 23), minutes=rng.randint(0, 59))).isoformat()
            })
    return records


data = simulate_sales()
df = pd.DataFrame(data)
df.to_csv('sales_data_large.csv', index=False)
# Preview only; a short head keeps the rendered output readable.
df.head(10)


Unnamed: 0,id,customer,date,amount,last_updated
0,4660,Costco,2025-04-02,1023,2025-04-02T15:50:00
1,8005,BestBuy,2025-04-02,115,2025-04-02T10:09:00
2,3090,Amazon,2025-04-02,725,2025-04-02T18:17:00
3,6273,Target,2025-04-03,898,2025-04-03T11:22:00
4,7116,Amazon,2025-04-03,624,2025-04-03T05:11:00
5,1250,Costco,2025-04-03,202,2025-04-03T07:10:00
6,5809,Amazon,2025-04-03,1329,2025-04-03T23:52:00
7,5749,Costco,2025-04-03,947,2025-04-03T20:00:00
8,9810,BestBuy,2025-04-04,1894,2025-04-04T10:57:00
9,6395,BestBuy,2025-04-04,134,2025-04-04T05:56:00


In [3]:
# FULL EXTRACTION
# Read every row of the source CSV, parsing last_updated into datetimes.
df_full = pd.read_csv(
    "sales_data_large.csv",
    parse_dates=["last_updated"],
)
full_row_count = len(df_full)
print(f"Extracted {full_row_count} rows fully.")
df_full.head()


Extracted 278 rows fully.


Unnamed: 0,id,customer,date,amount,last_updated
0,4660,Costco,2025-04-02,1023,2025-04-02 15:50:00
1,8005,BestBuy,2025-04-02,115,2025-04-02 10:09:00
2,3090,Amazon,2025-04-02,725,2025-04-02 18:17:00
3,6273,Target,2025-04-03,898,2025-04-03 11:22:00
4,7116,Amazon,2025-04-03,624,2025-04-03 05:11:00


In [4]:
# Seed the checkpoint file with a fixed starting timestamp for the
# incremental extraction that follows (any existing content is overwritten).
initial_checkpoint = "2025-04-20 12:00:00"
with open("last_extraction.txt", mode="w") as checkpoint_file:
    checkpoint_file.write(initial_checkpoint)


In [5]:
# INCREMENTAL EXTRACTION
# Read the timestamp of the previous extraction. A missing file is treated
# like an empty checkpoint (first run) instead of crashing the cell.
try:
    with open("last_extraction.txt", "r") as f:
        last_extraction = f.read().strip()
except FileNotFoundError:
    last_extraction = ""

df = pd.read_csv("sales_data_large.csv", parse_dates=["last_updated"])
# An empty string (or the text 'NaT') parses to NaT; any other unparseable
# text still raises, so a corrupted checkpoint is not silently ignored.
last_extraction_time = pd.to_datetime(last_extraction) if last_extraction else pd.NaT

if pd.isna(last_extraction_time):
    # Comparing against NaT is False for every row, which would silently
    # extract nothing; with no usable checkpoint, take every row instead.
    df_incremental = df.copy()
    print(f"No previous checkpoint found; extracted all {len(df_incremental)} rows.")
else:
    # Strictly newer than the checkpoint, so already-extracted rows are skipped.
    df_incremental = df[df['last_updated'] > last_extraction_time]
    print(f"Extracted {len(df_incremental)} rows incrementally since {last_extraction}.")
df_incremental.head()


Extracted 190 rows incrementally since 2025-04-20 12:00:00.


Unnamed: 0,id,customer,date,amount,last_updated
84,8330,eBay,2025-04-20,466,2025-04-20 16:36:00
88,5264,eBay,2025-04-20,1625,2025-04-20 20:04:00
90,3447,Amazon,2025-04-21,1575,2025-04-21 22:09:00
91,7170,BestBuy,2025-04-21,1904,2025-04-21 12:55:00
92,1131,Target,2025-04-21,1306,2025-04-21 07:19:00


In [6]:
# Update checkpoint
# The newest last_updated value in the source becomes the next run's cutoff.
new_checkpoint = df['last_updated'].max()
if pd.isna(new_checkpoint):
    # max() is NaT when there are no rows or no valid timestamps. Writing it
    # would store the text 'NaT', which the next run cannot use as a cutoff,
    # so the existing checkpoint is left untouched.
    print("No valid last_updated values found; last_extraction.txt left unchanged")
else:
    with open("last_extraction.txt", "w") as f:
        f.write(new_checkpoint.isoformat())
    print(f"Updated last_extraction.txt to {new_checkpoint}")


Updated last_extraction.txt to 2025-05-31 21:01:00
