# LAB 3: Practicing Extraction in ETL

#### Import libraries

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

### `STEP 1: Generate synthetic data`

In [8]:
# Sample customer list
customers = ['Amazon', 'Walmart', 'Target', 'Costco', 'BestBuy', 'eBay']

# Seed the generator so that Restart Kernel -> Run All rebuilds exactly the
# same synthetic dataset (same row count, same values) on every run.
random.seed(42)

data = []
start_date = datetime(2025, 4, 1)

# Simulate 60 days of data (April 2 to May 31): day offsets 1..60 from April 1
for i in range(1, 61):
    date = start_date + timedelta(days=i)
    for _ in range(random.randint(3, 6)):  # 3 to 6 sales per day
        data.append({
            # NOTE: ids are drawn independently, so repeats are possible;
            # nothing here enforces uniqueness.
            'id': random.randint(10000, 99999),
            'customer': random.choice(customers),
            'date': date.date().isoformat(),
            'amount': random.randint(100, 2000),
            # Same calendar day as 'date', with a random time of day (HH:MM)
            'last_updated': (date + timedelta(hours=random.randint(0, 23),
                                              minutes=random.randint(0, 59))).isoformat()
        })

####  Create DataFrame

In [9]:
# Turn the list of generated sale records into a table (one row per sale)
df = pd.DataFrame(data=data)

# Persist the table as the "source system" file used by the extraction steps;
# index=False keeps the positional row index out of the CSV.
df.to_csv(
    'sales_data_large.csv',
    index=False,
)

# Preview the first ten rows
df.head(n=10)

Unnamed: 0,id,customer,date,amount,last_updated
0,55249,eBay,2025-04-02,1574,2025-04-02T21:56:00
1,44835,Amazon,2025-04-02,1612,2025-04-02T11:07:00
2,38184,Amazon,2025-04-02,1662,2025-04-02T15:37:00
3,99984,Target,2025-04-02,1452,2025-04-02T07:35:00
4,28662,BestBuy,2025-04-02,1456,2025-04-02T10:53:00
5,62267,BestBuy,2025-04-03,910,2025-04-03T13:51:00
6,76256,Walmart,2025-04-03,392,2025-04-03T14:58:00
7,20868,Target,2025-04-03,1767,2025-04-03T13:44:00
8,38760,Walmart,2025-04-03,461,2025-04-03T17:01:00
9,77604,Target,2025-04-03,1436,2025-04-03T03:23:00


In [12]:
# Distinct calendar dates that have at least one sales record
unique_days = df['date'].nunique(dropna=True)

print(f"Number of unique days with sales records: {unique_days}")


Number of unique days with sales records: 60


 ###  `Section 1: Full Extraction`

In [15]:
# FULL EXTRACTION
# Read every row of the source file, regardless of when it last changed.
# `last_updated` is parsed into datetimes while loading.
df_full = pd.read_csv(
    "sales_data_large.csv",
    parse_dates=["last_updated"],
)
print(f"Pulled {len(df_full)} rows via full extraction.")
df_full.head(n=10)

Pulled 277 rows via full extraction.


Unnamed: 0,id,customer,date,amount,last_updated
0,55249,eBay,2025-04-02,1574,2025-04-02 21:56:00
1,44835,Amazon,2025-04-02,1612,2025-04-02 11:07:00
2,38184,Amazon,2025-04-02,1662,2025-04-02 15:37:00
3,99984,Target,2025-04-02,1452,2025-04-02 07:35:00
4,28662,BestBuy,2025-04-02,1456,2025-04-02 10:53:00
5,62267,BestBuy,2025-04-03,910,2025-04-03 13:51:00
6,76256,Walmart,2025-04-03,392,2025-04-03 14:58:00
7,20868,Target,2025-04-03,1767,2025-04-03 13:44:00
8,38760,Walmart,2025-04-03,461,2025-04-03 17:01:00
9,77604,Target,2025-04-03,1436,2025-04-03 03:23:00


### `Section 2: Incremental Extraction`

#### `1. Create the tracking file: last_extraction.txt`



In [17]:
# Initialise the checkpoint file with the timestamp of the "previous" extraction.
# Mode "w" creates the file or overwrites any existing content.
with open("last_extraction.txt", mode="w") as checkpoint_file:
    checkpoint_file.write("2025-04-20 12:00:00")

#### `2. Perform Incremental Extraction`

In [18]:
# INCREMENTAL EXTRACTION
with open("last_extraction.txt", "r") as f:
    last_extraction = f.read().strip()

In [20]:
# Convert the checkpoint string into a pandas timestamp so it can be compared
# against the datetime column below.
last_extraction_time = pd.to_datetime(last_extraction)

# Re-read the source file, parsing `last_updated` into datetimes.
df = pd.read_csv(
    "sales_data_large.csv",
    parse_dates=["last_updated"],
)

In [22]:
# Keep only rows modified strictly after the checkpoint; a row stamped exactly
# at the checkpoint time is not selected.
df_incremental = df.loc[df['last_updated'].gt(last_extraction_time)]
print(f"Pulled {len(df_incremental)} new/updated rows since {last_extraction}.")
df_incremental.head()

Pulled 189 new/updated rows since 2025-04-20 12:00:00.


Unnamed: 0,id,customer,date,amount,last_updated
86,61193,Amazon,2025-04-20,143,2025-04-20 19:09:00
89,97734,Walmart,2025-04-20,876,2025-04-20 23:30:00
90,27954,Amazon,2025-04-21,874,2025-04-21 14:01:00
91,88241,Walmart,2025-04-21,1107,2025-04-21 09:23:00
92,89473,Target,2025-04-21,388,2025-04-21 05:33:00


### `Update the last_extraction.txt`

#### After a successful extraction, update the checkpoint:

In [23]:
# Get the most recent update
new_checkpoint = df['last_updated'].max()
# Save it
with open("last_extraction.txt", "w") as f:
    f.write(new_checkpoint.isoformat())
print(f"Updated last_extraction.txt to {new_checkpoint}")

Updated last_extraction.txt to 2025-05-31 19:16:00
