In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os

# Seed NumPy's legacy global generator so every run yields the same synthetic data.
np.random.seed(42)

# Generate synthetic sales data.
# NOTE: the values depend on the ORDER of the random draws below
# (dates -> products -> quantities -> prices); keep that order stable.
n_records = 100
output_path = 'custom_data.csv'
start_date = datetime(2025, 5, 1)

# One scalar draw per record: each sale falls 0-39 days after start_date
# (randint's upper bound is exclusive).
dates = [start_date + timedelta(days=np.random.randint(0, 40)) for _ in range(n_records)]
products = np.random.choice(['Laptop', 'Phone', 'Tablet', 'Headphones', 'Charger'], n_records)
quantities = np.random.randint(1, 5, n_records)  # 1-4 units per sale
prices = np.random.uniform(10.0, 1000.0, n_records).round(2)
# Zero-padded sequential IDs: TX001, TX002, ...
transaction_ids = [f'TX{i:03d}' for i in range(1, n_records + 1)]

# Create DataFrame (one row per transaction)
data = pd.DataFrame({
    'transaction_id': transaction_ids,
    'sale_date': dates,
    'product': products,
    'quantity': quantities,
    'price': prices
})

# Save to CSV without the positional index so the file holds only the five data columns
data.to_csv(output_path, index=False)
# Report the actual path and row count rather than hard-coded values, so the
# message stays truthful if n_records or output_path is changed.
print(f"Generated and saved {output_path} with {len(data)} records.")

Generated and saved custom_data.csv with 100 records.


In [3]:
# Import necessary libraries
import pandas as pd
from datetime import datetime
import os

# Section 1: Full Extraction
print("=== Full Extraction ===")
# Read every row of the CSV, then convert the sale_date text column to datetimes
df = pd.read_csv('custom_data.csv').assign(
    sale_date=lambda frame: pd.to_datetime(frame['sale_date'])
)

# Report the frame's dimensions, followed by a preview of its first rows
row_count, column_count = df.shape
print("Number of rows:", row_count)
print("Number of columns:", column_count)
print("\nSample of the data:")
print(df.head())
print(f"Extracted {row_count} rows fully.")


=== Full Extraction ===
Number of rows: 100
Number of columns: 5

Sample of the data:
  transaction_id  sale_date     product  quantity   price
0          TX001 2025-06-08     Charger         3  187.03
1          TX002 2025-05-29      Tablet         2  372.80
2          TX003 2025-05-15  Headphones         4  746.73
3          TX004 2025-05-08      Tablet         1  723.73
4          TX005 2025-05-21      Tablet         3  314.98
Extracted 100 rows fully.


In [4]:
# Section 2: Incremental Extraction
print("\n=== Incremental Extraction ===")
# Read the watermark (timestamp of the newest row seen by the previous run), if any.
last_extraction_file = 'last_extraction.txt'
# Sentinel used when there is no usable watermark: the earliest representable
# timestamp, so the strict ">" filter below keeps every dated row. The previous
# fallback of 2025-05-01 silently dropped sales dated exactly 2025-05-01.
last_extraction_time = pd.Timestamp.min
has_prior_extraction = False
if os.path.exists(last_extraction_file):
    with open(last_extraction_file, 'r') as f:
        stored_value = f.read().strip()
    # Malformed content still raises here (unchanged behavior); only an
    # empty/NaT value is handled, because comparing against NaT is always False
    # and would make every later run extract nothing without any error.
    parsed_time = pd.to_datetime(stored_value) if stored_value else pd.NaT
    if pd.isna(parsed_time):
        print(f"Warning: {last_extraction_file} holds no usable timestamp; treating this as a first run.")
    else:
        last_extraction_time = parsed_time
        has_prior_extraction = True

if has_prior_extraction:
    print(f"Last extraction time: {last_extraction_time}")
else:
    print("Last extraction time: none recorded (extracting all dated rows)")

# Filter for records strictly after the last extraction.
# NOTE(review): sale_date appears to carry day granularity only, so rows that
# arrive later with the same date as the watermark are not picked up — confirm
# whether that is acceptable for the real data source.
new_data = df[df['sale_date'] > last_extraction_time]

# Display results
print(f"Extracted {new_data.shape[0]} rows incrementally since last check.")
if new_data.shape[0] > 0:
    print("\nSample of new or updated records:")
    print(new_data.head())
else:
    print("No new or updated records found.")


=== Incremental Extraction ===
Last extraction time: 2025-06-09 00:00:00
Extracted 0 rows incrementally since last check.
No new or updated records found.


In [5]:
# Section 3: Save New Timestamp
# The stored watermark only moves forward when this run actually extracted rows.
if not new_data.shape[0] > 0:
    print("\nNo update to last_extraction.txt needed.")
else:
    # The newest sale date in the full dataset becomes the next run's cutoff
    latest_timestamp = df['sale_date'].max()
    with open(last_extraction_file, 'w') as f:
        f.write(str(latest_timestamp))
    print(f"\nUpdated last_extraction.txt with new timestamp: {latest_timestamp}")


No update to last_extraction.txt needed.
