# Generating Dataset

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# --- Configuration: every value a reader might want to tune lives here ---
NUM_DAYS = 60  # number of consecutive calendar days to generate sales for
SALES_PER_DAY_RANGE = (3, 8)  # (min, max) sales per day; both ends are passed to random.randint
OUTPUT_FILENAME = 'custom_data.csv'  # CSV file the generated dataset is written to
RANDOM_SEED = 42  # fixed seed so a fresh "Restart & Run All" regenerates the same dataset

CUSTOMERS = ['Amazon', 'Walmart', 'Target', 'Costco', 'BestBuy', 'eBay', 'Microsoft', 'Google', 'Apple', 'Meta']
PRODUCT_CATEGORIES = ['Electronics', 'Home Goods', 'Apparel', 'Books', 'Groceries', 'Software', 'Tools', 'Sports']
PAYMENT_METHODS = ['Credit Card', 'Debit Card', 'PayPal', 'Bank Transfer']
START_DATE = datetime(2025, 4, 1)  # midnight of the first generated day

# Seed the stdlib RNG before any random.* call is made; without this the
# synthetic data (and every output derived from it) changes on each run.
random.seed(RANDOM_SEED)

print(f"Generating synthetic sales data for {NUM_DAYS} days starting from {START_DATE.date()}...")

# Collect one plain dict per sale and build the DataFrame once at the end,
# rather than growing a frame row by row.
all_records = []
record_id_counter = 1

min_sales, max_sales = SALES_PER_DAY_RANGE
for day_offset in range(NUM_DAYS):
    current_date = START_DATE + timedelta(days=day_offset)
    # Last whole second that still belongs to this day; used to cap updates.
    end_of_day = current_date.replace(hour=23, minute=59, second=59)
    num_sales_today = random.randint(min_sales, max_sales)

    for _ in range(num_sales_today):
        # Random moment within the current day.
        time_of_day = timedelta(
            hours=random.randint(0, 23),
            minutes=random.randint(0, 59),
            seconds=random.randint(0, 59),
        )
        transaction_time = current_date + time_of_day

        # The record is "updated" up to 2h59m after the sale, but never
        # later than the end of the transaction's own day.
        update_delay = timedelta(
            hours=random.randint(0, 2),
            minutes=random.randint(0, 59),
        )
        last_updated_time = min(transaction_time + update_delay, end_of_day)

        record = {
            'record_id': record_id_counter,  # unique, sequential ID
            'transaction_date': current_date.date().isoformat(),  # day of the sale
            'transaction_timestamp': transaction_time.isoformat(),  # exact time of the sale
            'customer_name': random.choice(CUSTOMERS),
            'product_category': random.choice(PRODUCT_CATEGORIES),
            'amount': round(random.uniform(10.00, 2000.00), 2),  # rounded to two decimals
            'quantity': random.randint(1, 10),
            'payment_method': random.choice(PAYMENT_METHODS),
            'last_updated_timestamp': last_updated_time.isoformat(),  # drives incremental extraction
            'status': random.choice(['Completed', 'Pending', 'Cancelled']),
        }
        all_records.append(record)
        record_id_counter += 1

df = pd.DataFrame(all_records)

# Summarise the generated frame, then persist it as CSV.
print(f"\nGenerated {len(df)} records.")
print("\nFirst 5 rows of the generated data:")
print(df.head())
print("\nData types:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line after the report.
df.info()

# Best-effort save: a failure is reported instead of aborting the notebook.
try:
    df.to_csv(OUTPUT_FILENAME, index=False)
    print(f"\nData successfully saved to '{OUTPUT_FILENAME}'")
except Exception as e:
    print(f"Error saving data to CSV: {e}")



Generating synthetic sales data for 60 days starting from 2025-04-01...

Generated 338 records.

First 5 rows of the generated data:
   record_id transaction_date transaction_timestamp customer_name  \
0          1       2025-04-01   2025-04-01T17:49:57        Costco   
1          2       2025-04-01   2025-04-01T10:36:18         Apple   
2          3       2025-04-01   2025-04-01T17:16:47        Target   
3          4       2025-04-01   2025-04-01T18:15:48        Costco   
4          5       2025-04-01   2025-04-01T02:37:35        Amazon   

  product_category   amount  quantity payment_method last_updated_timestamp  \
0      Electronics   929.94         1         PayPal    2025-04-01T18:49:57   
1        Groceries   130.17         4         PayPal    2025-04-01T12:59:18   
2       Home Goods  1333.76         8    Credit Card    2025-04-01T18:19:47   
3            Tools  1115.54         9         PayPal    2025-04-01T20:41:48   
4         Software   633.14         6    Credit Card    2

# Full extraction
- Reads every row of the file on each run, so it might be slow for larger datasets


In [2]:
# Full extraction: load the entire CSV into memory and report its basic shape.
DATA_FILE = 'custom_data.csv'
print(f"--- Starting Full Extraction from {DATA_FILE} ---")
try:
    df_full_extraction = pd.read_csv(DATA_FILE)

    print("\nBasic Dataset Information:")
    print(f"Number of rows: {df_full_extraction.shape[0]}")
    print(f"Number of columns: {df_full_extraction.shape[1]}")
    print("\nFirst 5 rows:")
    print(df_full_extraction.head())
    print("\nData types:")
    # DataFrame.info() prints its own report and returns None; calling it
    # directly avoids the extra "None" line that print(...info()) produces.
    df_full_extraction.info()

    print(f"\nExtracted {df_full_extraction.shape[0]} rows fully.")

except FileNotFoundError:
    print(f"Error: The file '{DATA_FILE}' was not found. Please ensure it's in the same directory.")
    df_full_extraction = pd.DataFrame()  # keep the name defined so later cells do not raise NameError
except Exception as e:
    print(f"An error occurred during full extraction: {e}")
    df_full_extraction = pd.DataFrame()  # keep the name defined so later cells do not raise NameError



--- Starting Full Extraction from custom_data.csv ---

Basic Dataset Information:
Number of rows: 338
Number of columns: 10

First 5 rows:
   record_id transaction_date transaction_timestamp customer_name  \
0          1       2025-04-01   2025-04-01T17:49:57        Costco   
1          2       2025-04-01   2025-04-01T10:36:18         Apple   
2          3       2025-04-01   2025-04-01T17:16:47        Target   
3          4       2025-04-01   2025-04-01T18:15:48        Costco   
4          5       2025-04-01   2025-04-01T02:37:35        Amazon   

  product_category   amount  quantity payment_method last_updated_timestamp  \
0      Electronics   929.94         1         PayPal    2025-04-01T18:49:57   
1        Groceries   130.17         4         PayPal    2025-04-01T12:59:18   
2       Home Goods  1333.76         8    Credit Card    2025-04-01T18:19:47   
3            Tools  1115.54         9         PayPal    2025-04-01T20:41:48   
4         Software   633.14         6    Credit Car

# Incremental extraction
- Definition:
    - Pull only new or updated records since the last pull.

    - We use a `last_extraction.txt` file to remember the timestamp of the last extraction.
- Pros:
    - Efficient for large datasets
    - Reduces unnecessary processing
- Cons:
    - Requires a tracking system (here, the checkpoint file)
    - Assumes the `last_updated_timestamp` column is reliable

In [5]:
import os

# Checkpoint file holding the ISO-format timestamp of the previous extraction.
LAST_EXTRACTION_FILE = 'last_extraction.txt'

print("\n--- Starting Incremental Extraction ---")
def get_last_extraction_timestamp(file_path):
    """Read the previous extraction timestamp from a checkpoint file.

    Args:
        file_path: Path of the text file expected to contain a single
            ISO-format timestamp.

    Returns:
        The parsed ``datetime``. ``datetime.min`` is returned (after
        printing an explanatory message) when the file does not exist,
        is empty or whitespace-only, or its content raises ``ValueError``
        while being read or parsed.
    """
    # Guard clause: no checkpoint yet means this is the first run.
    if not os.path.exists(file_path):
        print(f"'{file_path}' not found. Assuming first incremental extraction.")
        return datetime.min

    with open(file_path, 'r') as checkpoint:
        try:
            saved_value = checkpoint.read().strip()
            if not saved_value:
                print(f"Warning: '{file_path}' is empty. Defaulting to a very old timestamp.")
                return datetime.min
            return datetime.fromisoformat(saved_value)
        except ValueError:
            # Unparseable content is treated like a missing checkpoint.
            print(f"Warning: Could not parse timestamp in '{file_path}'. Defaulting to a very old timestamp.")
            return datetime.min

# Load the checkpoint, then keep only rows modified after it.
last_extraction_time = get_last_extraction_timestamp(LAST_EXTRACTION_FILE)
print(f"Last extraction timestamp: {last_extraction_time}")

try:
    df_current_data = pd.read_csv(DATA_FILE)
    # Unparseable timestamps become NaT (errors='coerce') and are dropped
    # below, so they can never satisfy the "newer than checkpoint" filter.
    df_current_data['last_updated_timestamp'] = pd.to_datetime(df_current_data['last_updated_timestamp'], errors='coerce')
    df_current_data_valid_timestamps = df_current_data.dropna(subset=['last_updated_timestamp'])
    # Strictly greater-than: rows stamped exactly at the checkpoint are not re-extracted.
    df_incremental_extraction = df_current_data_valid_timestamps[
        df_current_data_valid_timestamps['last_updated_timestamp'] > last_extraction_time
    ].copy()
    print(f"\nExtracted {df_incremental_extraction.shape[0]} rows incrementally since last check ({last_extraction_time}).")

    if not df_incremental_extraction.empty:
        print("\nIncremental data extracted (first 5 rows):")
        print(df_incremental_extraction.head())
    else:
        print("No new or updated records found.")

except FileNotFoundError:
    print(f"Error: The file '{DATA_FILE}' was not found. Please ensure it's in the same directory.")
    # Keep the result name defined in this branch too, so later cells see an
    # empty frame instead of raising NameError.
    df_incremental_extraction = pd.DataFrame()
except Exception as e:
    print(f"An error occurred during incremental extraction: {e}")
    df_incremental_extraction = pd.DataFrame()



--- Starting Incremental Extraction ---
Last extraction timestamp: 2025-05-31 18:16:00

Extracted 0 rows incrementally since last check (2025-05-31 18:16:00).
No new or updated records found.


# After a successful extraction, update the checkpoint:

In [6]:
# Persist a new checkpoint for the next incremental run.
LAST_EXTRACTION_FILE = 'last_extraction.txt'
print("\n--- Saving New Timestamp ---")

# NOTE(review): the checkpoint is the wall-clock time at which this cell runs,
# not the time the data was read and not the newest last_updated_timestamp seen.
current_extraction_time = datetime.now()
checkpoint_value = current_extraction_time.isoformat()

try:
    with open(LAST_EXTRACTION_FILE, 'w') as checkpoint_file:
        checkpoint_file.write(checkpoint_value)
    print(f"New extraction timestamp saved: {checkpoint_value}")
except Exception as e:
    # Best-effort: report the failure rather than aborting the notebook.
    print(f"Error saving new timestamp to '{LAST_EXTRACTION_FILE}': {e}")





--- Saving New Timestamp ---
New extraction timestamp saved: 2025-06-09T20:56:34.567175
