In [1]:
import pandas as pd
import random
from datetime import datetime, timedelta

# Simulate 60 days of user activity logs.
# A fixed seed feeds a private RNG so that Restart & Run All regenerates the
# exact same CSV instead of a different random data set on every run.
SEED = 42
users = [f"user{str(i).zfill(3)}" for i in range(1, 11)]
actions = ["login", "logout", "view_page", "purchase"]
devices = ["mobile", "desktop", "tablet"]
start_date = datetime(2025, 4, 1)


def simulate_activity_logs(seed=SEED, n_days=60):
    """Build a list of synthetic user-activity event records.

    Each day, starting at ``start_date``, receives between 5 and 10 events.
    Every event is a dict with the keys ``user_id``, ``action``, ``device``
    and ``timestamp`` (an ISO-8601 string with a random hour and minute
    within that day).

    Args:
        seed: Seed for the private ``random.Random`` instance. The same seed
            always yields the same list of records.
        n_days: Number of consecutive days to simulate.

    Returns:
        list[dict]: The generated events, grouped by day in ascending order
        (events within a single day are not sorted by time).
    """
    rng = random.Random(seed)  # private RNG: does not disturb global random state
    records = []
    for offset in range(n_days):
        day = start_date + timedelta(days=offset)
        for _ in range(rng.randint(5, 10)):  # 5-10 events per day
            moment = day + timedelta(
                hours=rng.randint(0, 23),
                minutes=rng.randint(0, 59),
            )
            records.append({
                "user_id": rng.choice(users),
                "action": rng.choice(actions),
                "device": rng.choice(devices),
                "timestamp": moment.isoformat(),
            })
    return records


data = simulate_activity_logs()

# Save to CSV
df = pd.DataFrame(data)
df.to_csv("user_activity_logs.csv", index=False)
df.head()


Unnamed: 0,user_id,action,device,timestamp
0,user006,logout,desktop,2025-04-01T22:28:00
1,user008,purchase,desktop,2025-04-01T12:14:00
2,user009,purchase,mobile,2025-04-01T03:06:00
3,user004,login,desktop,2025-04-01T03:01:00
4,user010,logout,desktop,2025-04-01T07:46:00


In [2]:
# FULL EXTRACTION: load every row of the activity log in a single pass,
# parsing the timestamp column into datetimes while reading.
df_full = pd.read_csv(
    "user_activity_logs.csv",
    parse_dates=["timestamp"],
)

# Banner plus row count, emitted as two lines from one print call.
print(
    "üîÅ FULL EXTRACTION",
    f"Pulled {len(df_full)} rows from user activity logs.",
    sep="\n",
)
df_full.head()


üîÅ FULL EXTRACTION
Pulled 441 rows from user activity logs.


Unnamed: 0,user_id,action,device,timestamp
0,user006,logout,desktop,2025-04-01 22:28:00
1,user008,purchase,desktop,2025-04-01 12:14:00
2,user009,purchase,mobile,2025-04-01 03:06:00
3,user004,login,desktop,2025-04-01 03:01:00
4,user010,logout,desktop,2025-04-01 07:46:00


In [3]:
# Write a starting checkpoint (an older timestamp) to last_extraction.txt
initial_checkpoint = "2025-04-20T12:00:00"
with open("last_extraction.txt", "w") as checkpoint_file:
    checkpoint_file.write(initial_checkpoint)


In [4]:
# INCREMENTAL EXTRACTION: keep only the rows that are strictly newer than
# the checkpoint stored in last_extraction.txt.
with open("last_extraction.txt", "r") as checkpoint_file:
    last_extraction = checkpoint_file.read().strip()
last_extraction_time = pd.to_datetime(last_extraction)

df = pd.read_csv("user_activity_logs.csv", parse_dates=["timestamp"])

# Boolean mask: True for events that happened after the checkpoint.
is_new = df["timestamp"] > last_extraction_time
df_incremental = df[is_new]

print(
    "üîÑ INCREMENTAL EXTRACTION",
    f"Extracted {len(df_incremental)} rows since {last_extraction}.",
    sep="\n",
)
df_incremental.head()


üîÑ INCREMENTAL EXTRACTION
Extracted 291 rows since 2025-04-20T12:00:00.


Unnamed: 0,user_id,action,device,timestamp
150,user007,purchase,mobile,2025-04-21 17:46:00
151,user005,view_page,tablet,2025-04-21 07:31:00
152,user002,logout,mobile,2025-04-21 20:24:00
153,user007,logout,tablet,2025-04-21 02:40:00
154,user010,login,mobile,2025-04-21 02:22:00


In [7]:
# Get most recent timestamp.
# Expects `df` to have a datetime `timestamp` column (as loaded with
# parse_dates in the incremental-extraction cell), because .isoformat() is
# called on the result below.
new_checkpoint = df['timestamp'].max()

if pd.isna(new_checkpoint):
    # With no usable timestamps, .max() is NaT (or NaN). Writing that out
    # would overwrite a valid checkpoint with a value that makes every later
    # `timestamp > checkpoint` comparison False, so leave the file untouched.
    print("No timestamps found; last_extraction.txt left unchanged.")
else:
    # Save it to last_extraction.txt
    with open("last_extraction.txt", "w") as f:
        f.write(new_checkpoint.isoformat())

    print(f"‚úÖ Updated last_extraction.txt to {new_checkpoint}")


‚úÖ Updated last_extraction.txt to 2025-05-30 23:31:00
