In [1]:

import pandas as pd
from datetime import datetime, timedelta
import random

# Simulate 80 days of event data
events = ['Tech Conference', 'Music Fest', 'Art Expo', 'Startup Pitch', 'Food Fair', 'Gaming Convention']
organizers = ['InnovateAfrica', 'KenyaLive', 'ArtWorld', 'StartupHub', 'Taste360', 'GameOn']

data = []
start_date = datetime(2025, 6,7)

for i in range(1, 81):  # 80 days
    date = start_date + timedelta(days=i)
    for _ in range(random.randint(3, 6)):  # 3–6 events per day
        event = random.choice(events)
        organizer = random.choice(organizers)
        attendees = random.randint(10, 300)
        ticket_price = round(random.uniform(5.0, 100.0), 2)
        total_revenue = round(attendees * ticket_price, 2)
        last_updated = (date + timedelta(
            hours=random.randint(0, 23),
            minutes=random.randint(0, 59)
        )).isoformat()

        data.append({
            'event_id': random.randint(1000, 9999),
            'event_name': event,
            'organizer': organizer,
            'attendees': attendees,
            'ticket_price': ticket_price,
            'total_revenue': total_revenue,
            'event_date': date.date().isoformat(),
            'last_updated': last_updated
        })

# Create DataFrame
df = pd.DataFrame(data)

# Save to CSV
df.to_csv('custom_data.csv', index=False)

print(" Event dataset generated and saved as 'custom_data.csv'")
df


 Event dataset generated and saved as 'custom_data.csv'


Unnamed: 0,event_id,event_name,organizer,attendees,ticket_price,total_revenue,event_date,last_updated
0,5438,Food Fair,KenyaLive,237,7.76,1839.12,2025-06-08,2025-06-08T14:03:00
1,6317,Tech Conference,InnovateAfrica,292,28.16,8222.72,2025-06-08,2025-06-08T22:50:00
2,5971,Tech Conference,StartupHub,230,24.10,5543.00,2025-06-08,2025-06-08T01:02:00
3,6589,Art Expo,InnovateAfrica,66,89.61,5914.26,2025-06-08,2025-06-08T00:15:00
4,3225,Food Fair,StartupHub,217,47.65,10340.05,2025-06-08,2025-06-08T13:01:00
...,...,...,...,...,...,...,...,...
358,2449,Art Expo,KenyaLive,245,50.62,12401.90,2025-08-26,2025-08-26T02:47:00
359,6658,Gaming Convention,ArtWorld,241,6.91,1665.31,2025-08-26,2025-08-26T07:28:00
360,7479,Gaming Convention,KenyaLive,273,54.95,15001.35,2025-08-26,2025-08-26T14:17:00
361,6210,Art Expo,InnovateAfrica,156,76.98,12008.88,2025-08-26,2025-08-26T18:19:00


In [2]:
# FULL EXTRACTION
import pandas as pd

# Load the full dataset
df_full = pd.read_csv("custom_data.csv", parse_dates=["last_updated"])

# Display extraction summary
print(f" Extracted {len(df_full)} event rows fully.")

# Show a sample
df_full.head()


 Extracted 363 event rows fully.


Unnamed: 0,event_id,event_name,organizer,attendees,ticket_price,total_revenue,event_date,last_updated
0,5438,Food Fair,KenyaLive,237,7.76,1839.12,2025-06-08,2025-06-08 14:03:00
1,6317,Tech Conference,InnovateAfrica,292,28.16,8222.72,2025-06-08,2025-06-08 22:50:00
2,5971,Tech Conference,StartupHub,230,24.1,5543.0,2025-06-08,2025-06-08 01:02:00
3,6589,Art Expo,InnovateAfrica,66,89.61,5914.26,2025-06-08,2025-06-08 00:15:00
4,3225,Food Fair,StartupHub,217,47.65,10340.05,2025-06-08,2025-06-08 13:01:00


In [3]:
# Set initial last extraction time for event logs (e.g., midway through the event range)
with open("last_extraction.txt", "w") as f:
    f.write("2025-04-20 12:00:00")


In [4]:
# INCREMENTAL EXTRACTION
import pandas as pd

# Read the last extraction timestamp
with open("last_extraction.txt", "r") as f:
    last_extraction = f.read().strip()

# Load event dataset
df = pd.read_csv("custom_data.csv", parse_dates=["last_updated"])

# Convert to datetime
last_extraction_time = pd.to_datetime(last_extraction)

# Filter only events updated after the last extraction
df_incremental = df[df['last_updated'] > last_extraction_time]

# Display how many new/updated events were extracted
print(f" Extracted {len(df_incremental)} event rows incrementally since last check.")

# Show sample
df_incremental.head()


 Extracted 363 event rows incrementally since last check.


Unnamed: 0,event_id,event_name,organizer,attendees,ticket_price,total_revenue,event_date,last_updated
0,5438,Food Fair,KenyaLive,237,7.76,1839.12,2025-06-08,2025-06-08 14:03:00
1,6317,Tech Conference,InnovateAfrica,292,28.16,8222.72,2025-06-08,2025-06-08 22:50:00
2,5971,Tech Conference,StartupHub,230,24.1,5543.0,2025-06-08,2025-06-08 01:02:00
3,6589,Art Expo,InnovateAfrica,66,89.61,5914.26,2025-06-08,2025-06-08 00:15:00
4,3225,Food Fair,StartupHub,217,47.65,10340.05,2025-06-08,2025-06-08 13:01:00


In [5]:
# Get the most recent event update timestamp
new_checkpoint = df['last_updated'].max()

# Save it to the checkpoint file
with open("last_extraction.txt", "w") as f:
    f.write(new_checkpoint.isoformat())

print(f" Updated last_extraction.txt to latest event timestamp: {new_checkpoint}")

 Updated last_extraction.txt to latest event timestamp: 2025-08-26 18:19:00
