In [1]:
# Import required libraries
import pandas as pd                 # For handling data and saving to CSV
import random                       # For generating random choices/numbers
from datetime import datetime, timedelta  # For working with dates
import faker                        # To generate fake names and realistic dates

# Initialize the Faker generator
fake = faker.Faker()

# Define a list of sample team names
teams = ['Falcons', 'Sharks', 'Tigers', 'Wolves', 'Eagles']

# Initialize an empty list to hold our player records
data = []

# Generate 200 synthetic player performance records
for i in range(200):
    player_id = f'P{i+1:03d}'                       # Create a player ID like P001, P002, ..., P200
    player_name = fake.name()                       # Generate a realistic fake player name
    team = random.choice(teams)                     # Randomly assign one of the 5 teams
    match_date = fake.date_between(                 # Generate a match date within the last 60 days
        start_date='-60d', end_date='today'
    )
    points = random.randint(0, 50)                  # Randomly assign points scored (0–50)
    assists = random.randint(0, 15)                 # Random assists (0–15)
    rebounds = random.randint(0, 20)                # Random rebounds (0–20)
    last_updated = match_date + timedelta(          # Simulate a 'last_updated' datetime for this record
        days=random.randint(0, 5)
    )

    # Add the generated data as a row to our data list
    data.append([
        player_id, player_name, team, match_date,
        points, assists, rebounds, last_updated
    ])

# Convert the list of data into a pandas DataFrame
df = pd.DataFrame(data, columns=[
    'player_id', 'player_name', 'team', 'match_date',
    'points_scored', 'assists', 'rebounds', 'last_updated'
])

# Save the DataFrame to a CSV file named 'custom_data.csv'
df.to_csv('custom_data.csv', index=False)

# Print a confirmation message
print("✅ custom_data.csv generated with 200 rows.")


✅ custom_data.csv generated with 200 rows.


In [2]:
import pandas as pd

# Step 1: Load the entire dataset
df_full = pd.read_csv("custom_data.csv")

# Step 2: Display basic stats
print("Basic Information:")
print(f"Total Rows: {df_full.shape[0]}")
print(f"Total Columns: {df_full.shape[1]}\n")
print("Sample Data:")
display(df_full.head())

# Step 3: Confirm full extraction
print(f"\n✅ Extracted {df_full.shape[0]} rows fully.")


Basic Information:
Total Rows: 200
Total Columns: 8

Sample Data:


Unnamed: 0,player_id,player_name,team,match_date,points_scored,assists,rebounds,last_updated
0,P001,Amy Mitchell,Wolves,2025-04-15,21,1,12,2025-04-18
1,P002,Andrea Nash,Sharks,2025-04-29,6,10,16,2025-04-30
2,P003,Kerri West,Tigers,2025-05-08,45,14,9,2025-05-09
3,P004,Joel Rogers,Eagles,2025-04-22,26,6,9,2025-04-24
4,P005,Craig Hudson,Eagles,2025-05-06,5,10,6,2025-05-09



✅ Extracted 200 rows fully.


In [6]:
# Set initial last extraction time (e.g., halfway through the data range)
with open("last_extraction.txt", "w") as f:
    f.write("2025-05-20 00:00:00") 


In [7]:
from datetime import datetime

# Step 1: Load the dataset again
df = pd.read_csv("custom_data.csv")

# Step 2: Convert the 'Updated_At' column to datetime
df['last_updated'] = pd.to_datetime(df['last_updated'])

# Step 3: Read the last extraction time from file
with open("last_extraction.txt", "r") as f:
    last_extraction_str = f.read().strip()
last_extraction_time = datetime.strptime(last_extraction_str, "%Y-%m-%d %H:%M:%S")

# Step 4: Filter for only new or updated rows
incremental_df = df[df['last_updated'] > last_extraction_time]

# Step 5: Display the result
print(f"✅ Extracted {incremental_df.shape[0]} rows incrementally since last check.\n")
display(incremental_df.head())


✅ Extracted 79 rows incrementally since last check.



Unnamed: 0,player_id,player_name,team,match_date,points_scored,assists,rebounds,last_updated
5,P006,David Ritter,Falcons,2025-06-06,45,7,5,2025-06-06
8,P009,Tanner Fox,Sharks,2025-06-02,48,12,11,2025-06-03
11,P012,Victoria Wolf,Falcons,2025-06-09,33,11,6,2025-06-14
13,P014,Arthur Lewis,Wolves,2025-05-22,21,10,8,2025-05-23
14,P015,Brian Brown,Sharks,2025-05-21,11,14,15,2025-05-25


In [8]:
from datetime import datetime

# Step 1: Get the current time
new_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# Step 2: Write it to last_extraction.txt
with open("last_extraction.txt", "w") as f:
    f.write(new_timestamp)

print(f"✅ Updated last_extraction.txt with timestamp: {new_timestamp}")


✅ Updated last_extraction.txt with timestamp: 2025-06-10 18:28:19
