In [7]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

teams = ['Arsenal', 'Chelsea', 'Liverpool', 'Man City', 'Man United', 'Spurs', 'Barcelona', 'Real Madrid', 'PSG', 'Bayern']
data = []
start_date = datetime(2025, 4, 1)

match_id_counter = 1000

for i in range(1, 61):  # 60 days
    date = start_date + timedelta(days=i)
    num_matches = random.randint(2, 5)  # 2–5 matches per day
    for _ in range(num_matches):
        home, away = random.sample(teams, 2)
        home_score = random.randint(0, 5)
        away_score = random.randint(0, 5)
        last_updated = date + timedelta(hours=random.randint(0, 23), minutes=random.randint(0, 59))

        data.append({
            'match_id': match_id_counter,
            'home_team': home,
            'away_team': away,
            'date': date.date().isoformat(),
            'home_score': home_score,
            'away_score': away_score,
            'last_updated': last_updated.isoformat()
        })
        match_id_counter += 1

df = pd.DataFrame(data)
df.to_csv('custom_data.csv', index=False)
df.head()


Unnamed: 0,match_id,home_team,away_team,date,home_score,away_score,last_updated
0,1000,Barcelona,Arsenal,2025-04-02,2,5,2025-04-02T10:03:00
1,1001,PSG,Man United,2025-04-02,0,5,2025-04-02T23:29:00
2,1002,PSG,Man United,2025-04-03,1,0,2025-04-03T17:58:00
3,1003,Barcelona,Arsenal,2025-04-03,5,1,2025-04-03T22:12:00
4,1004,Barcelona,PSG,2025-04-03,1,4,2025-04-03T14:45:00


In [9]:
# FULL EXTRACTION
df_full = pd.read_csv("custom_data.csv", parse_dates=["last_updated"])
print(f"Extracted {len(df_full)} rows fully.")
df_full.head()


Extracted 218 rows fully.


Unnamed: 0,match_id,home_team,away_team,date,home_score,away_score,last_updated
0,1000,Barcelona,Arsenal,2025-04-02,2,5,2025-04-02 10:03:00
1,1001,PSG,Man United,2025-04-02,0,5,2025-04-02 23:29:00
2,1002,PSG,Man United,2025-04-03,1,0,2025-04-03 17:58:00
3,1003,Barcelona,Arsenal,2025-04-03,5,1,2025-04-03 22:12:00
4,1004,Barcelona,PSG,2025-04-03,1,4,2025-04-03 14:45:00


In [10]:
# Simulate previous extraction (only run ONCE)
with open("last_extraction.txt", "w") as f:
    f.write("2025-04-20 12:00:00")


In [14]:
with open("last_extraction.txt", "r") as f:
    last_extraction = f.read().strip()

df = pd.read_csv("custom_data.csv", parse_dates=["last_updated"])
last_extraction_time = pd.to_datetime(last_extraction)

df_incremental = df[df['last_updated'] > last_extraction_time]
print(f"Extracted {len(df_incremental)} new/updated rows since {last_extraction}.")
df_incremental.head()


Extracted 0 new/updated rows since .


Unnamed: 0,match_id,home_team,away_team,date,home_score,away_score,last_updated


In [16]:
new_checkpoint = df['last_updated'].max()
with open("last_extraction.txt", "w") as f:
    f.write(new_checkpoint.isoformat())
print(f"Updated last_extraction.txt to {new_checkpoint}")


Updated last_extraction.txt to 2025-05-31 08:20:00
