In [6]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Build a synthetic fixtures table (2-5 matches per day for 60 days) and save
# it to custom_data.csv for the extraction cells to read.
#
# A fixed seed on a private generator makes the cell reproducible: every
# Restart & Run All writes the exact same CSV, and the global `random` state
# is left untouched.
SEED = 42
rng = random.Random(SEED)

teams = ['Arsenal', 'Chelsea', 'Liverpool', 'Man City', 'Man United', 'Spurs', 'Barcelona', 'Real Madrid', 'PSG', 'Bayern']
data = []
start_date = datetime(2025, 4, 1)

match_id_counter = 1000

for i in range(1, 61):  # 60 days, the first one being the day after start_date
    date = start_date + timedelta(days=i)
    num_matches = rng.randint(2, 5)  # 2–5 matches per day
    for _ in range(num_matches):
        # sample() draws without replacement, so a team never plays itself.
        home, away = rng.sample(teams, 2)
        home_score = rng.randint(0, 5)
        away_score = rng.randint(0, 5)
        # Random time of day on the match date, at minute resolution.
        last_updated = date + timedelta(hours=rng.randint(0, 23), minutes=rng.randint(0, 59))

        data.append({
            'match_id': match_id_counter,
            'home_team': home,
            'away_team': away,
            'date': date.date().isoformat(),
            'home_score': home_score,
            'away_score': away_score,
            'last_updated': last_updated.isoformat()
        })
        match_id_counter += 1

df = pd.DataFrame(data)
df.to_csv('custom_data.csv', index=False)
df


Unnamed: 0,match_id,home_team,away_team,date,home_score,away_score,last_updated
0,1000,Chelsea,Man United,2025-04-02,0,1,2025-04-02T03:07:00
1,1001,Man City,PSG,2025-04-02,2,2,2025-04-02T18:54:00
2,1002,Barcelona,Spurs,2025-04-02,2,3,2025-04-02T11:57:00
3,1003,Man United,PSG,2025-04-02,5,2,2025-04-02T13:37:00
4,1004,Arsenal,PSG,2025-04-03,0,1,2025-04-03T08:53:00
...,...,...,...,...,...,...,...
188,1188,Arsenal,Bayern,2025-05-30,5,3,2025-05-30T20:18:00
189,1189,Chelsea,Man City,2025-05-30,5,2,2025-05-30T13:00:00
190,1190,PSG,Barcelona,2025-05-30,0,4,2025-05-30T19:43:00
191,1191,PSG,Liverpool,2025-05-31,3,5,2025-05-31T14:39:00


In [2]:
# Full extraction: load every row of the CSV in a single pass, converting the
# `last_updated` column to datetimes while reading.
df_full = pd.read_csv(
    "custom_data.csv",
    parse_dates=["last_updated"],
)
print(f"Extracted {len(df_full)} rows fully.")
# Preview of the first rows only.
df_full.head()


Extracted 203 rows fully.


Unnamed: 0,match_id,home_team,away_team,date,home_score,away_score,last_updated
0,1000,Chelsea,Man United,2025-04-02,3,5,2025-04-02 14:08:00
1,1001,Man United,Barcelona,2025-04-02,2,5,2025-04-02 10:29:00
2,1002,Real Madrid,Man City,2025-04-03,2,3,2025-04-03 15:51:00
3,1003,Man United,Barcelona,2025-04-03,5,1,2025-04-03 14:38:00
4,1004,Spurs,Real Madrid,2025-04-03,1,2,2025-04-03 00:47:00


In [3]:
# Simulate a previous extraction by creating the checkpoint file.
# Mode "x" (exclusive creation) writes only when the file does not exist yet,
# so the "run once" rule is enforced by the code: re-running this cell cannot
# rewind a checkpoint that has since been moved forward.
try:
    with open("last_extraction.txt", "x") as f:
        f.write("2025-04-20 12:00:00")
except FileExistsError:
    print("last_extraction.txt already exists; keeping the existing checkpoint.")


In [4]:
# Incremental extraction: keep only rows modified after the stored checkpoint.

# Read the checkpoint written by the previous extraction run.
with open("last_extraction.txt") as f:
    last_extraction = f.read().strip()

df = pd.read_csv(
    "custom_data.csv",
    parse_dates=["last_updated"],
)
last_extraction_time = pd.to_datetime(last_extraction)

# Strictly greater-than: a row stamped exactly at the checkpoint is not re-extracted.
df_incremental = df.loc[df['last_updated'].gt(last_extraction_time)]
print(f"Extracted {len(df_incremental)} new/updated rows since {last_extraction}.")
df_incremental.head()


Extracted 135 new/updated rows since 2025-04-20 12:00:00.


Unnamed: 0,match_id,home_team,away_team,date,home_score,away_score,last_updated
68,1068,Liverpool,Bayern,2025-04-21,2,2,2025-04-21 11:06:00
69,1069,Arsenal,Spurs,2025-04-21,2,3,2025-04-21 08:55:00
70,1070,Man City,Spurs,2025-04-21,2,4,2025-04-21 03:45:00
71,1071,Arsenal,Barcelona,2025-04-21,0,2,2025-04-21 09:57:00
72,1072,Barcelona,PSG,2025-04-21,0,4,2025-04-21 23:14:00


In [5]:
# Advance the checkpoint to the newest `last_updated` value in the loaded data.
new_checkpoint = df['last_updated'].max()

# With an empty frame (or an all-null column) max() is NaT, and NaT.isoformat()
# yields the literal string "NaT". Writing that would corrupt the checkpoint,
# because every later "last_updated > checkpoint" comparison would be False.
if pd.isna(new_checkpoint):
    print("No valid last_updated values found; last_extraction.txt left unchanged.")
else:
    with open("last_extraction.txt", "w") as f:
        f.write(new_checkpoint.isoformat())
    print(f"Updated last_extraction.txt to {new_checkpoint}")


Updated last_extraction.txt to 2025-05-31 17:47:00
