In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Build a synthetic football-results dataset and persist it as custom_data.csv.
# A fixed seed makes the generated rows (and therefore every downstream cell's
# output) identical on every Restart Kernel -> Run All.
SEED = 42
rng = random.Random(SEED)  # private generator; the global `random` state is left untouched

teams = ['Arsenal', 'Chelsea', 'Liverpool', 'Man City', 'Man United', 'Spurs', 'Barcelona', 'Real Madrid', 'PSG', 'Bayern']
data = []
start_date = datetime(2025, 4, 1)

match_id_counter = 1000  # first match_id; incremented once per generated match

# NOTE(review): offsets run 1..60, so the first generated day is
# start_date + 1 day (2025-04-02), not start_date itself — confirm this is intended.
for i in range(1, 61):  # 60 days
    date = start_date + timedelta(days=i)
    num_matches = rng.randint(2, 5)  # 2–5 matches per day
    for _ in range(num_matches):
        # sample() without replacement guarantees home != away
        home, away = rng.sample(teams, 2)
        home_score = rng.randint(0, 5)
        away_score = rng.randint(0, 5)
        # Random time of day on the match date, at minute resolution
        last_updated = date + timedelta(hours=rng.randint(0, 23), minutes=rng.randint(0, 59))

        data.append({
            'match_id': match_id_counter,
            'home_team': home,
            'away_team': away,
            'date': date.date().isoformat(),
            'home_score': home_score,
            'away_score': away_score,
            'last_updated': last_updated.isoformat()
        })
        match_id_counter += 1

# One DataFrame built from the list of records, then written without the index column
df = pd.DataFrame(data)
df.to_csv('custom_data.csv', index=False)
df.head()


Unnamed: 0,match_id,home_team,away_team,date,home_score,away_score,last_updated
0,1000,PSG,Man City,2025-04-02,4,3,2025-04-02T04:14:00
1,1001,Arsenal,Spurs,2025-04-02,2,0,2025-04-02T08:36:00
2,1002,Spurs,Liverpool,2025-04-02,5,2,2025-04-02T08:56:00
3,1003,Arsenal,Real Madrid,2025-04-02,0,3,2025-04-02T22:58:00
4,1004,Man United,Barcelona,2025-04-02,2,5,2025-04-02T06:01:00


In [44]:


# Section 1: Full Extraction
import pandas as pd

# Read every row of the generated CSV back into memory
df = pd.read_csv('custom_data.csv')

# Summarise what was loaded: row count, column names, then a short preview
row_count = df.shape[0]
column_names = df.columns.tolist()
print(f"Extracted {row_count} rows fully.")
print(f"Columns: {column_names}")
display(df.head())



Extracted 197 rows fully.
Columns: ['match_id', 'home_team', 'away_team', 'date', 'home_score', 'away_score', 'last_updated']


Unnamed: 0,match_id,home_team,away_team,date,home_score,away_score,last_updated
0,1000,PSG,Man City,2025-04-02,4,3,2025-04-02T04:14:00
1,1001,Arsenal,Spurs,2025-04-02,2,0,2025-04-02T08:36:00
2,1002,Spurs,Liverpool,2025-04-02,5,2,2025-04-02T08:56:00
3,1003,Arsenal,Real Madrid,2025-04-02,0,3,2025-04-02T22:58:00
4,1004,Man United,Barcelona,2025-04-02,2,5,2025-04-02T06:01:00


In [49]:
# Section 2: Incremental Extraction
from datetime import datetime

# Read the checkpoint written by the previous run. A missing or empty file
# means there has been no previous extraction, so every row counts as new;
# this lets the notebook run end-to-end before the checkpoint exists.
try:
    with open('last_extraction.txt', 'r') as f:
        raw_time = f.read().strip().replace('T', ' ')  # Replace T with space
except FileNotFoundError:
    raw_time = ''

if raw_time:
    last_time = datetime.strptime(raw_time, '%Y-%m-%d %H:%M:%S')
else:
    last_time = None
    print("No previous extraction timestamp found; extracting all rows.")

# Convert 'last_updated' to datetime so it can be compared with last_time
# (re-running this cell is safe: converting an already-datetime column is a no-op)
df['last_updated'] = pd.to_datetime(df['last_updated'])

# Keep only records updated strictly after the checkpoint
if last_time is None:
    incremental_df = df.copy()
else:
    incremental_df = df[df['last_updated'] > last_time]

print(f"\nExtracted {len(incremental_df)} rows incrementally since last check.")
display(incremental_df.head())



Extracted 0 rows incrementally since last check.


Unnamed: 0,match_id,home_team,away_team,date,home_score,away_score,last_updated


In [51]:

# Section 3: Save New Timestamp
# Persist the newest 'last_updated' value as the checkpoint for the next
# incremental run; the file is left alone when nothing new was extracted.
if incremental_df.empty:
    print("No new data. Timestamp not updated.")
else:
    new_time = incremental_df['last_updated'].max()
    with open('last_extraction.txt', 'w') as stamp_file:
        stamp_text = new_time.strftime('%Y-%m-%d %H:%M:%S')
        stamp_file.write(stamp_text)
    print(f"Timestamp updated to: {new_time}")


No new data. Timestamp not updated.
