## Load Environment Variables


In [1]:
# Load variables from a local .env file into the process environment before any
# service client is created (the recorded run returned True).
# NOTE(review): presumably this supplies the Hopsworks / Kaggle credentials used
# by later cells — confirm which keys are required.
import dotenv

dotenv.load_dotenv()


True

## Import Libraries


In [2]:
from datetime import datetime
from pathlib import Path

import hopsworks
import kagglehub
import pandas as pd


  from .autonotebook import tqdm as notebook_tqdm


## Connect to Hopsworks and Load Recent Data


In [3]:
# Name of the Hopsworks project that hosts the tennis feature store.
HOPSWORKS_PROJECT = "ATP_Tennis_Prediction"

# Log in to the project, then obtain the feature store handle used by the
# cells below.
project = hopsworks.login(project=HOPSWORKS_PROJECT)
fs = project.get_feature_store()

2026-01-05 17:05:05,806 INFO: Initializing external client
2026-01-05 17:05:05,807 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2026-01-05 17:05:06,673 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/3204


In [4]:
# Feature group holding the processed match rows, pinned to an explicit version.
FEATURE_GROUP_NAME = "tennis_matches"
FEATURE_GROUP_VERSION = 2

tennis_matches_fg = fs.get_feature_group(
    name=FEATURE_GROUP_NAME, version=FEATURE_GROUP_VERSION
)

In [5]:
# Read the whole feature group into a DataFrame; it is only used below to find
# the most recent stored match date (the recorded run took ~30 s).
existing_df = tennis_matches_fg.read()

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (30.56s) 


In [6]:
# The stored date column may be spelled "Date" or "date"; take the first
# spelling that exists (capitalised first), parse it to datetimes, and remember
# its name in `date_col` for the cells below.
for candidate in ("Date", "date"):
    if candidate in existing_df.columns:
        existing_df[candidate] = pd.to_datetime(existing_df[candidate])
        date_col = candidate
        break
else:
    # Neither spelling is present, so the cut-off date cannot be determined.
    raise ValueError("Cannot find Date or date column in feature store!")

In [7]:
# Most recent match date already stored in the feature group; later cells use it
# as the cut-off for deciding which downloaded matches are new.
latest_date_in_fs = existing_df[date_col].max()

# An empty feature group (or one whose dates are all missing) yields NaT here.
# Every "Date > NaT" comparison is False, so the notebook would otherwise report
# "up to date" without having anything to compare against. Fail loudly instead.
if pd.isna(latest_date_in_fs):
    raise ValueError(
        f"No valid dates found in feature store column {date_col!r}; "
        "cannot determine which matches are new."
    )

print(f"latest date -> {latest_date_in_fs}")

latest date -> 2025-11-16 00:00:00


## Get Latest Data


In [8]:
# Kaggle dataset handle and the CSV file expected inside the downloaded folder.
KAGGLE_DATASET = "dissfya/atp-tennis-2000-2023daily-pull"
CSV_FILENAME = "atp_tennis.csv"

# dataset_download returns the local directory of the downloaded dataset.
path = kagglehub.dataset_download(KAGGLE_DATASET)
print(f"Data Path -> {path}")

dataset_dir = Path(path)
data_file = dataset_dir / CSV_FILENAME

Data Path -> C:\Users\dtanu\.cache\kagglehub\datasets\dissfya\atp-tennis-2000-2023daily-pull\versions\971


In [9]:
# Load the raw match file, parse the Date column to datetimes, and order the
# matches chronologically with a fresh 0..n-1 index.
# NOTE(review): the CSV carries an unnamed index column that surfaces as
# "Unnamed: 0" in the preview — confirm the downstream feature code ignores it.
df_latest = (
    pd.read_csv(data_file)
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
    .sort_values(by="Date")
    .reset_index(drop=True)
)

In [10]:
# Report the size and date coverage of the freshly downloaded dataset.
first_match_date = df_latest["Date"].min()
last_match_date = df_latest["Date"].max()

print(f"latest dataset length -> {len(df_latest):,}")
print(f"Date Range -> {first_match_date} to {last_match_date}")

latest dataset length -> 66,681
Date Range -> 2000-01-03 00:00:00 to 2025-11-16 00:00:00


In [11]:
# Preview the first rows to eyeball the column names and the parsed Date values.
df_latest.head()

Unnamed: 0,Tournament,Date,Series,Court,Surface,Round,Best of,Player_1,Player_2,Winner,Rank_1,Rank_2,Pts_1,Pts_2,Odd_1,Odd_2,Score
0,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Dosedel S.,Ljubicic I.,Dosedel S.,63,77,-1,-1,-1.0,-1.0,6-4 6-2
1,Qatar Open,2000-01-03,International,Outdoor,Hard,1st Round,3,El Aynaoui Y.,Dupuis A.,El Aynaoui Y.,33,78,-1,-1,-1.0,-1.0,4-6 6-3 6-2
2,Qatar Open,2000-01-03,International,Outdoor,Hard,1st Round,3,Arazi H.,Cherkasov A.,Cherkasov A.,35,206,-1,-1,-1.0,-1.0,2-6 4-6
3,Qatar Open,2000-01-03,International,Outdoor,Hard,1st Round,3,Caratti C.,Heuberger I.,Caratti C.,211,219,-1,-1,-1.0,-1.0,7-6 4-6 6-4
4,Qatar Open,2000-01-03,International,Outdoor,Hard,1st Round,3,Al-Alawi S.K.,Berasategui A.,Berasategui A.,-1,60,-1,-1,-1.0,-1.0,3-6 4-6


## Identify and Process New Matches


In [12]:
# Keep only matches dated strictly after the newest match already stored in the
# feature store; copy so later edits never touch df_latest.
is_after_cutoff = df_latest["Date"] > latest_date_in_fs
df_new = df_latest.loc[is_after_cutoff].copy()

In [13]:
# Columns shown when previewing newly found matches.
preview_columns = ["Date", "Tournament", "Player_1", "Player_2", "Winner"]

if len(df_new) > 0:
    # New matches exist: show their date span and a short sample.
    earliest_new = df_new["Date"].min()
    most_recent_new = df_new["Date"].max()
    print(f"Date range of new matches: {earliest_new} to {most_recent_new}")
    print("First few new matches")
    print(df_new[preview_columns].head(10))
else:
    # Nothing newer than the stored cut-off date.
    print("No new matches to process. Feature store is up to date!")
    print(f"latest match in fs -> {latest_date_in_fs}")


No new matches to process. Feature store is up to date!
latest match in fs -> 2025-11-16 00:00:00


In [14]:
from utils import (
    compute_derived_features,
    compute_match_percentages,
    compute_player_match_history,
    create_symmetric_dataset,
    encode_categorical_features,
    final_train_data,
)

if len(df_new) > 0:
    # NOTE(review): the pipeline runs on df_latest (every match), not df_new;
    # presumably the history-based features need earlier matches — confirm.
    # Rows at or before the stored cut-off date are dropped again at the end.
    print("Compute new match history")
    new_match_history = compute_player_match_history(df_latest)

    print("Compute new match percentages")
    new_match_percentages = compute_match_percentages(new_match_history)

    print("Encode new match info")
    new_encoded = encode_categorical_features(new_match_percentages)

    print("Derive features from new matches ")
    new_derived = compute_derived_features(new_encoded)

    print("Create new symmetric match data")
    new_symmetric = create_symmetric_dataset(new_derived)

    print("Final Updated Data")
    new_final = final_train_data(new_symmetric)

    # Stamp every row with the processing time of this run.
    new_final["timestamp"] = datetime.now()

    # Dates are written as "YYYY-MM-DD" strings; in that format lexicographic
    # order matches chronological order, so the cut-off is compared as a string.
    latest_date_str = latest_date_in_fs.strftime("%Y-%m-%d")
    new_final["Date"] = new_final["Date"].dt.strftime("%Y-%m-%d")
    is_new_row = new_final["Date"] > latest_date_str
    new_final = new_final.loc[is_new_row].copy()

    row_count, column_count = new_final.shape
    print(f"Processed -> {row_count:,} rows (symmetric)")
    print(f"Columns -> {column_count}")


## Insert into Hopsworks


In [15]:
# Write the newly processed rows to the feature group.
# `new_final` only exists when df_new was non-empty, so that check must come
# first (short-circuit avoids a NameError). The second check skips the write
# when the date filter in the processing cell left no rows, so an empty frame
# is never sent to the feature store.
if len(df_new) > 0 and len(new_final) > 0:
    tennis_matches_fg.insert(new_final)
    print(f"Inserted -> {len(new_final):,} rows")
elif len(df_new) > 0:
    print("New matches were found but processing produced no rows; nothing inserted.")