In [83]:
from src import loader
from riix.models.elo import Elo
from riix.models.glicko2 import Glicko2
from riix.models.ttt import TrueSkillThroughTime
from riix.utils import MatchupDataset, split_matchup_dataset, generate_matchup_data
from riix.metrics import binary_metrics_suite
#from riix.models.eloplus import EloPlus
from riix.models.glicko import Glicko
import polars as pl
import numpy as np
import trueskill_through_time as ttt
import cProfile
from scipy.optimize import minimize_scalar


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [72]:
mdb_file = r"/mnt/c/Program Files (x86)/OnCourt/OnCourt.mdb"
password = "qKbE8lWacmYQsZ2"
atp = loader.TennisLoader(mdb_file, password)
atp.clean_games()

In [73]:
dataset = MatchupDataset(
    atp.to_riix_format(sample_games=500000),
    competitor_cols = ['P1', 'P2'],
    outcome_col = 'Result',
    datetime_col = 'Date',
    rating_period = '1D',
)

loaded dataset with:
500000 matchups
24414 unique competitors
3729 rating periods of length 1D


In [74]:
train_dataset, test_dataset = split_matchup_dataset(dataset, test_fraction=0.01)
print(f'{len(train_dataset)=}, {len(test_dataset)=}')

split into train_dataset of length 495000 and test_dataset of length 5000
len(train_dataset)=495000, len(test_dataset)=5000


In [75]:
model_Elo = Elo(train_dataset.competitors)
model_Elo.fit_dataset(train_dataset)
test_probs_Elo = model_Elo.fit_dataset(test_dataset, return_pre_match_probs=True)
test_metrics_Elo = binary_metrics_suite(probs=test_probs_Elo, outcomes=test_dataset.outcomes)
print(test_metrics_Elo)

{'accuracy': np.float64(0.6952), 'accuracy_without_draws': np.float64(0.6952), 'log_loss': np.float64(0.5701786642923271), 'brier_score': np.float64(0.1942084987564474)}


In [76]:
model_Glicko2 = Glicko2(train_dataset.competitors)
model_Glicko2.fit_dataset(train_dataset)
test_probs_Glicko2 = model_Glicko2.fit_dataset(test_dataset, return_pre_match_probs=True)
test_metrics_Glicko2 = binary_metrics_suite(probs=test_probs_Glicko2, outcomes=test_dataset.outcomes)
print(test_metrics_Glicko2)

{'accuracy': np.float64(0.7108), 'accuracy_without_draws': np.float64(0.7108), 'log_loss': np.float64(0.545839529411334), 'brier_score': np.float64(0.18508815916194454)}


In [79]:
model_ttt = TrueSkillThroughTime(train_dataset, beta = 0.65)

In [None]:
pr = cProfile.Profile()
pr.enable()
model_ttt.iterate(5)
pr.disable()



Iteration =  0 , step =  (0.0009617597709858927, 0.0002067906422158483)
End


NameError: name 'pstats' is not defined

In [None]:
pr.print_stats()

         282934571 function calls (280927791 primitive calls) in 183.243 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.000    0.000 3767132350.py:1(<module>)
     7216    0.009    0.000    0.013    0.000 batch.py:27(__len__)
  5632502    7.678    0.000   70.692    0.000 batch.py:46(posterior)
    14432    1.363    0.000   52.800    0.004 batch.py:49(posteriors)
  1999564    5.348    0.000   34.825    0.000 batch.py:55(within_prior)
   999782    0.813    0.000   38.116    0.000 batch.py:61(within_priors)
  1999564    1.129    0.000   35.954    0.000 batch.py:62(<listcomp>)
     7216    7.051    0.001  109.500    0.015 batch.py:64(iteration)
   908318    0.591    0.000    4.144    0.000 batch.py:85(forward_prior_out)
   908151    1.533    0.000    6.440    0.000 batch.py:88(backward_prior_out)
     3608    1.298    0.000   54.883    0.015 batch.py:92(new_backward_info)
     3608    1.205    

In [81]:
test_probs_ttt = model_ttt.fit_dataset(test_dataset, return_pre_match_probs=True)

Limited iteration: 0
Limited iteration: 1
Limited iteration: 2
Limited iteration: 3
Limited iteration: 4
Iteration =  0 , step =  (10.850325051799327, 4.714155757829291)
Iteration =  1 , step =  (0.36786656339321766, 0.0738494359808739)
Iteration =  2 , step =  (0.008681859369442435, 0.03091722962626675)
Iteration =  3 , step =  (0.006615427907657789, 0.0008445541509489729)
End
Limited iteration: 0
Limited iteration: 1
Limited iteration: 2
Limited iteration: 3
Limited iteration: 4
Iteration =  0 , step =  (1.1915678360032453, 0.3807902630936333)
Iteration =  1 , step =  (0.04857845077452616, 0.047723519560588734)
Iteration =  2 , step =  (0.006020208759512258, 0.004843707893582128)
End
Limited iteration: 0
Limited iteration: 1
Limited iteration: 2
Limited iteration: 3
Limited iteration: 4
Iteration =  0 , step =  (1.0393060003957735, 0.16531390577562743)
Iteration =  1 , step =  (0.04761663428900409, 0.02498778301708704)
Iteration =  2 , step =  (0.004549812887353788, 0.001223996948558

In [82]:
test_metrics_ttt = binary_metrics_suite(probs=test_probs_ttt, outcomes=test_dataset.outcomes)
print(test_metrics_ttt)

{'accuracy': np.float64(0.7216), 'accuracy_without_draws': np.float64(0.7216), 'log_loss': np.float64(0.543611409578523), 'brier_score': np.float64(0.18294739639909088)}


In [26]:
test_probs_ttt = model_ttt.fit_dataset(test_dataset)

TypeError: TrueSkillThroughTime.add_games() takes from 2 to 3 positional arguments but 5 were given

In [136]:
def objective_function_EloPlus(lambda_reg, dataset, train_dataset, test_dataset):
    """
    Objective function to minimize. Takes gamma and returns Brier score.
    
    Parameters:
        gamma: The parameter to optimize
        dataset: Dataset containing competitors
        train_dataset: Training dataset
        test_dataset: Test dataset for evaluation
        
    Returns:
        float: Brier score (lower is better)
    """
    # Initialize model with current gamma
    model = EloPlus(dataset.competitors, lambda_reg=lambda_reg, total_iterations = 50, update_method = 'batched')
    model.fit_dataset(train_dataset)
    test_probs = model.fit_dataset(test_dataset, return_pre_match_probs=True)
    test_metrics = binary_metrics_suite(probs=test_probs, outcomes=test_dataset.outcomes)
    brier = test_metrics['brier_score']
    print("lambda_reg of: ", lambda_reg, " gives brier of: ", brier)
    return test_metrics['brier_score']

In [None]:
def objective_function_Elo(k, dataset, train_dataset, test_dataset):
    """
    Objective function to minimize. Takes gamma and returns Brier score.
        
    Returns:
        float: Brier score (lower is better)
    """
    # Initialize model with current gamma
    model = Elo(dataset.competitors, k=k)
    model.fit_dataset(train_dataset)
    test_probs = model.fit_dataset(test_dataset, return_pre_match_probs=True)
    test_metrics = binary_metrics_suite(probs=test_probs, outcomes=test_dataset.outcomes)
    brier = test_metrics['brier_score']
    print("K of: ", k, " gives brier of: ", brier)
    return brier

In [92]:
def objective_function_Glicko(c, dataset, train_dataset, test_dataset):
    """
    Objective function to minimize. Takes gamma and returns Brier score.
        
    Returns:
        float: Brier score (lower is better)
    """
    # Initialize model with current gamma
    model = Glicko(dataset.competitors, c = c)
    model.fit_dataset(train_dataset)
    test_probs = model.fit_dataset(test_dataset, return_pre_match_probs=True)
    test_metrics = binary_metrics_suite(probs=test_probs, outcomes=test_dataset.outcomes)
    brier = test_metrics['brier_score']
    print("c of: ", c, " gives brier of: ", brier)
    return brier

In [69]:
def objective_function_ttt(beta, dataset, train_dataset, test_dataset):
    """
    Objective function to minimize. Takes gamma and returns Brier score.
        
    Returns:
        float: Brier score (lower is better)
    """
    # Initialize model with current gamma
    model = TrueSkillThroughTime(train_dataset, beta = beta)
    model.iterate(5)
    test_probs = model.fit_dataset(test_dataset, return_pre_match_probs=True, iterations = 5)
    test_metrics = binary_metrics_suite(probs=test_probs, outcomes=test_dataset.outcomes)
    brier = test_metrics['brier_score']
    print("Beta of: ", beta, " gives brier of: ", brier)
    return brier

In [70]:
result = minimize_scalar(
    objective_function_ttt,
    args=(dataset, train_dataset, test_dataset),
    method='brent',  # Best general-purpose 1D optimizer
)

Iteration =  0 , step =  (17.0282089382114, 5.655630358950269)
Iteration =  1 , step =  (8.879721459636873, 2.209164802281448)
Iteration =  2 , step =  (6.121663292227514, 1.6556274525094823)
Iteration =  3 , step =  (2.468722529145867, 0.6672988785335886)
Iteration =  4 , step =  (1.1472482672578521, 0.3722250327117397)
End
Limited iteration: 0
Limited iteration: 1
Limited iteration: 2
Limited iteration: 3
Limited iteration: 4
Iteration =  0 , step =  (12.596027960104927, 5.026531001012914)
Iteration =  1 , step =  (6.0155176198913125, 3.161192659036378)
Iteration =  2 , step =  (1.2047535499509334, 0.5338058919625286)
Iteration =  3 , step =  (0.18828318648172016, 0.057710441215105046)
Iteration =  4 , step =  (0.056342955536241135, 0.019461579838512932)
End
Limited iteration: 0
Limited iteration: 1
Limited iteration: 2
Limited iteration: 3
Limited iteration: 4
Iteration =  0 , step =  (12.553509304455064, 4.3056275031837)
Iteration =  1 , step =  (1.8540022170034662, 1.4818921750831

KeyboardInterrupt: 

In [80]:
# Get results
print("Optimal variable", result.x, "gives brier of", result.fun)

Optimal variable 9.465461359819516 gives brier of 0.19587134446883256


In [9]:
a = atp.to_ttt_format(sample_games=50000)
h = ttt.History(composition = a[0], times = a[1])

Speedup branch


In [10]:
h.convergence(epsilon = 0.01, iterations = 10)

Iteration =  0 , step =  (14.945056624654391, 5.377917096145063)
Iteration =  1 , step =  (2.812648566510126, 0.9406496200515968)
Iteration =  2 , step =  (0.6736415422089026, 0.19307175884181205)
Iteration =  3 , step =  (0.16691655031202535, 0.050757047021259494)
Iteration =  4 , step =  (0.056196947812871656, 0.016280531610799898)
Iteration =  5 , step =  (0.02045164473815486, 0.005719036192219917)
Iteration =  6 , step =  (0.017492529891368847, 0.0033196865249522922)
Iteration =  7 , step =  (0.016703107565909647, 0.0030154666622732584)
Iteration =  8 , step =  (0.015375995091575945, 0.002739081857586978)
Iteration =  9 , step =  (0.014051774145483975, 0.0025132449104439125)
End


((0.014051774145483975, 0.0025132449104439125), 10)

In [28]:
dataset.matchups

array([[10617, 23969],
       [24225, 33677],
       [13428,  4026],
       ...,
       [34224, 36744],
       [30320, 16045],
       [18871, 34219]], dtype=int32)

P1,P2,Tour,Surface,Day,Date
i64,i64,i64,i8,i64,date
115,105,1,0,2568,1997-01-12
35,223,1,0,2558,1997-01-02
61,110,1,0,2558,1997-01-02
275,181,1,0,2558,1997-01-02
105,103,1,0,2558,1997-01-02
…,…,…,…,…,…
88766,44555,20203,0,12739,2024-11-17
50504,99846,20211,0,12738,2024-11-16
99982,84556,20211,0,12738,2024-11-16
95698,53207,20211,0,12738,2024-11-16


In [114]:
mu = [tp[1].mu for tp in h.learning_curves()[5992]]

In [138]:
players = [5992, 22807, 30470, 19, 47275]

[h.learning_curves()[player][-1] for player in players]

[(12704, N(mu=7.452, sigma=0.439)),
 (12736, N(mu=6.523, sigma=0.400)),
 (12723, N(mu=6.236, sigma=0.375)),
 (11510, N(mu=6.346, sigma=0.489)),
 (12739, N(mu=8.172, sigma=0.434))]

In [120]:
whr = whole_history_rating.Base()
whr.load_games(atp.to_whr_format())

In [135]:
whr.iterate(5)

In [50]:
import time
import numpy as np
from dataclasses import dataclass

# Current approach with Gaussian objects
class Gaussian:
    def __init__(self, mu, sigma):
        self.mu = mu
        self.sigma = sigma
    
    def __mul__(self, other):
        pi1, pi2 = self.sigma**-2, other.sigma**-2
        pi = pi1 + pi2
        tau = (self.mu * pi1 + other.mu * pi2) / pi
        sigma = np.sqrt(1/pi)
        return Gaussian(tau, sigma)

# New approach using just numbers
@dataclass
class FastMessage:
    mu: float
    sigma: float

def multiply_messages(m1: FastMessage, m2: FastMessage) -> FastMessage:
    pi1, pi2 = m1.sigma**-2, m2.sigma**-2
    pi = pi1 + pi2
    tau = (m1.mu * pi1 + m2.mu * pi2) / pi
    sigma = np.sqrt(1/pi)
    return FastMessage(tau, sigma)

def benchmark(n_messages=10000, n_iterations=5):
    """
    Benchmark comparing Gaussian objects vs direct numerical operations.
    
    Args:
        n_messages: Number of message updates to perform
        n_iterations: Number of iterations through the messages
    """
    # Generate random data
    np.random.seed(42)
    means = np.random.randn(n_messages)
    stds = np.abs(np.random.randn(n_messages)) + 1
    
    # Test Gaussian objects
    start = time.time()
    messages = [Gaussian(mu, sigma) for mu, sigma in zip(means, stds)]
    for _ in range(n_iterations):
        for i in range(1, len(messages)):
            messages[i] = messages[i] * messages[i-1]
    gaussian_time = time.time() - start
    
    # Test direct numerical
    start = time.time()
    fast_messages = [FastMessage(mu, sigma) for mu, sigma in zip(means, stds)]
    for _ in range(n_iterations):
        for i in range(1, len(fast_messages)):
            fast_messages[i] = multiply_messages(fast_messages[i], fast_messages[i-1])
    numerical_time = time.time() - start
    
    return {
        'gaussian_time': gaussian_time,
        'numerical_time': numerical_time,
        'speedup': gaussian_time / numerical_time
    }

# Run benchmarks with different sizes
sizes = [1000, 10000, 100000]
for size in sizes:
    print(f"\nBenchmark with {size} messages:")
    results = benchmark(n_messages=size)
    print(f"Gaussian objects: {results['gaussian_time']:.3f} seconds")
    print(f"Direct numerical: {results['numerical_time']:.3f} seconds")
    print(f"Speedup factor: {results['speedup']:.2f}x")


Benchmark with 1000 messages:
Gaussian objects: 0.024 seconds
Direct numerical: 0.008 seconds
Speedup factor: 2.99x

Benchmark with 10000 messages:
Gaussian objects: 0.097 seconds
Direct numerical: 0.152 seconds
Speedup factor: 0.64x

Benchmark with 100000 messages:
Gaussian objects: 0.957 seconds
Direct numerical: 1.899 seconds
Speedup factor: 0.50x
