In [19]:
import trueskill
import requests
import csv
import pandas as pd
from scipy import stats as sts

# Location of the match-results CSV; the payload is decoded as Latin-1 below.
data_url = 'https://course-resources.minerva.kgi.edu/uploaded_files/mke/00090402-4649/tennis-data.csv'

# Download once and fail loudly on an HTTP error, so that an error page is
# never decoded and mistaken for CSV content. The timeout keeps the cell from
# hanging forever if the server stops responding.
response = requests.get(data_url, timeout=60)
response.raise_for_status()
text = response.content.decode('latin-1').strip()

# One-shot generator over the raw lines of the file. It is exhausted after a
# single pass; re-run this cell to get a fresh one.
lines = (line for line in text.split('\n'))

In [20]:
# Create a TrueSkill environment with 0 probability of a draw
env = trueskill.TrueSkill(draw_probability=0)

# Per-player state filled in while the match file is processed.
players = {}  # player name: player rating
first_game = {}  # player name: date of first game
last_game = {}  # player name: date of last game

count = 0  # number of games processed so far

# Fetch the CSV and check the HTTP status before parsing: without the check an
# error response would be fed to csv.reader and its first line would silently
# become `header`.
csv_response = requests.get(data_url, timeout=60)
csv_response.raise_for_status()

# `reader` is a one-shot iterator: once the processing loop has consumed it,
# this cell must be re-run before the loop can be run again.
reader = csv.reader(line.decode('latin-1') for line in csv_response.iter_lines())
header = next(reader)  # First line of CSV file is the header

In [21]:
# Replay every match in file order, updating both players' ratings as we go.
for row in reader:
    match = dict(zip(header, row))
    if not match:
        continue  # Blank CSV line: nothing to rate

    # Names occasionally carry trailing spaces; normalise them so the same
    # player is not tracked under two different keys.
    winner_name = match['Winner'].strip()
    loser_name = match['Loser'].strip()

    # Current rating of each player, or a fresh one from the environment
    # the first time a name is seen.
    winner = players.get(winner_name) or env.create_rating()
    loser = players.get(loser_name) or env.create_rating()

    # Apply the match result to both ratings.
    updated_winner, updated_loser = env.rate_1vs1(winner, loser)
    players[winner_name] = updated_winner
    players[loser_name] = updated_loser

    # Record the first and the most recent match date for both players.
    played_on = match['Date']
    for name in (winner_name, loser_name):
        first_game.setdefault(name, played_on)
        last_game[name] = played_on

    # Report progress every 5000 games.
    count += 1
    if not count % 5000:
        print(count, 'games processed')
print('done')

5000 games processed
10000 games processed
15000 games processed
20000 games processed
25000 games processed
30000 games processed
35000 games processed
40000 games processed
45000 games processed
done


In [22]:
# Top 50 players and their skills.
# Players are ranked by env.expose(rating), not by mu alone, which is why a
# player with a slightly higher mu but a larger sigma can appear lower in the
# list. (Per the trueskill documentation, expose() is a conservative estimate
# that penalises uncertainty -- confirm against the installed version.)
TOP_N = 50
leaderboard = sorted(players.items(), key=lambda player: env.expose(player[1]), reverse=True)

# Slicing instead of indexing with range(TOP_N) avoids an IndexError when
# fewer than TOP_N players have been rated.
for rank, (name, rating) in enumerate(leaderboard[:TOP_N], start=1):
    print('%2i. %-20s: %.2f ± %.2f [%10s - %10s]' % (
        rank, name, rating.mu, rating.sigma,
        first_game[name], last_game[name]))

 1. Djokovic N.         : 41.89 ± 0.97 [20/07/2004 - 11/09/2016]
 2. Murray A.           : 38.65 ± 0.89 [19/04/2005 -  7/09/2016]
 3. Federer R.          : 38.61 ± 0.89 [ 3/01/2000 -  8/07/2016]
 4. Nadal R.            : 37.05 ± 0.88 [15/04/2003 -  4/09/2016]
 5. Nishikori K.        : 35.91 ± 0.85 [17/07/2007 -  9/09/2016]
 6. Wawrinka S.         : 35.62 ± 0.84 [ 7/07/2003 - 25/09/2016]
 7. Soderling R.        : 35.43 ± 0.84 [22/10/2001 - 17/07/2011]
 8. Del Potro J.M.      : 35.34 ± 0.84 [30/01/2006 -  8/09/2016]
 9. Raonic M.           : 35.24 ± 0.84 [12/08/2009 - 22/09/2016]
10. Berdych T.          : 35.21 ± 0.85 [25/08/2003 - 24/09/2016]
11. Agassi A.           : 35.09 ± 0.85 [17/01/2000 -  3/09/2006]
12. Sampras P.          : 34.81 ± 0.89 [17/01/2000 - 26/08/2002]
13. Tsonga J.W.         : 34.62 ± 0.84 [15/09/2004 -  7/09/2016]
14. Rafter P.           : 34.92 ± 0.95 [28/02/2000 - 12/11/2001]
15. Roddick A.          : 34.16 ± 0.84 [28/02/2000 -  5/09/2012]
16. Ferrer D.           :

In [23]:
def calculate_win(mu1, mu2, sd1, sd2, beta=None, n_samples=5000, random_state=None):
    """Estimate by Monte Carlo the probability that player 1 beats player 2.

    For every simulated game a skill is drawn for each player from
    Normal(mu, sd), a performance is then drawn from Normal(skill, beta),
    and player 1 wins the game when their performance is the larger one.

    Parameters
    ----------
    mu1, mu2 : float
        Means of the two players' skill distributions.
    sd1, sd2 : float
        Standard deviations of the two players' skill distributions.
    beta : float, optional
        Standard deviation of the performance noise. When omitted, the value
        is read from the notebook-level TrueSkill environment (``env.beta``).
    n_samples : int, optional
        Number of simulated games (default 5000).
    random_state : optional
        Forwarded to scipy's ``rvs`` to make the estimate reproducible.

    Returns
    -------
    float
        Fraction of simulated games won by player 1. The same value is also
        printed, as the notebook output relies on it.

    Raises
    ------
    ValueError
        If ``n_samples`` is smaller than 1.
    """
    if n_samples < 1:
        raise ValueError("n_samples must be a positive integer")
    if beta is None:
        beta = env.beta

    # Draw all the standard-normal noise in ONE call. This replaces thousands
    # of single-sample frozen distributions, and it also guarantees that an
    # integer seed cannot produce identical (perfectly correlated) streams for
    # the two players, which would happen with one seeded call per player.
    noise = sts.norm.rvs(size=(4, n_samples), random_state=random_state)

    # Skill ~ Normal(mu, sd)
    skill1 = mu1 + sd1 * noise[0]
    skill2 = mu2 + sd2 * noise[1]

    # Performance ~ Normal(skill, beta)
    perf1 = skill1 + beta * noise[2]
    perf2 = skill2 + beta * noise[3]

    proportion = float((perf1 > perf2).mean())
    print("Player 1 will win with probability:", proportion)
    return proportion

# (mu, sigma) pairs copied from the leaderboard printed above.
djokovic = (41.89, 0.97)
wawrinka = (35.62, 0.84)
n = (35.91, 0.85)  # Nishikori K.

# Nishikori (player 1) against Wawrinka (player 2).
nishikori_mu, nishikori_sd = n
wawrinka_mu, wawrinka_sd = wawrinka
calculate_win(nishikori_mu, wawrinka_mu, nishikori_sd, wawrinka_sd)

Player 1 will win with probability: 0.5264
