In [2]:
import urllib.request
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import numpy as np
from numpy.linalg import svd

In [4]:
# The leaderboard lives under a different path from 2017 onwards,
# probably because the website was reorganised.
def formatURL(year):
    """Return the golfchannel.com Masters leaderboard URL for `year`.

    Years before 2017 use the ``tours/masters`` path; 2017 and later use the
    ``tours/pga-tour`` path.
    """
    season = str(year)
    if year < 2017:
        return "https://www.golfchannel.com/tours/masters/" + season + "/masters/"
    return "https://www.golfchannel.com/tours/pga-tour/" + season + "/masters-tournament/"

def getDfs(year):
    """Scrape the Masters leaderboard for `year` into a DataFrame.

    The page at ``formatURL(year)`` is downloaded and the first table inside
    the ``div.full_leaderboard`` element is read row by row.

    Returns a DataFrame with the columns RANK, PLAYER, OVERALL, RND, R1, R2,
    R3, R4 and TOT. Every value is the first text node of the corresponding
    table cell (or None when the cell has no text); nothing is converted to
    a number here.

    Rows with fewer than 12 cells are skipped, because the last column read
    is cell index 11.
    """
    url = formatURL(year)
    # Parse inside the `with` so the HTTP response is closed afterwards.
    with urllib.request.urlopen(url) as page:
        soup = BeautifulSoup(page, "lxml")
    div = soup.find("div", class_="full_leaderboard")
    tables = div.find_all('table')
    tbody = tables[0].find_all('tbody')

    # (output column, index of the <td> it is read from).
    # NOTE(review): the indices mirror the site's table layout at the time
    # this was written -- confirm against the live page if it changes.
    layout = [
        ('RANK', 1),
        ('PLAYER', 3),
        ('OVERALL', 4),
        ('RND', 6),
        ('R1', 7),
        ('R2', 8),
        ('R3', 9),
        ('R4', 10),
        ('TOT', 11),
    ]
    highest_cell = max(cell_idx for _, cell_idx in layout)
    data = {column: [] for column, _ in layout}

    for row in tbody[0].find_all('tr'):
        cells = row.find_all('td')
        # Need cells[0..11]; shorter rows are not player rows.
        if len(cells) <= highest_cell:
            continue
        for column, cell_idx in layout:
            cell = cells[cell_idx]
            if column == 'PLAYER':
                # The player name is the text of the link inside the cell.
                cell = cell.find('a')
            data[column].append(cell.find(text=True))

    return pd.DataFrame(data, columns=[column for column, _ in layout])

# players - array of player names
# df - DataFrame containing all score data (must have a PLAYER column)
def getRankings(players, df):
    """Return the rows of `df` whose PLAYER value is one of `players`.

    Row order and index labels of `df` are preserved.
    """
    is_selected = df['PLAYER'].isin(players)
    return df.loc[is_selected]

# Every unordered pair of players gets one record.
# Wins and losses in a record are counted from player1's point of view.
def createDict(players):
    """Build an empty head-to-head record for every pair of players.

    Returns a dict keyed by consecutive integers starting at 0. Each value
    holds zeroed 'wins'/'losses' counters, both player names, and both
    players' positions in `players` (player1_idx < player2_idx).
    """
    pairs = {}
    total = len(players)
    for first in range(total):
        for second in range(first + 1, total):
            pairs[len(pairs)] = {
                'wins': 0,
                'losses': 0,
                'player1': players[first],
                'player2': players[second],
                'player1_idx': first,
                'player2_idx': second,
            }
    return pairs

# 0-1 Metric
def fillDict(players, dfs):
    """Tally round-by-round head-to-head results for every pair of players.

    players -- sequence of player names, as they appear in the PLAYER column
    dfs     -- iterable of DataFrames with PLAYER, R1, R2, R3 and R4 columns

    For every DataFrame, every pair and every round, the two round scores
    are compared as integers. The LOWER score wins the round: player1 gets a
    win when his score is lower, a loss when it is higher, and half a win
    plus half a loss on a tie. A round is skipped when either score is
    missing (None/NaN), and a DataFrame is skipped for a pair when either
    player has no row in it. If a player has several rows, the first is used.

    Returns the dict produced by createDict(players) with the 'wins' and
    'losses' counters filled in.
    """
    player_dict = createDict(players)
    round_columns = ['R1', 'R2', 'R3', 'R4']

    for df in dfs:
        # Look every player up once per DataFrame instead of once per pair.
        round_scores = {}
        for player in players:
            name = str(player)
            rows = df.loc[df['PLAYER'] == name]
            if not rows.empty:
                round_scores[name] = [rows[col].iloc[0] for col in round_columns]

        for record in player_dict.values():
            scores_p1 = round_scores.get(str(record['player1']))
            scores_p2 = round_scores.get(str(record['player2']))
            if scores_p1 is None or scores_p2 is None:
                # One of the two players is absent from this DataFrame.
                continue

            for score_p1, score_p2 in zip(scores_p1, scores_p2):
                if pd.isnull(score_p1) or pd.isnull(score_p2):
                    continue
                strokes_p1 = int(score_p1)
                strokes_p2 = int(score_p2)
                if strokes_p1 > strokes_p2:
                    record['losses'] += 1
                elif strokes_p1 < strokes_p2:
                    record['wins'] += 1
                else:
                    record['wins'] += 0.5
                    record['losses'] += 0.5
    return player_dict

def buildVotingMatrix(player_dict, metric, players, alpha=1.1):
    """Turn head-to-head records into a square "voting" matrix.

    player_dict -- pair records as produced by createDict/fillDict; each
                   value needs 'wins', 'losses', 'player1_idx', 'player2_idx'
    metric      -- "0-1" or "1-alpha" (compared after str())
    players     -- sequence of players; only its length is used
    alpha       -- weight used by the "1-alpha" metric (default 1.1)

    Entry [i][j] is the vote player i casts for player j:

    * "0-1":     the number of rounds i lost to j.
    * "1-alpha": alpha * (rounds i lost to j) + (rounds i won against j).

    Returns a (len(players), len(players)) float array; entries for pairs
    that are not in `player_dict` (including the diagonal) stay 0.

    Raises ValueError for any other metric, instead of silently returning an
    all-zero matrix.
    """
    metric = str(metric)
    if metric not in ("0-1", "1-alpha"):
        raise ValueError(
            "unknown metric %r; expected '0-1' or '1-alpha'" % (metric,)
        )

    num_players = len(players)
    votingMatrix = np.zeros(shape=(num_players, num_players))
    for record in player_dict.values():
        first = record['player1_idx']
        second = record['player2_idx']
        # Wins and losses are stored from player1's point of view.
        wins = record['wins']
        losses = record['losses']
        if metric == "0-1":
            votingMatrix[first, second] = losses
            votingMatrix[second, first] = wins
        else:
            votingMatrix[first, second] = alpha * losses + wins
            votingMatrix[second, first] = alpha * wins + losses

    return votingMatrix

def buildTransitionMatrix(votingMatrix):
    """Normalise each row of `votingMatrix` so that it sums to 1.

    votingMatrix -- 2-D array-like of non-negative votes

    Returns a float array P of the same shape with
    P[i, j] = votingMatrix[i, j] / sum(votingMatrix[i, :]).

    A row whose sum is 0 is left as all zeros. Dividing it would give 0/0 =
    NaN for the whole row, which would then poison every later computation
    on the matrix.
    """
    votes = np.asarray(votingMatrix, dtype=float)
    row_sums = votes.sum(axis=1, keepdims=True)
    P = np.zeros(votes.shape)
    # `where` skips rows with a zero sum; those keep the zeros from `P`.
    np.divide(votes, row_sums, out=P, where=row_sums != 0)
    return P

# borrowed from Michael
def nullspace(A, atol=1e-13, rtol=0):
    """Return a basis for the (numerical) null space of `A`, computed by SVD.

    A    -- array-like, treated as at least 2-D
    atol -- absolute tolerance for a singular value to count as zero
    rtol -- tolerance relative to the largest singular value

    Singular values below max(atol, rtol * largest singular value) are
    treated as zero. The result has one column per such direction, so it has
    zero columns when `A` has full column rank.
    """
    matrix = np.atleast_2d(A)
    _, singular_values, vh = svd(matrix)
    threshold = max(atol, rtol * singular_values[0])
    rank = np.count_nonzero(singular_values >= threshold)
    # Rows of vh past the rank span the null space; return them as columns.
    basis = vh[rank:].conj().T
    return basis

def calcScores(P):
    """Return the null space of (P^T - I) for the square matrix `P`.

    A vector v in that null space satisfies P^T v = v, so for a row-stochastic
    `P` it is a stationary vector of the chain, up to scale and sign.
    """
    size, _ = P.shape
    shifted = P.T - np.eye(size)
    return nullspace(shifted)

def rankPlayers(scores, players):
    """Order `players` from best to worst according to `scores`.

    scores  -- one score per player, either 1-D or an (n, k) array such as
               the output of calcScores(); for 2-D input the first column
               is used
    players -- sequence of players, aligned with `scores`

    The scores come from an SVD null-space vector, whose overall sign is
    arbitrary: the same stationary vector may be returned as all-positive or
    all-negative. The vector is therefore flipped to be non-negative overall
    before sorting, and players are returned by decreasing score. Ties keep
    the players' original relative order.

    Raises ValueError when `scores` has no columns (no stationary vector was
    found) or when its length does not match `players`.
    """
    values = np.asarray(scores, dtype=float)
    if values.ndim == 2:
        if values.shape[1] == 0:
            raise ValueError("scores has no columns: no stationary vector found")
        values = values[:, 0]
    elif values.ndim != 1:
        raise ValueError("scores must be 1-D or 2-D")
    if len(values) != len(players):
        raise ValueError("scores and players must have the same length")

    # Undo the arbitrary sign of the null-space vector.
    if values.sum() < 0:
        values = -values

    # sorted() is stable, so equal scores keep their original order.
    order = sorted(range(len(players)), key=lambda idx: -values[idx])
    return [players[idx] for idx in order]

def computeRankedPlayers(player_dict, metric, players):
    """Run the whole ranking pipeline and return the ordered players.

    Steps: voting matrix (for `metric`) -> row-normalised transition matrix
    -> null-space scores -> players ordered by rankPlayers().
    """
    transition = buildTransitionMatrix(
        buildVotingMatrix(player_dict, str(metric), players)
    )
    return rankPlayers(calcScores(transition), players)

def getAllOfficial(year):
    """Scrape pgatour.com statistic page 186 for `year` into a DataFrame.

    The first table inside the ``div.details-table-wrap`` element is read
    row by row. For every row with at least three cells, the first text node
    of cells 0 and 1 and the link text of cell 2 are taken, with leading and
    trailing newlines stripped.

    Returns a DataFrame with the string columns THIS WEEK, PLAYER and
    LAST WEEK, in that order, one row per table row read.

    NOTE(review): the column names assume cell 0 is this week's rank and
    cell 1 last week's -- confirm against the page layout.
    """
    url = "https://www.pgatour.com/stats/stat.186." + str(year) + ".html"
    # Parse inside the `with` so the HTTP response is closed afterwards.
    with urllib.request.urlopen(url) as page:
        soup = BeautifulSoup(page, "lxml")
    div = soup.find("div", class_="details-table-wrap")
    tables = div.find_all('table')

    this_week = []
    last_week = []
    player_names = []

    for row in tables[0].find_all('tr'):
        cells = row.find_all('td')
        # Header rows have no <td>; cells 0..2 are needed below.
        if len(cells) < 3:
            continue
        this_week.append(str(cells[0].find(text=True)).strip('\n'))
        last_week.append(str(cells[1].find(text=True)).strip('\n'))
        player_names.append(str(cells[2].find('a').find(text=True)).strip('\n'))

    df = pd.DataFrame(this_week, columns=['THIS WEEK'])
    df['PLAYER'] = player_names
    df['LAST WEEK'] = last_week

    return df

def nameFormat(name):
    """Convert a "Last, First" name into "First Last".

    * With a comma, everything before the first comma is the last name and
      everything after it the first name(s), so multi-word names survive:
      "Cabrera Bello, Rafa" -> "Rafa Cabrera Bello".
    * Without a comma, the first two whitespace-separated tokens are swapped
      ("Spieth Jordan" -> "Jordan Spieth").
    * A name with a single token is returned unchanged (whitespace trimmed)
      instead of raising IndexError.

    Runs of whitespace are collapsed to single spaces.
    """
    if ',' in name:
        last, first = name.split(',', 1)
        # Drop any further commas, then normalise whitespace.
        first_tokens = first.replace(',', '').split()
        last_tokens = last.split()
        return " ".join(first_tokens + last_tokens)

    tokens = name.split()
    if len(tokens) < 2:
        return " ".join(tokens)
    return tokens[1] + " " + tokens[0]

def getOfficial(year, players):
    """Return the official-ranking rows for `players` in `year`.

    `players` are converted with nameFormat() before being matched against
    the PLAYER column of getAllOfficial(year). Row order follows that table.
    """
    official = getAllOfficial(year)
    wanted = [nameFormat(player) for player in players]
    is_wanted = official['PLAYER'].isin(wanted)
    return official.loc[is_wanted]

def compareRankings(official, zero_one, one_alpha):
    """Place three rankings side by side in one DataFrame.

    The three inputs are stacked horizontally in the order given, and the
    resulting columns are labelled 'official', '0-1' and '1-alpha'.
    """
    first_two = np.hstack((official, zero_one))
    side_by_side = np.hstack((first_two, one_alpha))
    headers = np.array(['official', '0-1', '1-alpha'])
    table = pd.DataFrame(side_by_side, columns=headers)
    return table

def mse(official, calculated):
    """Mean squared difference in position between two rankings.

    official   -- sequence of one-element sequences; official[i][0] is the
                  name ranked i-th officially, or None for "no entry"
    calculated -- sequence of names in computed rank order

    For every official entry that is not None, the squared difference
    between its position in `official` and the position of the same name
    (compared as str) in `calculated` is accumulated. If a name occurs more
    than once in `calculated`, its first position is used.

    Returns the mean of those squared differences, or NaN when `official`
    has no non-None entry.

    Raises IndexError when an official name does not occur in `calculated`.
    """
    # First position of every calculated name, for O(1) lookups.
    position_of = {}
    for position, name in enumerate(calculated):
        position_of.setdefault(name, position)

    squared_total = 0
    compared = 0
    for rank, entry in enumerate(official):
        name = entry[0]
        if name is None:
            continue
        key = str(name)
        if key not in position_of:
            raise IndexError(
                "official player %r not found in calculated ranking" % (key,)
            )
        squared_total += (rank - position_of[key]) ** 2
        compared += 1

    if compared == 0:
        # Nothing to compare: avoid dividing by zero.
        return float("nan")
    return squared_total / compared
            
# Scrape the Masters leaderboards for 2014 through 2018 (one request per year).
scores_2014 = getDfs(2014)
scores_2015 = getDfs(2015)
scores_2016 = getDfs(2016)
scores_2017 = getDfs(2017)
scores_2018 = getDfs(2018)

# Keep only the players that appear in all five leaderboards.
# np.intersect1d returns a sorted array of unique names.
intersect_2014_2015_2016 = np.intersect1d(scores_2014.PLAYER, np.intersect1d(scores_2015.PLAYER, scores_2016.PLAYER))
intersect_2017 = np.intersect1d(scores_2017.PLAYER, scores_2018.PLAYER)

common_players = np.intersect1d(intersect_2014_2015_2016, intersect_2017)

# Restrict every year's leaderboard to those common players.
rankings_2014 = getRankings(common_players, scores_2014).reset_index()
rankings_2015 = getRankings(common_players, scores_2015).reset_index()
rankings_2016 = getRankings(common_players, scores_2016).reset_index()
rankings_2017 = getRankings(common_players, scores_2017).reset_index()
rankings_2018 = getRankings(common_players, scores_2018).reset_index()

# Tally the round-by-round head-to-head records over all five years.
all_rankings = [rankings_2014, rankings_2015, rankings_2016, rankings_2017, rankings_2018]
player_dict = fillDict(common_players, all_rankings)

# Ranking under the 0-1 metric, with names reformatted by nameFormat()
ranked_players_01 = list(map(nameFormat, computeRankedPlayers(player_dict, "0-1", common_players)))

# Ranking under the 1-alpha metric, with names reformatted by nameFormat()
ranked_players_1alpha = list(map(nameFormat, computeRankedPlayers(player_dict, "1-alpha", common_players)))

# Reference ranking to compare against (YTD Official World Golf Ranking),
# restricted to the common players
official_2018 = getOfficial(2018, common_players)

# Reshape every ranking into an (n, 1) column so they can be placed side by
# side; the rankings are intended to run from best to worst.
resize_01 = np.array(ranked_players_01).reshape(len(ranked_players_01), 1)
resize_1alpha = np.array(ranked_players_1alpha).reshape(len(ranked_players_1alpha), 1)
# The official column starts as all None and only its first
# len(official_2018) rows are filled, so any remaining rows stay None.
resize_official = np.full((len(ranked_players_1alpha), 1), None)
resize_official[:len(official_2018)] = np.array(official_2018['PLAYER']).reshape(len(official_2018), 1)

# mse() indexes each official entry as official[i][0], so pass a list of
# one-element rows.
list_resize_official = list(resize_official)

# Mean squared position difference of each computed ranking vs. the official one
mse_01 = mse(list_resize_official, ranked_players_01)
mse_1alpha = mse(list_resize_official, ranked_players_1alpha)

# Print the three rankings side by side, followed by the two error values.
comparison_table = compareRankings(resize_official, resize_01, resize_1alpha)
print(comparison_table)
print(mse_01)
print(mse_1alpha)
# Optional export of the comparison table (disabled):
# comparison_table.to_csv("rankings.csv")

            official               0-1           1-alpha
0      Jordan Spieth     Jordan Spieth     Jordan Spieth
1        Justin Rose       Justin Rose       Justin Rose
2      Rickie Fowler      Rory McIlroy      Rory McIlroy
3          Jason Day  Hideki Matsuyama  Louis Oosthuizen
4       Rory McIlroy     Rickie Fowler       Matt Kuchar
5   Hideki Matsuyama      Bubba Watson         Jason Day
6       Patrick Reed  Louis Oosthuizen        Adam Scott
7      Sergio Garcia         Jason Day      Jimmy Walker
8     Henrik Stenson       Matt Kuchar  Hideki Matsuyama
9       Bubba Watson    Henrik Stenson     Rickie Fowler
10    Phil Mickelson        Adam Scott      Bubba Watson
11       Matt Kuchar      Jimmy Walker    Henrik Stenson
12  Louis Oosthuizen    Phil Mickelson     Martin Kaymer
13     Branden Grace     Sergio Garcia    Phil Mickelson
14      Webb Simpson      Patrick Reed      Patrick Reed
15      Zach Johnson        Ryan Moore        Ryan Moore
16      Jason Dufner      Webb 