In [1]:
import urllib.request
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import numpy as np
from numpy.linalg import svd
import math

In [2]:
# formats URL for scraping
def formatURL(year):
    """Build the leaderboard URL to scrape for the given year.

    Years before 2017 use the ".../tours/masters/<year>/masters/" layout;
    2017 and later use ".../tours/pga-tour/<year>/masters-tournament/".

    Parameters
    ----------
    year : int
        Tournament year; compared against 2017 and inserted via str().

    Returns
    -------
    str
        The full URL.
    """
    if year < 2017:
        prefix, suffix = "https://www.golfchannel.com/tours/masters/", "/masters/"
    else:
        prefix, suffix = "https://www.golfchannel.com/tours/pga-tour/", "/masters-tournament/"
    return prefix + str(year) + suffix

In [3]:
# calculates average of a player's rounds
def calcAvg(r1, r2, r3, r4):
    """Average a player's round scores, given as strings.

    Each argument is the string form of a round score; the literal string
    "None" marks a round with no score.

    Rules:
      * if round 1 or round 2 is missing, the average is 0.0;
      * rounds 3 and 4 are only counted when round 3 is present;
      * round 4 is skipped when it is missing, so a player with three
        rounds is averaged over three instead of raising on int("None").

    Parameters
    ----------
    r1, r2, r3, r4 : str
        Round scores as strings, or "None".

    Returns
    -------
    float
        Mean of the counted rounds, or 0.0 when round 1 or 2 is missing.

    Raises
    ------
    ValueError
        If a counted round is neither "None" nor an integer string.
    """
    missing = "None"

    # Without both opening rounds there is nothing meaningful to average.
    if r1 == missing or r2 == missing:
        return 0.0

    played = [int(r1), int(r2)]
    if r3 != missing:
        played.append(int(r3))
        # Round 4 can be absent even when round 3 exists; do not crash on it.
        if r4 != missing:
            played.append(int(r4))

    return sum(played) / float(len(played))

In [4]:
# scrapes 2014-2018 golf rankings
def getDfs(year):
    """Scrape one year's leaderboard into a DataFrame.

    Fetches the page returned by formatURL(year), locates the first table
    inside the "full_leaderboard" div and reads one record per table row.

    Parameters
    ----------
    year : int
        Tournament year, passed to formatURL.

    Returns
    -------
    pandas.DataFrame
        Columns RANK, PLAYER, OVERALL, RND, R1, R2, R3, R4, TOT (the raw
        text nodes found in the cells, or None where a cell has no text)
        and AVG (float computed by calcAvg from R1..R4).

    Raises
    ------
    urllib.error.URLError
        If the page cannot be fetched.
    AttributeError, IndexError
        If the page does not have the expected div/table/tbody structure.
    """
    columns = ['RANK', 'PLAYER', 'OVERALL', 'RND', 'R1', 'R2', 'R3', 'R4', 'TOT', 'AVG']

    url = formatURL(year)
    # The parser reads the whole response in its constructor, so the
    # connection can be released as soon as the soup has been built.
    with urllib.request.urlopen(url) as page:
        soup = BeautifulSoup(page, "lxml")
    div = soup.find("div", class_="full_leaderboard")
    tables = div.find_all('table')
    tbody = tables[0].find_all('tbody')

    data = {name: [] for name in columns}

    for row in tbody[0].find_all('tr'):
        cells = row.find_all('td')
        # cells[11] is read below, so a usable row needs at least 12 cells
        # (the previous ">= 11" guard let an 11-cell row raise IndexError).
        if len(cells) < 12:
            continue

        r1 = cells[7].find(text=True)
        r2 = cells[8].find(text=True)
        r3 = cells[9].find(text=True)
        r4 = cells[10].find(text=True)

        data['RANK'].append(cells[1].find(text=True))
        data['PLAYER'].append(cells[3].find('a').find(text=True))
        data['OVERALL'].append(cells[4].find(text=True))
        data['RND'].append(cells[6].find(text=True))
        data['R1'].append(r1)
        data['R2'].append(r2)
        data['R3'].append(r3)
        data['R4'].append(r4)
        data['TOT'].append(cells[11].find(text=True))
        # calcAvg works on strings; a missing cell becomes the string "None".
        data['AVG'].append(calcAvg(str(r1), str(r2), str(r3), str(r4)))

    return pd.DataFrame(data, columns=columns)

In [5]:
# get rankings only for players we are interested in
def getRankings(players, df):
    """Return the rows of `df` whose PLAYER value is one of `players`.

    Parameters
    ----------
    players : list-like
        Player names to keep.
    df : pandas.DataFrame
        Frame with a 'PLAYER' column.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels.
    """
    is_wanted = df['PLAYER'].isin(players)
    return df.loc[is_wanted]

In [6]:
# generates ranking for players based on round number
def generateRanking(df):
    """Sort `df` once per round column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with columns 'R1', 'R2', 'R3' and 'R4'.

    Returns
    -------
    tuple of pandas.DataFrame
        Four frames, sorted ascending by R1, R2, R3 and R4 respectively.
        Each has a fresh 0..n-1 index (so a row's index is its position in
        that round's ordering) and keeps the old index in an 'index' column.
    """
    round_columns = ('R1', 'R2', 'R3', 'R4')
    return tuple(df.sort_values(column).reset_index() for column in round_columns)

In [7]:
# generates ranking for players based on average score of all rounds
def generateTotalRanking(df):
    """Sort `df` ascending by its 'AVG' column.

    Returns
    -------
    pandas.DataFrame
        The sorted frame with a fresh 0..n-1 index; the previous index is
        kept in an 'index' column.
    """
    by_average = df.sort_values(by='AVG')
    return by_average.reset_index()

In [8]:
# extracts round scores and player ranks based on per-round score
def getRoundScores(df, player1, player2, rounds_sorted):
    """Collect both players' score and per-round position for each round.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'PLAYER', 'R1', 'R2', 'R3' and 'R4' columns.
    player1, player2 : str
        Names matched against df['PLAYER'] (after str()).
    rounds_sorted : sequence of pandas.DataFrame
        Four frames, one per round, whose index gives each player's
        position in that round (as produced by generateRanking).

    Returns
    -------
    list of tuple
        Four tuples, one per round:
        (player1 score Series, player2 score Series,
         player1 position, player2 position).

    Raises
    ------
    IndexError
        If a player is absent from one of the `rounds_sorted` frames.
    """
    name_1, name_2 = str(player1), str(player2)
    round_columns = ('R1', 'R2', 'R3', 'R4')

    rows_1 = df.loc[df['PLAYER'] == name_1]
    rows_2 = df.loc[df['PLAYER'] == name_2]

    # First gather the score Series for every round ...
    scores = [(getattr(rows_1, column), getattr(rows_2, column)) for column in round_columns]

    # ... then each player's position in the matching per-round ordering.
    positions = []
    for round_no in range(len(round_columns)):
        ordering = rounds_sorted[round_no]
        pos_1 = ordering.loc[ordering['PLAYER'] == name_1].index[0]
        pos_2 = ordering.loc[ordering['PLAYER'] == name_2].index[0]
        positions.append((pos_1, pos_2))

    return [score_pair + position_pair for score_pair, position_pair in zip(scores, positions)]

In [9]:
# extracts round scores and player ranks based on average score of all rounds 
def getAvgRoundScores(df, player1, player2, avg_sorted):
    """Collect both players' per-round scores and their average-based position.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'PLAYER', 'R1', 'R2', 'R3' and 'R4' columns.
    player1, player2 : str
        Names matched against the 'PLAYER' column (after str()).
    avg_sorted : pandas.DataFrame
        Frame whose index gives each player's position in the ordering by
        average (as produced by generateTotalRanking).

    Returns
    -------
    list of tuple
        Four tuples, one per round:
        (player1 score Series, player2 score Series,
         player1 position, player2 position).
        The two positions are identical in all four tuples.

    Raises
    ------
    IndexError
        If a player is absent from `avg_sorted`.
    """
    name_1, name_2 = str(player1), str(player2)

    rows_1 = df.loc[df['PLAYER'] == name_1]
    rows_2 = df.loc[df['PLAYER'] == name_2]
    scores = [(getattr(rows_1, column), getattr(rows_2, column)) for column in ('R1', 'R2', 'R3', 'R4')]

    # One position per player, shared by every round.
    positions = (
        avg_sorted.loc[avg_sorted['PLAYER'] == name_1].index[0],
        avg_sorted.loc[avg_sorted['PLAYER'] == name_2].index[0],
    )

    return [score_pair + positions for score_pair in scores]

In [10]:
#0-1 metric with option for by-year discounting
def zero_one(votingMatrix, score_1, score_2, idx_1, idx_2, multiplier):
    """Apply the 0-1 metric for one pairwise comparison.

    The entry [row][col] of the player with the larger score (row) against
    the player with the smaller score (col) gains 1 * multiplier. When
    neither score is larger, both directions gain 0.5 * multiplier.

    Parameters
    ----------
    votingMatrix : indexable as votingMatrix[i][j]
        Updated in place.
    score_1, score_2 : comparable numbers
    idx_1, idx_2 : int
        Matrix indices of the two players.
    multiplier : number
        Weight applied to the vote (used for by-year discounting).

    Returns
    -------
    The same `votingMatrix` object.
    """
    if score_1 > score_2:
        votes = [(idx_1, idx_2, 1*multiplier)]
    elif score_1 < score_2:
        votes = [(idx_2, idx_1, 1*multiplier)]
    else:
        votes = [(idx_2, idx_1, 0.5*multiplier), (idx_1, idx_2, 0.5*multiplier)]

    for row, col, weight in votes:
        votingMatrix[row][col] += weight
    return votingMatrix

In [11]:
#1-alpha metric with option for by-year discounting
def one_alpha(votingMatrix, score_1, score_2, idx_1, idx_2, multiplier):
    """Apply the 1-alpha metric for one pairwise comparison.

    The entry of the larger-score player against the smaller-score player
    gains 2 * multiplier and the opposite entry gains 1 * multiplier. When
    neither score is larger, both directions gain 0.5 * multiplier.

    Parameters
    ----------
    votingMatrix : indexable as votingMatrix[i][j]
        Updated in place.
    score_1, score_2 : comparable numbers
    idx_1, idx_2 : int
        Matrix indices of the two players.
    multiplier : number
        Weight applied to the votes (used for by-year discounting).

    Returns
    -------
    The same `votingMatrix` object.
    """
    if score_1 > score_2:
        votes = [(idx_1, idx_2, 2*multiplier), (idx_2, idx_1, 1*multiplier)]
    elif score_1 < score_2:
        votes = [(idx_2, idx_1, 2*multiplier), (idx_1, idx_2, 1*multiplier)]
    else:
        votes = [(idx_2, idx_1, 0.5*multiplier), (idx_1, idx_2, 0.5*multiplier)]

    for row, col, weight in votes:
        votingMatrix[row][col] += weight
    return votingMatrix

In [12]:
#discount scores metric
def discount_scores(votingMatrix, score_1, score_2, idx_1, idx_2, multiplier):
    """Apply the discount-scores metric for one pairwise comparison.

    The entry of the larger-score player against the smaller-score player
    gains exp(-larger/smaller) * multiplier. When neither score is larger,
    both directions gain 0.5 * exp(-1) * multiplier.

    Parameters
    ----------
    votingMatrix : indexable as votingMatrix[i][j]
        Updated in place.
    score_1, score_2 : numbers
        Must be non-zero when used as the divisor.
    idx_1, idx_2 : int
        Matrix indices of the two players.
    multiplier : number
        Weight applied to the vote (used for by-year discounting).

    Returns
    -------
    The same `votingMatrix` object.
    """
    if score_1 > score_2:
        votes = [(idx_1, idx_2, math.exp(-1*score_1/score_2)*multiplier)]
    elif score_1 < score_2:
        votes = [(idx_2, idx_1, math.exp(-1*score_2/score_1)*multiplier)]
    else:
        votes = [
            (idx_2, idx_1, 0.5*math.exp(-1)*multiplier),
            (idx_1, idx_2, 0.5*math.exp(-1)*multiplier),
        ]

    for row, col, weight in votes:
        votingMatrix[row][col] += weight
    return votingMatrix

In [13]:
#discount ranks metric
def discount_ranks(votingMatrix, score_1, score_2, idx_1, idx_2, rank_1, rank_2, multiplier):
    """Apply the discount-ranks metric for one pairwise comparison.

    The entry of the larger-score player against the smaller-score player
    gains exp((rank_other + 1) / (rank_own + 1)) * multiplier, where
    "own" is the larger-score player. When neither score is larger, both
    directions gain 0.5 * exp(-1) * multiplier.

    Parameters
    ----------
    votingMatrix : indexable as votingMatrix[i][j]
        Updated in place.
    score_1, score_2 : comparable numbers
    idx_1, idx_2 : int
        Matrix indices of the two players.
    rank_1, rank_2 : number
        Zero-based positions of the two players (1 is added before
        dividing).
    multiplier : number
        Weight applied to the vote (used for by-year discounting).

    Returns
    -------
    The same `votingMatrix` object.
    """
    if score_1 > score_2:
        votes = [(idx_1, idx_2, math.exp(1*((rank_2+1)/(rank_1 + 1)))*multiplier)]
    elif score_1 < score_2:
        votes = [(idx_2, idx_1, math.exp(1*((rank_1+1)/(rank_2 + 1)))*multiplier)]
    else:
        votes = [
            (idx_2, idx_1, 0.5*math.exp(-1)*multiplier),
            (idx_1, idx_2, 0.5*math.exp(-1)*multiplier),
        ]

    for row, col, weight in votes:
        votingMatrix[row][col] += weight
    return votingMatrix

In [14]:
# updates Voting Matrix based on by-year discounting and metric
def updateVotingMatrix(votingMatrix, metric, score_1, score_2, idx_1, idx_2, rank_1, rank_2, multiplier):
    """Dispatch one pairwise comparison to the selected metric.

    Parameters
    ----------
    votingMatrix : indexable as votingMatrix[i][j]
        Updated in place by the metric function.
    metric : str
        One of "0-1", "1-alpha", "discount-scores", "discount-ranks".
    score_1, score_2 : number
        The two players' scores.
    idx_1, idx_2 : int
        Matrix indices of the two players.
    rank_1, rank_2 : number
        Positions of the two players; only used by "discount-ranks".
    multiplier : number
        Weight applied to the vote (used for by-year discounting).

    Returns
    -------
    The updated voting matrix.

    Raises
    ------
    ValueError
        If `metric` is not one of the supported names. Previously the
        function fell through and returned None, which silently replaced
        the caller's matrix with None.
    """
    metric_name = str(metric)
    if metric_name == "0-1":
        return zero_one(votingMatrix, score_1, score_2, idx_1, idx_2, multiplier)
    if metric_name == "1-alpha":
        return one_alpha(votingMatrix, score_1, score_2, idx_1, idx_2, multiplier)
    if metric_name == "discount-scores":
        return discount_scores(votingMatrix, score_1, score_2, idx_1, idx_2, multiplier)
    if metric_name == "discount-ranks":
        return discount_ranks(votingMatrix, score_1, score_2, idx_1, idx_2, rank_1, rank_2, multiplier)
    raise ValueError(
        "unknown metric " + repr(metric_name)
        + "; expected one of '0-1', '1-alpha', 'discount-scores', 'discount-ranks'"
    )

In [15]:
# constructs Voting Matrix with option of by-year discounting and ranking by average
def buildVotingMatrix(players, dfs, metric, isYearDiscounted, isAvgRanked):
    """Accumulate pairwise votes over every year, player pair and round.

    Parameters
    ----------
    players : sequence of str
        Player names; position in this sequence is the matrix index.
    dfs : sequence of pandas.DataFrame
        One frame per year, each containing every player in `players`.
    metric : str
        Metric name understood by updateVotingMatrix.
    isYearDiscounted : bool
        When true, the frame at position idx is weighted by
        exp(-(len(dfs) - 1 - idx)), so the last frame has weight 1 and
        earlier frames count exponentially less.
    isAvgRanked : bool
        When true, player positions come from the ordering by 'AVG';
        otherwise from the per-round orderings.

    Returns
    -------
    numpy.ndarray
        Square (len(players) x len(players)) matrix of accumulated votes.

    Notes
    -----
    The inner loop starts at j == i, so each player is also compared with
    themself; that is always a tie and therefore adds to the diagonal.
    NOTE(review): presumably intentional (it keeps every row sum non-zero
    for players who have scores) - confirm.
    """
    num_players = len(players)
    num_years = len(dfs)
    votingMatrix = np.zeros(shape=(num_players, num_players))

    for idx, df in enumerate(dfs):
        multiplier = math.exp(-(num_years - 1 - idx)) if isYearDiscounted else 1

        # Sort once per year and pick the matching extractor.
        if isAvgRanked:
            ordering = generateTotalRanking(df)
            extract = getAvgRoundScores
        else:
            ordering = generateRanking(df)
            extract = getRoundScores

        for i in range(num_players):
            for j in range(i, num_players):
                for score_1, score_2, rank_1, rank_2 in extract(df, players[i], players[j], ordering):
                    first_missing = score_1.isnull().iloc[0]
                    second_missing = score_2.isnull().iloc[0]
                    # A round only counts when both players have a score.
                    if first_missing or second_missing:
                        continue
                    # Pull the scalar out before converting: int(Series)
                    # on a one-element Series is deprecated in pandas.
                    votingMatrix = updateVotingMatrix(
                        votingMatrix, metric,
                        int(score_1.iloc[0]), int(score_2.iloc[0]),
                        i, j, rank_1, rank_2, multiplier,
                    )
    return votingMatrix

In [16]:
# builds Transition Probability Matrix
def buildTransitionMatrix(votingMatrix):
    """Row-normalise the voting matrix into transition probabilities.

    Parameters
    ----------
    votingMatrix : numpy.ndarray
        2-D array of accumulated votes.

    Returns
    -------
    numpy.ndarray
        Array of the same shape where every row has been divided by its
        own sum. A row that sums to zero yields NaN entries.
    """
    n_rows, n_cols = votingMatrix.shape
    transition = np.zeros(shape=(n_rows, n_cols))
    for row_no in range(n_rows):
        votes = votingMatrix[row_no, :]
        transition[row_no, :] = votes / np.sum(votes)
    return transition

In [17]:
# calculates Null Space
def nullspace(A, atol=1e-13, rtol=0):
    """Compute a basis for the null space of `A` from its SVD.

    Singular values below max(atol, rtol * largest singular value) are
    treated as zero; the matching right-singular vectors form the basis.

    Parameters
    ----------
    A : array-like
        At most 2-D; a 1-D input is treated as a single row.
    atol : float
        Absolute tolerance for a zero singular value.
    rtol : float
        Tolerance relative to the largest singular value.

    Returns
    -------
    numpy.ndarray
        Array of shape (A.shape[1], k) whose k columns span the null space
        (k may be 0).
    """
    matrix = np.atleast_2d(A)
    _, singular_values, right_vectors = svd(matrix)
    threshold = max(atol, rtol * singular_values[0])
    # Number of singular values considered non-zero.
    effective_rank = (singular_values >= threshold).sum()
    return right_vectors[effective_rank:].conj().T

In [18]:
# calculates ranking based on Transition Probability Matrix
def calcScores(P):
    """Return the null space of (P^T - I) for a square matrix P.

    The columns of the result are the vectors v with P^T v = v, as found
    by nullspace().

    NOTE(review): nullspace() takes these vectors from an SVD, so their
    overall sign is presumably not fixed - confirm before relying on the
    ordering direction downstream.

    Parameters
    ----------
    P : numpy.ndarray
        Square 2-D transition matrix.

    Returns
    -------
    numpy.ndarray
        Array of shape (n, k) as returned by nullspace().
    """
    n_states, _ = P.shape
    stationarity = P.transpose() - np.identity(n_states)
    return nullspace(stationarity)

In [19]:
# retrieves corresponding player name from ranking
def rankPlayers(scores, players):
    """Order player names by ascending score.

    Parameters
    ----------
    scores : sequence
        One score per player, in the same order as `players`. Each entry
        may be a number or a one-element array.
    players : sequence of str
        Player names.

    Returns
    -------
    list of str
        Names sorted from lowest to highest score; ties keep the original
        player order.
    """
    # Pair each score with its position so the sort can be mapped back.
    scored_positions = sorted(zip(list(scores), range(len(players))))
    return [players[position] for _, position in scored_positions]

In [20]:
# combines previous methods; builds Voting Matrix, Transition Probability Matrix, and gets player names
def computeRanks(players, dfs, metric, discountYear=False, avgRanked=False):
    """Run the full pipeline and return the ordered player names.

    Steps: build the voting matrix, row-normalise it into a transition
    matrix, compute scores from it, then order the players by score.

    Parameters
    ----------
    players : sequence of str
        Player names.
    dfs : sequence of pandas.DataFrame
        One frame per year.
    metric : str
        Metric name (converted with str() before use).
    discountYear : bool
        Forwarded to buildVotingMatrix as isYearDiscounted.
    avgRanked : bool
        Forwarded to buildVotingMatrix as isAvgRanked.

    Returns
    -------
    list of str
        Player names as ordered by rankPlayers.
    """
    votes = buildVotingMatrix(players, dfs, str(metric), discountYear, avgRanked)
    transition = buildTransitionMatrix(votes)
    return rankPlayers(calcScores(transition), players)

In [21]:
# scrapes official (Y-T-D) golf rankings
def getAllOfficial(year):
    """Scrape the ranking table from the pgatour.com stat.186 page.

    Reads the first table inside the "details-table-wrap" div and takes,
    from every row that has <td> cells, the text of the first two cells
    and the link text of the third.

    Parameters
    ----------
    year : int
        Year inserted into the URL.

    Returns
    -------
    pandas.DataFrame
        Columns 'THIS WEEK', 'PLAYER', 'LAST WEEK' (all strings, with
        leading/trailing newlines stripped).

    Raises
    ------
    urllib.error.URLError
        If the page cannot be fetched.
    AttributeError, IndexError
        If the page does not have the expected div/table/link structure.
    """
    url = "https://www.pgatour.com/stats/stat.186." + str(year) + ".html"
    # The parser reads the whole response in its constructor, so the
    # connection can be released as soon as the soup has been built.
    with urllib.request.urlopen(url) as page:
        soup = BeautifulSoup(page, "lxml")
    div = soup.find("div", class_="details-table-wrap")
    tables = div.find_all('table')

    this_week_col = []
    last_week_col = []
    player_col = []

    for row in tables[0].find_all('tr'):
        cells = row.find_all('td')
        # Rows without <td> cells (e.g. header rows) carry no data.
        if len(cells) == 0:
            continue
        this_week = str(cells[0].find(text=True)).strip('\n')
        last_week = str(cells[1].find(text=True)).strip('\n')
        player_name = str(cells[2].find('a').find(text=True)).strip('\n')

        this_week_col.append(this_week)
        last_week_col.append(last_week)
        player_col.append(player_name)

    return pd.DataFrame(
        {'THIS WEEK': this_week_col, 'PLAYER': player_col, 'LAST WEEK': last_week_col},
        columns=['THIS WEEK', 'PLAYER', 'LAST WEEK'],
    )

In [22]:
# formats player name to [first name] [last name]
def nameFormat(name):
    """Swap the first two words of `name`, dropping any commas.

    Intended for "Last, First" input, which becomes "First Last".

    NOTE(review): only the first two whitespace-separated tokens are
    used, so a name with more than two words (e.g. "Cabrera Bello, Rafa")
    does not come out as "First Last" - confirm whether such names occur.

    Parameters
    ----------
    name : str

    Returns
    -------
    str
        "<second token> <first token>".

    Raises
    ------
    IndexError
        If `name` has fewer than two tokens.
    """
    tokens = name.replace(',', '').split()
    second, first = tokens[1], tokens[0]
    return " ".join([second, first])

In [23]:
# formats player name to [last name], [first name]
def reverseNameFormat(name):
    """Turn "First Last" into "Last, First " (note the trailing space).

    NOTE(review): only the first two whitespace-separated tokens are
    used; any further words in `name` are dropped - confirm whether names
    with more than two words occur.

    Parameters
    ----------
    name : str

    Returns
    -------
    str
        "<second token>, <first token> ".

    Raises
    ------
    IndexError
        If `name` has fewer than two tokens.
    """
    tokens = name.split()
    second, first = tokens[1], tokens[0]
    return "".join([second, ", ", first, " "])

In [24]:
# gets only players we are interested in from Official Rankings
def getOfficial(year, players):
    """Scrape the official table for `year`, restricted to `players`.

    Parameters
    ----------
    year : int
        Passed to getAllOfficial.
    players : iterable of str
        Names in the form accepted by nameFormat; each is converted with
        nameFormat before matching against the 'PLAYER' column.

    Returns
    -------
    pandas.DataFrame
        Rows of the scraped table whose PLAYER is one of the converted
        names.
    """
    official_all = getAllOfficial(year)
    wanted_names = map(nameFormat, players)
    is_wanted = official_all['PLAYER'].isin(wanted_names)
    return official_all.loc[is_wanted]

In [25]:
# calculates Mean-Squared-Error between the Official Rankings and our generated Rankings
def mse(official, calculated):
    """Mean squared difference between positions in two orderings.

    For every non-None name at position i in `official`, finds the first
    position p of that name in `calculated` and accumulates (i - p)**2;
    the result is the mean over the names that were compared.

    Parameters
    ----------
    official : list
        Reference ordering; None entries are skipped.
    calculated : list of str
        Ordering to evaluate; must have the same length as `official`.

    Returns
    -------
    float
        The mean squared position difference, or NaN when there is nothing
        to compare (empty input or only None entries) instead of raising
        ZeroDivisionError.

    Raises
    ------
    ValueError
        If the two lists differ in length.
    IndexError
        If a name from `official` does not occur in `calculated` (the
        exception type is kept from the previous implementation).
    """
    if len(official) != len(calculated):
        raise ValueError(
            "official and calculated must have the same length (got "
            + str(len(official)) + " and " + str(len(calculated)) + ")"
        )

    # First position of every name, so each lookup is a dict access
    # rather than a scan over a DataFrame column.
    first_position = {}
    for position, name in enumerate(calculated):
        first_position.setdefault(name, position)

    squared_error = 0
    compared = 0
    for i, name in enumerate(official):
        if name is None:
            continue
        key = str(name)
        if key not in first_position:
            raise IndexError("player " + repr(key) + " not found in calculated rankings")
        squared_error += (i - first_position[key]) ** 2
        compared += 1

    if compared == 0:
        return float('nan')
    return squared_error / compared

In [26]:
# prints Rankings in a Dataframe 
def printDataFrame(rankings, name):
    """Wrap a list of rankings in a one-column DataFrame.

    Despite the name, nothing is printed; the frame is returned.

    Parameters
    ----------
    rankings : list
        Values for the single column.
    name : object
        Column label (converted with str()).

    Returns
    -------
    pandas.DataFrame
    """
    column_labels = [str(name)]
    return pd.DataFrame(rankings, columns=column_labels)

In [27]:
# removes players that are not listed in official rankings
def createOfficialPlayersList(players, official):
    """Keep only the players that appear in the official rankings.

    Parameters
    ----------
    players : iterable of str
        Names in the form accepted by nameFormat.
    official : pandas.DataFrame
        Frame with a 'PLAYER' column.

    Returns
    -------
    list of str
        The nameFormat-converted names that occur in official['PLAYER'],
        in the order of `players`.
    """
    formatted_players = [nameFormat(player) for player in players]
    return [
        formatted
        for formatted in formatted_players
        if len(official.loc[official['PLAYER'] == str(formatted)])
    ]

In [28]:
# gets formatted rankings
def getFormattedRankings(players, all_rankings, metric, isYearDiscount):
    """Compute the ranking and convert every name with nameFormat.

    Parameters
    ----------
    players : sequence of str
        Player names.
    all_rankings : sequence of pandas.DataFrame
        One frame per year, passed to computeRanks.
    metric : str
        Metric name (converted with str()).
    isYearDiscount : bool
        Passed to computeRanks as discountYear; avgRanked keeps its
        default.

    Returns
    -------
    list of str
        The ranked names after nameFormat.
    """
    ranked = computeRanks(players, all_rankings, str(metric), discountYear=isYearDiscount)
    return [nameFormat(player) for player in ranked]

In [29]:
# prints MSE 
def printMSE(metric, players, all_rankings, official, isYTD):
    """Rank the players with `metric` and report the MSE against `official`.

    Despite the name, nothing is printed; the message is returned.

    Parameters
    ----------
    metric : str
        Metric name.
    players : sequence of str
        Player names to rank.
    all_rankings : sequence of pandas.DataFrame
        One frame per year.
    official : list
        Reference ordering passed to mse().
    isYTD : bool
        Selects the message label ("YTD" vs "2018") and the special cases
        below.

    Returns
    -------
    tuple (str, list of str)
        The message "MSE for <metric> with ... rankings: <value>" and the
        computed ranking.
    """
    metric_name = str(metric)

    # By-year discounting is only switched on for discount-ranks in the
    # non-YTD comparison.
    discount_by_year = (not isYTD) and metric_name == "discount-ranks"
    player_rankings = getFormattedRankings(players, all_rankings, metric, discount_by_year)

    if isYTD and metric_name in ("1-alpha", "discount-scores"):
        # has to do with how Null Space is computed
        player_rankings.reverse()

    label = " with YTD rankings: " if isYTD else " with 2018 rankings: "
    message = "MSE for " + metric_name + label + str(mse(official, player_rankings))
    return message, player_rankings

In [30]:
# Scrape the leaderboard for each of the five years 2014-2018.
# Every getDfs call fetches a page over HTTP, so this cell needs network access.
scores_2014 = getDfs(2014)
scores_2015 = getDfs(2015)
scores_2016 = getDfs(2016)
scores_2017 = getDfs(2017)
scores_2018 = getDfs(2018)

In [31]:
# Keep only the players that appear in all five yearly leaderboards
# (np.intersect1d also returns the names sorted and de-duplicated).
intersect_2014_2015_2016 = np.intersect1d(scores_2014.PLAYER, np.intersect1d(scores_2015.PLAYER, scores_2016.PLAYER))
intersect_2017_2018 = np.intersect1d(scores_2017.PLAYER, scores_2018.PLAYER)
players = np.intersect1d(intersect_2014_2015_2016, intersect_2017_2018)

# Reference ranking to compare against: the scraped 2018 official (YTD) table,
# restricted to the players above.
official_2018 = getOfficial(2018, players)

# Players that also appear in the official table, first as "First Last"
# (official_players) and then converted back to the leaderboard's
# "Last, First " form (official_players2) so they match scores_*.PLAYER.
official_players = createOfficialPlayersList(players, official_2018)
official_players2 = list(map(reverseNameFormat, official_players))

# Per-year leaderboard rows for all common players.
rankings_2014 = getRankings(players, scores_2014).reset_index()
rankings_2015 = getRankings(players, scores_2015).reset_index()
rankings_2016 = getRankings(players, scores_2016).reset_index()
rankings_2017 = getRankings(players, scores_2017).reset_index()
rankings_2018 = getRankings(players, scores_2018).reset_index()

# Per-year leaderboard rows for the players that are also in the official table.
rankings_2014_official = getRankings(official_players2, scores_2014).reset_index()
rankings_2015_official = getRankings(official_players2, scores_2015).reset_index()
rankings_2016_official = getRankings(official_players2, scores_2016).reset_index()
rankings_2017_official = getRankings(official_players2, scores_2017).reset_index()
rankings_2018_official = getRankings(official_players2, scores_2018).reset_index()

# Oldest year first: buildVotingMatrix gives the last frame the largest weight
# when by-year discounting is enabled.
all_rankings = [rankings_2014, rankings_2015, rankings_2016, rankings_2017, rankings_2018]
all_rankings_official = [rankings_2014_official, rankings_2015_official, rankings_2016_official, rankings_2017_official, rankings_2018_official]

In [32]:
# 0-1 metric: compare the computed ranking with
#   (a) the official YTD table, using only players present in that table, and
#   (b) the 2018 leaderboard rows, in the order they appear in rankings_2018.

zero_one_ytd, rankings_01_ytd = printMSE("0-1", official_players2, all_rankings_official, list(official_2018['PLAYER']), True)
zero_one_2018, rankings_01_2018 = printMSE("0-1", players, all_rankings, list(map(nameFormat, list(rankings_2018["PLAYER"]))), False)

print(zero_one_ytd)
print(zero_one_2018)
# One-column frames holding the computed orderings (not displayed by this cell).
df_01_ytd = printDataFrame(rankings_01_ytd, "0-1 vs. Official")
df_01_2018 = printDataFrame(rankings_01_2018, "0-1 vs. 2018")

MSE for 0-1 with YTD rankings: 20.8695652174
MSE for 0-1 with 2018 rankings: 30.0


In [33]:
# 1-alpha metric: compare the computed ranking with
#   (a) the official YTD table, using only players present in that table, and
#   (b) the 2018 leaderboard rows, in the order they appear in rankings_2018.
# For the YTD comparison printMSE reverses the computed ordering for this metric.

one_alpha_ytd, rankings_1alpha_ytd = printMSE("1-alpha", official_players2, all_rankings_official, list(official_2018['PLAYER']), True)
one_alpha_2018, rankings_1alpha_2018 = printMSE("1-alpha", players, all_rankings, list(map(nameFormat, list(rankings_2018["PLAYER"]))), False)

print(one_alpha_ytd)
print(one_alpha_2018)
# One-column frames holding the computed orderings (not displayed by this cell).
df_1alpha_ytd = printDataFrame(rankings_1alpha_ytd, "1-alpha vs. Official")
df_1alpha_2018 = printDataFrame(rankings_1alpha_2018, "1-alpha vs. 2018")

MSE for 1-alpha with YTD rankings: 32.0869565217
MSE for 1-alpha with 2018 rankings: 38.0


In [34]:
# discount-scores metric: compare the computed ranking with
#   (a) the official YTD table, using only players present in that table, and
#   (b) the 2018 leaderboard rows, in the order they appear in rankings_2018.
# For the YTD comparison printMSE reverses the computed ordering for this metric.

discount_scores_ytd, rankings_discount_scores_ytd = printMSE("discount-scores", official_players2, all_rankings_official, list(official_2018['PLAYER']), True)
discount_scores_2018, rankings_discount_scores_2018 = printMSE("discount-scores", players, all_rankings, list(map(nameFormat, list(rankings_2018["PLAYER"]))), False)

print(discount_scores_ytd)
print(discount_scores_2018)
# One-column frames holding the computed orderings (not displayed by this cell).
df_discount_scores_ytd = printDataFrame(rankings_discount_scores_ytd, "discount-scores vs. Official")
df_discount_scores_2018 = printDataFrame(rankings_discount_scores_2018, "discount-scores vs. 2018")

MSE for discount-scores with YTD rankings: 20.6086956522
MSE for discount-scores with 2018 rankings: 30.0


In [35]:
# discount-ranks metric: compare the computed ranking with
#   (a) the official YTD table, using only players present in that table, and
#   (b) the 2018 leaderboard rows, in the order they appear in rankings_2018.
# For the non-YTD comparison printMSE enables by-year discounting for this metric.

discount_ranks_ytd, rankings_discount_ranks_ytd = printMSE("discount-ranks", official_players2, all_rankings_official, list(official_2018['PLAYER']), True)
discount_ranks_2018, rankings_discount_ranks_2018 = printMSE("discount-ranks", players, all_rankings, list(map(nameFormat, list(rankings_2018["PLAYER"]))), False)

print(discount_ranks_ytd)
print(discount_ranks_2018)
# One-column frames holding the computed orderings (not displayed by this cell).
df_discount_ranks_ytd = printDataFrame(rankings_discount_ranks_ytd, "discount-ranks vs. Official")
df_discount_ranks_2018 = printDataFrame(rankings_discount_ranks_2018, "discount-ranks vs. 2018")

MSE for discount-ranks with YTD rankings: 25.4782608696
MSE for discount-ranks with 2018 rankings: 10.4516129032
