In [1]:
import json
import os
import re
from functools import reduce

import pandas as pd
import requests as rq
from bs4 import BeautifulSoup as BS
from numpy import NaN
from tqdm import tqdm

In [2]:
def get_page(YEAR = 2024, TOURN = None, ROUND = 1):
    """Fetch one ``StatDetails`` page from the PGA Tour GraphQL endpoint.

    Parameters
    ----------
    YEAR : int
        Season sent as the ``year`` query variable.
    TOURN : str or None
        Tournament id. When given, characters 1-4 of the id must equal
        ``str(YEAR)`` and the request is restricted to that event
        (``queryType: EVENT_ONLY``). When ``None`` no event filter is sent.
    ROUND : int
        Round number (1-4); selects which stat id is requested.

    Returns
    -------
    dict or str
        The decoded JSON response. If ``TOURN`` does not match ``YEAR`` the
        string ``'YEAR AND TOURNAMENT ID NOT MATCHED'`` is returned instead and
        no request is made (kept for backward compatibility; callers that
        index the result must check for it).

    Raises
    ------
    KeyError
        If ``ROUND`` is not one of 1, 2, 3, 4.
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not answer within the timeout.
    """
    # The key can be overridden through the environment so it does not have to
    # live in the notebook; the original literal stays as the fallback.
    # NOTE(review): presumably this is the key used by the public web client - confirm.
    api_key = os.environ.get("PGATOUR_API_KEY", "da2-gsrx5bibzbb4njvhl7t37wqyl4")
    timeout_seconds = 30

    # Round number -> stat id understood by the API.
    rounds = {1: '148', 2: '149', 3: "117", 4: '285'}

    if TOURN is None:
        event_query = None
    else:
        # Characters 1-4 of the tournament id are compared against the season.
        if str(YEAR) != TOURN[1:5]:
            return 'YEAR AND TOURNAMENT ID NOT MATCHED'

        event_query = {
            "queryType": "EVENT_ONLY",
            "tournamentId": TOURN
        }

    payload = {
      "operationName": "StatDetails",
      "variables": {
        "tourCode": "R",
        "statId": rounds[ROUND],
        "year": YEAR,
        "eventQuery": event_query
      },
      "query": "query StatDetails($tourCode: TourCode!, $statId: String!, $year: Int, $eventQuery: StatDetailEventQuery) {\n  statDetails(\n    tourCode: $tourCode\n    statId: $statId\n    year: $year\n    eventQuery: $eventQuery\n  ) {\n    __typename\n    tourCode\n    year\n    displaySeason\n    statId\n    statType\n    tournamentPills {\n      tournamentId\n      displayName\n    }\n    yearPills {\n      year\n      displaySeason\n    }\n    statTitle\n    statDescription\n    tourAvg\n    lastProcessed\n    statHeaders\n    statCategories {\n      category\n      displayName\n      subCategories {\n        displayName\n        stats {\n          statId\n          statTitle\n        }\n      }\n    }\n    rows {\n      ... on StatDetailsPlayer {\n        __typename\n        playerId\n        playerName\n        country\n        countryFlag\n        rank\n        rankDiff\n        rankChangeTendency\n        stats {\n          statName\n          statValue\n          color\n        }\n      }\n      ... on StatDetailTourAvg {\n        __typename\n        displayName\n        value\n      }\n    }\n    sponsorLogo\n  }\n}"
    }

    response = rq.post(
        "https://orchestrator.pgatour.com/graphql",
        json = payload,
        headers = {"x-api-key": api_key},
        # Without a timeout a single stalled connection blocks the whole scrape.
        timeout = timeout_seconds,
    )
    # Fail loudly on 4xx/5xx instead of trying to decode an error page.
    response.raise_for_status()
    return response.json()

In [3]:
def competitions(year1 = 2023, year2 = 2024):
    """Map each season in ``year1..year2`` (inclusive) to its tournament list.

    For every season one un-filtered ``get_page`` request is made and the
    ``tournamentPills`` entries of the response are copied into a list.
    Returns a dict keyed by season; it is empty when ``year1 > year2``.
    """
    return {
        season: list(get_page(YEAR = season)["data"]['statDetails']['tournamentPills'])
        for season in range(year1, year2 + 1)
    }

In [4]:
def get_points_since(YEAR = 2023, ROUND = 1, END_YEAR = 2024):
    """Collect one round's stat value for every tournament from YEAR to END_YEAR.

    Parameters
    ----------
    YEAR : int
        First season to scrape.
    ROUND : int
        Round number passed through to ``get_page``.
    END_YEAR : int
        Last season to scrape, inclusive. Defaults to 2024, which was the
        previously hard-coded upper bound.

    Returns
    -------
    pandas.DataFrame
        Columns ``RANK``, ``ROUND <ROUND>``, ``YEAR`` and ``TOURNAMENT``, one
        row per kept result row. ``RANK`` is the 1-based position of the row
        within its tournament's response (the ``rank`` field sent by the API is
        not used). The columns are present even when nothing was scraped, so
        the frame can still be merged on its key columns.
    """
    round_column = "ROUND " + str(ROUND)
    tournaments_by_year = competitions(YEAR, END_YEAR)

    records = []
    for year in range(YEAR, END_YEAR + 1):
        for tournament in tournaments_by_year[year]:
            page = get_page(year, tournament['tournamentId'], ROUND)
            # The query in get_page selects only three fields for tour-average
            # rows and more for player rows; keep only the longer rows.
            player_rows = [row for row in page['data']['statDetails']['rows']
                           if len(row) > 3]
            for position, row in enumerate(player_rows, start = 1):
                records.append({
                    "RANK": position,
                    round_column: row["stats"][0]["statValue"],
                    "YEAR": year,
                    "TOURNAMENT": tournament['displayName'],
                })

    # Explicit columns keep the schema stable when `records` is empty.
    return pd.DataFrame(records, columns = ["RANK", round_column, "YEAR", "TOURNAMENT"])

In [5]:
def final_data(YEAR = 2023):
    """Build one table holding the values of rounds 1-4 side by side.

    Calls ``get_points_since(YEAR, n)`` for n = 1..4 (with a tqdm progress
    bar), then outer-merges the four tables left to right on ``RANK``,
    ``YEAR`` and ``TOURNAMENT`` and returns the merged DataFrame.
    """
    # Fetch every round first; merging only starts once all four are available.
    round_tables = [get_points_since(YEAR, round_number)
                    for round_number in tqdm(range(1, 5))]

    merged, *remaining = round_tables
    for round_table in remaining:
        merged = pd.merge(merged, round_table,
                          on=['RANK', 'YEAR', 'TOURNAMENT'], how = 'outer')
    return merged

In [6]:
# Scrape rounds 1-4 for every season from 2000 through 2024 (the upper bound is
# set inside get_points_since). This is slow: the recorded run took about 43 minutes.
data = final_data(2000)

100%|███████████████████████████████████████████████████████████████████████████████████| 4/4 [42:44<00:00, 641.03s/it]


In [7]:
# Persist the merged table as golf_data.csv, a path relative to the current
# working directory. `index` is left at its pandas default, so the row index
# is written as the first column.
data.to_csv('golf_data.csv')