# 2018 PGA Tour Strokes Gained Analysis - Performance from different distance ranges

Let's first import the modules and read the shot data:

In [2]:
import pandas as pd
import numpy as np

# Load the raw shot-level export. Fields are separated by semicolons, and
# low_memory=False makes pandas parse the file in one pass instead of in chunks.
shot_file = 'data/rshot.TXT'
data = pd.read_csv(shot_file, sep=';', low_memory=False)


Data cleaning:

In [3]:
#Column renaming and cleanup
# regex=False makes every pattern a literal string. Without it, '.' is treated as
# a regex wildcard on pandas versions that default to regex=True and do not
# special-case single-character patterns, which would blank out every column name.
data.columns = (
    data.columns
    .str.replace(' ', '', regex=False)
    .str.replace('#', 'Nr', regex=False)
    .str.replace('.', '', regex=False)
)
data['player_name'] = data['PlayerFirstName'] + " " + data['PlayerLastName']

#Columns that are not used anywhere in the analysis
unused_columns = [
    'XCoordinate', 'YCoordinate', 'ZCoordinate', 'TourCode',
    'TournNr', 'CourseNr', 'PermanentTournamentNr', 'TourDescription',
    'Lie', 'Slope', 'Elevation', '1stPuttFlag',
    'Time', 'Date', 'Yardage', 'DistancefromEdge', 'DistancetoHoleaftertheShot',
    'DistancefromCenter', 'ToLocation(Scorer)', 'ToLocation(Enhanced)', 'HoleScore', 'ParValue', 'Left/Right',
    'PlayerFirstName', 'PlayerLastName', 'IntheHoleFlag', 'AroundtheGreenFlag', 'RecoveryShot', 'NrofStrokes',
]
data_cleaned = data.drop(columns=unused_columns)

#One boolean mask holding every row filter, so the frame is only sliced once
keep_shot = (
    # Remove pair tournament with invalid data
    (data_cleaned['TournamentName'] != 'Zurich Classic of New Orleans')
    # Remove match play tournament
    & ~data_cleaned['TournamentName'].str.contains('Match Play')
    # Remove shots from rounds without distance data
    & (data_cleaned['DistancetoPin'] != 0)
    # Remove penalty and drop shots
    & (data_cleaned['ShotType(S/P/D)'] == 'S')
    # Remove putting strokes and strokes off the tee. Filtering on the category
    # column directly avoids building one dummy column per category first.
    & ~data_cleaned['StrokesGainedCategory'].isin(['Putting', 'Off the Tee'])
)
# .copy() gives an independent frame, so the column assignments below do not
# raise SettingWithCopyWarning.
data_cleaned = data_cleaned[keep_shot].copy()

# 36 inches per yard: assumes DistancetoPin is recorded in inches, so that DtP is
# the distance to the pin rounded to whole yards -- TODO confirm against the data dictionary.
INCHES_PER_YARD = 36
data_cleaned['DtP'] = (data_cleaned['DistancetoPin'] / INCHES_PER_YARD).round(0)

#Create new FromLocation category: the scorer value, falling back to the enhanced value when it is missing
data_cleaned['FromLocation'] = np.where(
    data_cleaned['FromLocation(Scorer)'].isnull(),
    data_cleaned['FromLocation(Enhanced)'],
    data_cleaned['FromLocation(Scorer)'],
)

#Remove the helper columns that were only needed for filtering / merging
data_cleaned = data_cleaned.drop(columns=[
    'ShotType(S/P/D)', 'StrokesGainedCategory',
    'FromLocation(Scorer)', 'FromLocation(Enhanced)',
])
print(data_cleaned.columns)

Index(['Year', 'PlayerNr', 'Round', 'TournamentName', 'CourseName', 'Hole',
       'Shot', 'Distance', 'DistancetoPin', 'StrokesGained/Baseline',
       'player_name', 'DtP', 'FromLocation'],
      dtype='object')


To categorize player performance in different categories, we must first define the distance ranges:
 - Define the distance thresholds (in yards). Everything at or beyond the last threshold goes into one final, open-ended category
 - Give them a label
 - Categorize every shot into one of those labels

In [4]:
#Create categories
# Bucket edges. Each bucket is [lower, upper); everything at or beyond the last
# edge goes into one open-ended bucket.
distances = [0, 20, 40, 80, 100, 120, 140, 160, 180, 200, 225, 250 ]
distance_categories = [
    str(lower) + "-" + str(upper)
    for lower, upper in zip(distances[:-1], distances[1:])
]
#Remaining shots (250 and longer)
distance_categories.append(">=" + str(distances[-1]))

#Place every shot into one category based on distance
# pd.cut with right=False reproduces the lower <= DtP < upper buckets. Shots whose
# DtP is missing get a missing category and drop out of the group-bys below; the
# earlier boolean-column + idxmax approach silently labelled them '0-20'.
# astype(object) keeps plain string labels, so the group-bys only produce the
# categories that actually occur.
data_cleaned = data_cleaned.assign(
    dist_cat=pd.cut(
        data_cleaned['DtP'],
        bins=distances + [float('inf')],
        right=False,
        labels=distance_categories,
    ).astype(object)
)

#For each tournament and round, take the average strokes gained per category to adjust for difficulty of course
# The column is selected before aggregating: averaging the whole frame would also
# try to average the text columns, which raises a TypeError on pandas >= 2.0.
baseline_keys = ['TournamentName', 'Round', 'dist_cat']
new_baseline = (
    data_cleaned
    .groupby(baseline_keys)['StrokesGained/Baseline']
    .mean()
    .rename('new_baseline')
    .reset_index()
)
#Merge with shot data
data_cleaned_new = data_cleaned.merge(new_baseline, how='left', on=baseline_keys)
#Create normalized strokes gained per shot
data_cleaned_new['adj_sg'] = data_cleaned_new['StrokesGained/Baseline'] - data_cleaned_new['new_baseline']

Now that we have the adjusted strokes gained for each shot, we can get the mean for each player and category for the whole season:

In [5]:
#Create new dataframe with each players mean strokes gained per category over the season
season_summary = (
    data_cleaned_new
    .groupby(['PlayerNr', 'player_name', 'dist_cat'])['adj_sg']
    .describe()
)

# Keep only the mean and the number of shots, under more descriptive names.
# index=str applies str() to the index labels.
stat_names = {"mean": "adjusted_strokes_gained", 'count': 'shot_count'}
player_category_adj_sg = season_summary[['mean', 'count']].rename(index=str, columns=stat_names)
player_category_adj_sg.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,adjusted_strokes_gained,shot_count
PlayerNr,player_name,dist_cat,Unnamed: 3_level_1,Unnamed: 4_level_1
1098,Jay Don Blake,0-20,0.020693,18.0
1098,Jay Don Blake,100-120,0.108274,5.0
1098,Jay Don Blake,120-140,-0.072232,5.0
1098,Jay Don Blake,140-160,0.027422,13.0
1098,Jay Don Blake,160-180,0.018411,15.0


In [6]:
#Restructure the dataframe
# Move the innermost index level (dist_cat) into the columns: one row per player
by_player_wide = player_category_adj_sg.unstack(level=-1)
# Put the distance category above the statistic name in the column header ...
by_player_wide = by_player_wide.swaplevel(0, 1, axis=1)
# ... and order the columns by distance category
player_category_adj_sg = by_player_wide.sort_index(level=0, axis=1)
player_category_adj_sg.head()

Unnamed: 0_level_0,dist_cat,0-20,0-20,100-120,100-120,120-140,120-140,140-160,140-160,160-180,160-180,...,200-225,200-225,225-250,225-250,40-80,40-80,80-100,80-100,>=250,>=250
Unnamed: 0_level_1,Unnamed: 1_level_1,adjusted_strokes_gained,shot_count,adjusted_strokes_gained,shot_count,adjusted_strokes_gained,shot_count,adjusted_strokes_gained,shot_count,adjusted_strokes_gained,shot_count,...,adjusted_strokes_gained,shot_count,adjusted_strokes_gained,shot_count,adjusted_strokes_gained,shot_count,adjusted_strokes_gained,shot_count,adjusted_strokes_gained,shot_count
PlayerNr,player_name,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
10213,Dicky Pride,0.056827,78.0,-0.140584,31.0,0.003598,44.0,0.022412,54.0,-0.047559,65.0,...,-0.076672,37.0,-0.12263,22.0,-0.050218,15.0,0.059879,15.0,-0.000228,48.0
10375,Sean McCarty,-0.074989,25.0,-0.096649,7.0,-0.198392,8.0,-0.040167,17.0,0.093993,15.0,...,0.152166,12.0,0.217961,6.0,0.622096,3.0,-0.147884,8.0,-0.011302,9.0
10423,Mike Weir,0.143719,34.0,0.169188,12.0,-0.034086,14.0,-0.207811,10.0,0.018647,10.0,...,0.041182,9.0,-0.498249,4.0,-0.208728,4.0,-0.225323,4.0,0.087567,11.0
10505,Shane Bertsch,-0.092201,15.0,0.183793,6.0,0.108558,11.0,-0.182011,17.0,-0.237614,13.0,...,0.061397,6.0,0.106468,4.0,0.057424,4.0,0.059194,4.0,-0.221622,7.0
10585,Scott McCarron,-0.067597,14.0,-0.267308,3.0,-0.707964,3.0,-0.065082,7.0,-0.051676,7.0,...,-0.038257,1.0,-0.013924,5.0,-0.174905,4.0,-0.082461,3.0,0.024695,4.0


In [10]:

#Players for plot
# Other candidates that are currently left out of the plot:
#   'Rory McIlroy', 'Jordan Spieth', 'Brooks Koepka', 'Phil Mickelson',
#   'Keegan Bradley', 'Jason Day'
players = [
    'Tiger Woods',
    'Justin Thomas',
    'Henrik Stenson',
    'Dustin Johnson',
    'Bryson DeChambeau',
]

#For sorting the categories
def getSortValue(category):
    """Return the position of ``category`` within ``distance_categories``.

    ``list.index`` raises ``ValueError`` when the label is not in the list.
    """
    position = distance_categories.index(category)
    return position






In [11]:
#Plotting
import plotly.plotly as py
import plotly
import plotly.graph_objs as go


#Minimum number of shots (exclusive) a player needs in a category to be eligible for the Max/Min traces
MIN_SHOTS = 30


def _extreme_player_per_category(stats, categories, how, min_shots=MIN_SHOTS):
    """Find the best or worst eligible player in every distance category.

    Parameters
    ----------
    stats : DataFrame
        One row per player, with 'player_name' as an index level and two-level
        columns: distance category on top, then 'adjusted_strokes_gained' and
        'shot_count'.
    categories : list of str
        Category labels to report, in output order.
    how : {'max', 'min'}
        Whether to pick the highest or the lowest adjusted strokes gained.
    min_shots : int
        Only players with more than this many shots in the category are considered.

    Returns
    -------
    DataFrame
        Columns 'cat', 'player_name', 'adjusted_strokes_gained'; one row per
        category that exists in ``stats`` and has at least one eligible player.

    Raises
    ------
    ValueError
        If ``how`` is neither 'max' nor 'min'.
    """
    if how not in ('max', 'min'):
        raise ValueError("how must be 'max' or 'min', got %r" % (how,))

    available = set(stats.columns.get_level_values(0))
    records = []
    for cat in categories:
        if cat not in available:
            continue
        cat_stats = stats[cat].reset_index()
        eligible = cat_stats[cat_stats['shot_count'] > min_shots]
        # idxmax/idxmin raise on an empty selection, so skip categories without eligible players
        if eligible.empty:
            continue
        values = eligible['adjusted_strokes_gained']
        row = eligible.loc[values.idxmax() if how == 'max' else values.idxmin()]
        records.append({
            'cat': cat,
            'player_name': row['player_name'],
            'adjusted_strokes_gained': row['adjusted_strokes_gained'],
        })
    # Build the frame once from the collected records instead of appending row by
    # row (DataFrame.append was removed in pandas 2.0).
    return pd.DataFrame(records, columns=['cat', 'player_name', 'adjusted_strokes_gained'])


#One trace per selected player
plots = []
for player in players:
    is_player = player_category_adj_sg.index.get_level_values('player_name') == player
    if not is_player.any():
        print("No shots found for " + player + ", skipping")
        continue
    playerdf = player_category_adj_sg[is_player]
    # Reshape the player's single wide row into one row per distance category
    playerdf = playerdf.transpose().unstack(level=1)
    playerdf.columns = ['adj_sg', 'shot_count']
    # Order the categories by distance rather than alphabetically
    playerdf = playerdf.loc[sorted(playerdf.index, key=getSortValue)]

    pplot = go.Scatter (
        x = playerdf.index.values,
        y = playerdf['adj_sg'].round(3),
        name = player,
        # A category the player never hit from has a missing count; show it as 0
        # instead of failing on the integer conversion
        text =  "Nr of shots: " + playerdf['shot_count'].fillna(0).astype(int).astype(str),
    )
    plots.append(pplot)

#Best eligible player per category
max_player = _extreme_player_per_category(player_category_adj_sg, distance_categories, 'max')
plots.append( go.Scatter( x = max_player['cat'], y = max_player['adjusted_strokes_gained'].round(3), name = "Max", text = max_player['player_name']) )

#Worst eligible player per category
min_player = _extreme_player_per_category(player_category_adj_sg, distance_categories, 'min')
plots.append( go.Scatter( x = min_player['cat'], y = min_player['adjusted_strokes_gained'].round(3), name = "Min", text = min_player['player_name'] ) )

layout = {
    "title": "Average strokes gained per shot in category",
    "xaxis": {"title": "Distance to pin (yards)"},
    "yaxis": {"title": "Adjusted strokes gained per shot"},
}
py.iplot({"data":plots, "layout": layout } )


## Results
We can see some interesting results from this graph:
 - Tiger Woods was the best player on tour between 80 and 100 yards last season (among players with more than 30 shots in that category).
 - Justin Thomas, who had a great season, was dominating for shorter approach shots but fell off in the longer categories.
 - Henrik Stenson, the Swedish Iceman (I might be biased), is an absolute monster from beyond 180 yards while being weaker at shorter distances. His legendary 3-wood had another impressive season judging by the >=250 category.
 
We can also have a look at the best and worst players in each category:

In [9]:
# Show the best and the worst player per category, one table after the other
display(max_player, min_player)

Unnamed: 0,cat,player_name,adjusted_strokes_gained
0,0-20,Wade Ormsby,0.165664
1,20-40,Steve Stricker,0.177454
2,40-80,Wesley Bryan,0.163028
3,80-100,Tiger Woods,0.14518
4,100-120,Justin Thomas,0.133142
5,120-140,John Peterson,0.113723
6,140-160,A.J. McInerney,0.117101
7,160-180,Joaquin Niemann,0.071355
8,180-200,Dustin Johnson,0.098001
9,200-225,Thomas Pieters,0.194123


Unnamed: 0,cat,player_name,adjusted_strokes_gained
0,0-20,Michael Block,-0.217881
1,20-40,Rick Lamb,-0.198772
2,40-80,Satoshi Kodaira,-0.223989
3,80-100,Lucas Glover,-0.169898
4,100-120,Geoff Ogilvy,-0.142645
5,120-140,Troy Matteson,-0.12191
6,140-160,Paul Dunne,-0.154384
7,160-180,Eric Axley,-0.179436
8,180-200,Steven Bowditch,-0.237942
9,200-225,Matt Jones,-0.128809
