In [73]:
import pandas as pd;
import numpy as np;

In [111]:
# Load the semicolon-separated shot export. low_memory=False makes pandas
# parse the file in one pass instead of in chunks.
data = pd.read_csv(
    '../data/rshot.csv.TXT',
    sep=';',
    low_memory=False,
)

In [112]:
# Normalise the column headers: strip spaces, spell '#' out as 'Nr', and drop
# dots. regex=False forces literal matching; '.' is a regex wildcard, so the
# result must not depend on the pandas version's default for `regex`.
data.columns = (
    data.columns
    .str.replace(' ', '', regex=False)
    .str.replace('#', 'Nr', regex=False)
    .str.replace('.', '', regex=False)
)
data.columns

Index(['TourCode', 'TourDescription', 'Year', 'TournNr', 'PlayerNr',
       'CourseNr', 'PermanentTournamentNr', 'PlayerFirstName',
       'PlayerLastName', 'Round', 'TournamentName', 'CourseName', 'Hole',
       'HoleScore', 'ParValue', 'Yardage', 'Shot', 'ShotType(S/P/D)',
       'NrofStrokes', 'FromLocation(Scorer)', 'FromLocation(Enhanced)',
       'ToLocation(Scorer)', 'ToLocation(Enhanced)', 'Distance',
       'DistancetoPin', 'IntheHoleFlag', 'AroundtheGreenFlag', '1stPuttFlag',
       'DistancetoHoleaftertheShot', 'Time', 'Lie', 'Elevation', 'Slope',
       'XCoordinate', 'YCoordinate', 'ZCoordinate', 'DistancefromCenter',
       'DistancefromEdge', 'Date', 'Left/Right', 'StrokesGained/Baseline',
       'StrokesGainedCategory', 'RecoveryShot'],
      dtype='object')

In [134]:
# List the distinct values (including NaN) recorded in the enhanced
# starting-location column.
data.loc[:, 'FromLocation(Enhanced)'].unique()

array([nan, 'Right Rough', 'Unmapped', 'Left Fairway', 'Left Rough',
       'Right Intermediate', 'Left Intermediate', 'Right Fairway',
       'Right Front Green Side Bunker', 'Front Center Green Side Bunker',
       'Front Left Green Side Bunker', 'Left Green Side Bunker',
       'Right Green Side Bunker', 'Rear Green Side Bunker',
       'Right Rear Green Side Bunker', 'Left Rear Green Side Bunker'],
      dtype=object)

In [114]:
# Flag shots whose scorer-recorded starting location contains 'Rough'.
# Rows with a missing location are flagged False by the notna() guard.
data['is_rough'] = (
    data['FromLocation(Scorer)'].notna()
    & data['FromLocation(Scorer)'].str.contains('Rough')
)

In [115]:
# Flag shots whose enhanced starting location is non-missing and contains
# 'Fairway' (or 'Enhanced'), combined with one further negated condition.
# NOTE(review): the trailing `~data.s` looks cut off in this export; confirm
# the intended column before relying on this flag.
# NOTE(review): none of the values listed by the unique() output above
# contain 'Enhanced', so that alternative appears to match nothing; verify.
data['is_fairway'] =  ~data['FromLocation(Enhanced)'].isna() & ( data['FromLocation(Enhanced)'].str.contains('Fairway') |  data['FromLocation(Enhanced)'].str.contains('Enhanced') ) & ~data.s

In [116]:
# Keep only the rows with a strictly positive DistancetoPin.
data = data.loc[data['DistancetoPin'] > 0]

In [117]:
# Remove the Zurich Classic of New Orleans: a pairs (team) tournament whose
# data is invalid for this analysis.
data = data[data['TournamentName'] != 'Zurich Classic of New Orleans']
# Remove match-play tournaments. na=False treats a missing tournament name as
# "no match", so the mask stays boolean and `~` can invert it safely; such
# rows are kept.
data = data[~data['TournamentName'].str.contains('Match Play', na=False)]

In [118]:
# Split the filtered shots into the two groups being compared, using the
# boolean flag columns computed earlier.
rough_shots = data.loc[data['is_rough']]
fairway_shots = data.loc[data['is_fairway']]

In [119]:
# Per-course mean, count and standard deviation of strokes gained for rough
# shots. The column is selected before describe() so that only this one
# column is summarised instead of every column in the frame.
rough_sg = (
    rough_shots
    .groupby('CourseName')['StrokesGained/Baseline']
    .describe()[['mean', 'count', 'std']]
)

In [120]:
# Per-course mean, count and standard deviation of strokes gained for fairway
# shots. The column is selected before describe() so that only this one
# column is summarised instead of every column in the frame.
fairway_sg = (
    fairway_shots
    .groupby('CourseName')['StrokesGained/Baseline']
    .describe()[['mean', 'count', 'std']]
)

In [121]:
# Put the per-course fairway and rough summaries side by side; axis=1 aligns
# the two frames on their shared CourseName index. The inputs are the
# `fairway_sg` / `rough_sg` frames built in the preceding cells (the names
# `fairway_sg_mean` / `rough_sg_mean` are not defined anywhere in the notebook).
sg_mean = pd.concat([fairway_sg, rough_sg], axis=1)

In [122]:
# Give the six concatenated columns unambiguous names: the fairway statistics
# first, then the rough statistics, in the same order they were concatenated.
sg_mean.columns = [
    'fairway_mean', 'fairway_count', 'fairway_std',
    'rough_mean', 'rough_count', 'rough_std',
]

In [130]:
# Overall strokes-gained averages across every course, per lie type.
all_rough_mean = rough_shots['StrokesGained/Baseline'].mean()
all_fairway_mean = fairway_shots['StrokesGained/Baseline'].mean()
# Express each course mean relative to the overall mean for that lie type.
# The centred values are derived from the per-course summaries (rough_sg /
# fairway_sg) rather than from sg_mean's own columns, so re-running this cell
# does not subtract the overall mean a second time.
sg_mean['rough_mean'] = rough_sg['mean'] - all_rough_mean
sg_mean['fairway_mean'] = fairway_sg['mean'] - all_fairway_mean
# Difference between the centred rough and centred fairway means per course.
sg_mean['diff'] = sg_mean['rough_mean'] - sg_mean['fairway_mean']

In [131]:
import scipy.stats as stats


In [132]:
def two_sample_ttest(row):
    """Return the p-value of a t-test between a row's fairway and rough stats.

    The test is computed from summary statistics with
    ``scipy.stats.ttest_ind_from_stats`` using ``equal_var=False`` (the two
    groups are not assumed to share a variance).

    Parameters
    ----------
    row : mapping
        Anything indexable by the keys ``fairway_mean``, ``fairway_std``,
        ``fairway_count``, ``rough_mean``, ``rough_std`` and ``rough_count``
        (for example one row of ``sg_mean``).

    Returns
    -------
    float
        The p-value; the test statistic itself is discarded.
    """
    fairway_summary = (row['fairway_mean'], row['fairway_std'], row['fairway_count'])
    rough_summary = (row['rough_mean'], row['rough_std'], row['rough_count'])
    outcome = stats.ttest_ind_from_stats(*fairway_summary,
                                         *rough_summary,
                                         equal_var=False)
    # The result is a (statistic, p-value) pair; only the p-value is needed.
    return outcome[1]

# Run the t-test once per course (one row each) and store the p-value.
sg_mean['p_val'] = sg_mean.apply(two_sample_ttest, axis='columns')

In [133]:
# Significance threshold applied to each course's p-value.
# NOTE(review): every course is tested at the same alpha with no
# multiple-comparison correction; confirm that this is intended.
alpha = 0.05
sg_mean['significant'] = sg_mean['p_val'].lt(alpha)
# Show the courses ordered from the most negative difference upwards.
sg_mean.sort_values(by='diff')

Unnamed: 0_level_0,fairway_mean,fairway_count,fairway_std,rough_mean,rough_count,rough_std,diff,p_val,significant
CourseName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
East Lake GC,0.066746,1155.0,0.331132,-0.059045,1042.0,0.338802,-0.125792,3.155329e-18,True
Muirfield Village GC,0.019369,5071.0,0.383785,-0.054591,2977.0,0.386643,-0.07396,1.190227e-16,True
Bellerive CC,0.046417,6053.0,0.320432,-0.027218,2590.0,0.360528,-0.073635,3.7256279999999995e-19,True
TPC Southwind,0.013335,4884.0,0.34712,-0.057463,4077.0,0.381274,-0.070797,9.702693e-20,True
TPC Potomac at Avenel Farm,0.044736,4194.0,0.358094,-0.015418,3132.0,0.359574,-0.060153,1.413183e-12,True
TPC Deere Run,0.032639,5967.0,0.358606,-0.026956,2575.0,0.354337,-0.059595,1.353315e-12,True
Sedgefield CC,0.047738,5103.0,0.331837,-0.006151,3117.0,0.349761,-0.053889,5.335104e-12,True
Bay Hill Club & Lodge,-0.016333,5734.0,0.352645,-0.065948,1893.0,0.370427,-0.049615,3.369506e-07,True
Ridgewood CC,0.020935,5019.0,0.326481,-0.017918,2859.0,0.338498,-0.038854,7.180647e-07,True
Firestone CC (South),0.025039,2667.0,0.353135,-0.013633,2979.0,0.36569,-0.038672,5.426539e-05,True


In [93]:
# Overall mean strokes gained across all rough shots.
rough_shots.loc[:, 'StrokesGained/Baseline'].mean()

-0.021018295390913013

In [94]:
# Overall mean strokes gained across all fairway shots.
fairway_shots.loc[:, 'StrokesGained/Baseline'].mean()

0.016497696153139056