## Project 1
### PGA Tour Golf Stats

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
import os

In [2]:
# Location of the historical PGA Tour statistics CSV, relative to the notebook's working directory
pga_data_hist_file = os.path.join(os.curdir, "input_data", "PGA_Data_Historical.csv")


In [3]:
# Read the historical statistics CSV (the non-wide version of the data) into a dataframe
pga_data_h_df = pd.read_csv(filepath_or_buffer=pga_data_hist_file)


In [4]:
# Get descriptive info on loaded dataframe: row count, column names, dtypes and memory usage.
# Useful here to confirm which columns were read as numbers and which as generic objects.
pga_data_h_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2740403 entries, 0 to 2740402
Data columns (total 5 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   Player Name  object
 1   Season       int64 
 2   Statistic    object
 3   Variable     object
 4   Value        object
dtypes: int64(1), object(4)
memory usage: 104.5+ MB


In [5]:
# Preview the first rows to see the layout of the Player Name / Season / Statistic / Variable / Value columns
pga_data_h_df.head()

Unnamed: 0,Player Name,Season,Statistic,Variable,Value
0,Robert Garrigus,2010,Driving Distance,Driving Distance - (ROUNDS),71
1,Bubba Watson,2010,Driving Distance,Driving Distance - (ROUNDS),77
2,Dustin Johnson,2010,Driving Distance,Driving Distance - (ROUNDS),83
3,Brett Wetterich,2010,Driving Distance,Driving Distance - (ROUNDS),54
4,J.B. Holmes,2010,Driving Distance,Driving Distance - (ROUNDS),100


In [6]:
# Collect the distinct seasons present in the data, in order of first appearance
season_values = pga_data_h_df["Season"].unique()
seasons_lst = [*season_values]
seasons_lst

[2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]

In [7]:
# Count the distinct players in each season, then list the players who appear in every season of the dataset

# One row per distinct (season, player) pair
pga_group1 = pga_data_h_df[["Season", "Player Name"]].drop_duplicates()

# Number of distinct players per season
player_count = pga_group1.groupby("Season").count()

# Number of seasons each player appears in (same rows, player name first)
pga_group1 = pga_group1[["Player Name", "Season"]]
player_grp = pga_group1.groupby("Player Name").count().reset_index()

# Keep only players whose season count equals the number of seasons in the dataset
n_seasons = len(seasons_lst)
in_every_season = player_grp["Season"] == n_seasons
played_all_seasons_name = player_grp.loc[in_every_season, :].reset_index(drop=True)

played_all_seasons_name


Unnamed: 0,Player Name,Season
0,Aaron Baddeley,9
1,Adam Bland,9
2,Adam Hadwin,9
3,Adam Scott,9
4,Adilson da Silva,9
...,...,...
343,Y.E. Yang,9
344,Yoshinori Fujimoto,9
345,Yusaku Miyazato,9
346,Yuta Ikeda,9


In [8]:
# Count the distinct general statistics reported in each season, then list the
# statistics that are reported in every season of the dataset

# One row per distinct (season, statistic) pair
gen_stats = pga_data_h_df[["Season", "Statistic"]].drop_duplicates()

# Number of distinct statistics per season
stats_count = gen_stats.groupby("Season").count()

# Number of seasons each statistic appears in (statistic name first)
stat_group1 = gen_stats[["Statistic", "Season"]]
stat_grp = stat_group1.groupby("Statistic").count().reset_index()

# Compare against the number of seasons actually present in the data (seasons_lst)
# instead of a literal 9, so the filter stays correct if the input file covers a
# different range of seasons.
stat_all_seasons_name = stat_grp.loc[stat_grp["Season"] == len(seasons_lst), :].reset_index(drop=True)

stat_all_seasons_name


Unnamed: 0,Statistic,Season
0,% of Potential Pts won - FedExCup Playoffs,9
1,% of Potential Pts won - FedExCup Regular Season,9
2,1-Putts per Round,9
3,11-25 Final Round Performance,9
4,2-Putts per Round,9
...,...,...
358,Total Eagles,9
359,Total Hole Outs,9
360,Total Money (Official and Unofficial),9
361,Total Putting,9


In [9]:
# Count the distinct stat variables reported in each season, then list the
# stat variables that are reported in every season of the dataset

# One row per distinct (season, variable) pair
stat_vars = pga_data_h_df[["Season", "Variable"]].drop_duplicates()

# Number of distinct stat variables per season
stat_vars_count = stat_vars.groupby("Season").count()

# Number of seasons each stat variable appears in (variable name first)
stat_var_group1 = stat_vars[["Variable", "Season"]]
stat_var_grp = stat_var_group1.groupby("Variable").count().reset_index()

# Compare against the number of seasons actually present in the data (seasons_lst)
# instead of a literal 9, so the filter stays correct if the input file covers a
# different range of seasons.
stat_vars_all_seasons_name = stat_var_grp.loc[stat_var_grp["Season"] == len(seasons_lst), :].reset_index(drop=True)

stat_vars_all_seasons_name


Unnamed: 0,Variable,Season
0,% of Potential Pts won - FedExCup Playoffs - (...,9
1,% of Potential Pts won - FedExCup Playoffs - (...,9
2,% of Potential Pts won - FedExCup Playoffs - (%),9
3,% of Potential Pts won - FedExCup Playoffs - (...,9
4,% of Potential Pts won - FedExCup Playoffs - (...,9
...,...,...
1453,Total Money (Official and Unofficial) - (MONEY),9
1454,Total Putting - (EVENTS),9
1455,Total Putting - (TOTAL),9
1456,Victory Leaders - (EVENTS),9


In [10]:
# Data cleaning - filter down to just the general stats categories we are interested in
#               - filter down to players who have every one of those stats in every season
desired_stats_cats = ["Total Money (Official and Unofficial)",
                      "Driving Distance",
                      "Smash Factor",
                      "Total Driving",
                      "Putting Average",
                      "Total Putting",
                      "Proximity to Hole (ARG)",
                      "Short Game Rating"]
desired_stats_cnt = len(desired_stats_cats)

# Number of seasons actually present in the data; used for both completeness checks
# below so they cannot drift apart (one of them used to be a literal 9).
n_seasons = len(seasons_lst)

# Filter for desired statistics
filter_stats_df = pga_data_h_df.loc[pga_data_h_df["Statistic"].isin(desired_stats_cats), :]


# Remove players who don't have these stats for every year in our dataset
# ---------------------------------------------------------------------------
# Count the distinct seasons per player within the filtered stats ...
filter_stats_player_cnt = (
    filter_stats_df[["Player Name", "Season"]]
    .drop_duplicates()
    .groupby("Player Name")
    .count()
    .reset_index()
)
# ... and keep the players that appear in every season
filter_stats_player_cnt = filter_stats_player_cnt.loc[filter_stats_player_cnt["Season"] == n_seasons, :]

# Filter for players in this list
filter_stats_df = filter_stats_df.loc[filter_stats_df["Player Name"].isin(filter_stats_player_cnt["Player Name"]), :]


# Remove players who don't have all of our stat categories for every season
# ---------------------------------------------------------------------------
# One row per distinct (player, season, statistic category)
df1 = filter_stats_df[["Player Name", "Season", "Statistic"]].drop_duplicates()

# Count number of statistics per Player Name-Season
df1 = df1.groupby(["Player Name", "Season"]).count().reset_index()
# Remove Season column and sum Statistic counts per player
df1 = df1[["Player Name", "Statistic"]].groupby("Player Name").sum()
# A complete player has every desired category in every season
df1 = df1.loc[df1["Statistic"] == (n_seasons * desired_stats_cnt), :]
players_to_keep_lst = list(df1.index.values)

# Filter for players in this list, then drop the general Statistic column and let the
# more specific Variable column take over the "Statistic" name
filter_stats_df = (
    filter_stats_df.loc[filter_stats_df["Player Name"].isin(players_to_keep_lst), :]
    .reset_index(drop=True)
    .drop(columns=["Statistic"])
    .rename(columns={"Variable": "Statistic"})
)


filter_stats_df


Unnamed: 0,Player Name,Season,Statistic,Value
0,Robert Garrigus,2010,Driving Distance - (ROUNDS),71
1,Bubba Watson,2010,Driving Distance - (ROUNDS),77
2,Dustin Johnson,2010,Driving Distance - (ROUNDS),83
3,Phil Mickelson,2010,Driving Distance - (ROUNDS),76
4,Aaron Baddeley,2010,Driving Distance - (ROUNDS),94
...,...,...,...,...
10318,Matt Jones,2018,Short Game Rating - (RATING),5.8
10319,Ryan Palmer,2018,Short Game Rating - (RATING),5.8
10320,J.J. Henry,2018,Short Game Rating - (RATING),5.6
10321,Hunter Mahan,2018,Short Game Rating - (RATING),5.4


In [36]:
# Reshape the long table (one row per player / season / statistic) into a wide table
# with one row per player / season and one column per statistic.
desired_columns_lst = ["Player Name",
                       "Season",
                       "Driving Distance - (ROUNDS)",
                       "Driving Distance - (AVG.)",
                       "Driving Distance - (TOTAL DISTANCE)",
                       "Driving Distance - (TOTAL DRIVES)",
                       "Putting Average - (ROUNDS)",
                       "Putting Average - (AVG)",
                       "Putting Average - (GIR PUTTS)",
                       "Putting Average - (GREENS HIT)",
                       "Putting Average - (BIRDIE CONVERSION)",
                       "Putting Average - (GIR RANK)",
                       "Total Driving - (EVENTS)",
                       "Total Driving - (TOTAL)",
                       "Total Driving - (DISTANCE RANK)",
                       "Total Driving - (ACCURACY RANK)",
                       "Total Money (Official and Unofficial) - (EVENTS)",
                       "Total Money (Official and Unofficial) - (MONEY)",
                       "Proximity to Hole (ARG) - (ROUNDS)",
                       "Proximity to Hole (ARG) - (AVG DTP)",
                       "Proximity to Hole (ARG) - (TOTAL DISTANCE (FEET))",
                       "Proximity to Hole (ARG) - (# OF SHOTS)",
                       "Proximity to Hole (ARG) - (SCRAMBLING RANK)",
                       "Smash Factor - (ROUNDS)",
                       "Smash Factor - (AVG.)",
                       "Smash Factor - (TOTAL SMASH FACTOR)",
                       "Smash Factor - (TOTAL ATTEMPTS)",
                       "Smash Factor - (HIGHEST VALUE)",
                       "Smash Factor - (LOWEST VALUE)",
                       "Total Putting - (EVENTS)",
                       "Total Putting - (TOTAL)",
                       "Short Game Rating - (EVENTS)",
                       "Short Game Rating - (RATING)"]

# Build one plain dict per (player, season) in a single pass and create the dataframe
# once at the end. Growing a dataframe row by row with DataFrame.append is quadratic,
# and DataFrame.append no longer exists in pandas 2.x.
records_by_key = {}                              # (player, season) -> row dict, in first-seen order
final_columns = list(desired_columns_lst)        # desired columns first ...
known_columns = set(final_columns)

long_rows = filter_stats_df[["Player Name", "Season", "Statistic", "Value"]].itertuples(index=False, name=None)
for player_name, season, statistic, value in long_rows:
    # First time this player/season is seen: start a new row with its identifying columns
    row = records_by_key.setdefault((player_name, season),
                                    {"Player Name": player_name, "Season": season})
    # A repeated statistic for the same player/season overwrites the earlier value
    row[statistic] = value
    # ... followed by any statistic not in the desired list, in order of first appearance
    if statistic not in known_columns:
        known_columns.add(statistic)
        final_columns.append(statistic)

# Statistics a player/season never reported are left as missing values (NaN)
final_cleaned_df = pd.DataFrame(list(records_by_key.values()), columns=final_columns)
    


In [35]:
# Save the wide per-player / per-season table and display it.
# to_csv does not create missing directories, so make sure the output folder exists first.
os.makedirs("output_data", exist_ok=True)
final_cleaned_df.to_csv("output_data/Final_Cleaned.csv")
final_cleaned_df


Unnamed: 0,Player Name,Season,Driving Distance - (ROUNDS),Driving Distance - (AVG.),Driving Distance - (TOTAL DISTANCE),Driving Distance - (TOTAL DRIVES),Putting Average - (ROUNDS),Putting Average - (AVG),Putting Average - (GIR PUTTS),Putting Average - (GREENS HIT),...,Smash Factor - (ROUNDS),Smash Factor - (AVG.),Smash Factor - (TOTAL SMASH FACTOR),Smash Factor - (TOTAL ATTEMPTS),Smash Factor - (HIGHEST VALUE),Smash Factor - (LOWEST VALUE),Total Putting - (EVENTS),Total Putting - (TOTAL),Short Game Rating - (EVENTS),Short Game Rating - (RATING)
0,Robert Garrigus,2010,71,315.5,41009,130,71,1.786,1413,791,...,71,1.475,64.900,44,1.485,1.439,22,245.2,12,5.5
1,Bubba Watson,2010,77,309.8,47703,154,77,1.763,1675,950,...,77,1.477,101.947,69,1.485,1.437,22,219.9,13,5.6
2,Dustin Johnson,2010,83,308.5,50588,164,83,1.767,1772,1003,...,83,1.479,96.131,65,1.485,1.427,23,239.6,13,5.8
3,Phil Mickelson,2010,76,299.1,45459,152,76,1.762,1570,891,...,76,1.475,72.272,49,1.500,1.389,20,191.4,13,6.8
4,Aaron Baddeley,2010,94,298.9,56202,188,94,1.735,1896,1093,...,94,1.476,119.525,81,1.491,1.429,26,92.4,17,6.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
328,Chad Campbell,2018,87,286.5,40684,142,87,1.815,1692,932,...,87,1.494,74.718,50,1.513,1.427,28,328.5,11,6.2
329,Johnson Wagner,2018,75,286.1,39487,138,75,1.754,1479,843,...,75,1.486,69.820,47,1.518,1.398,21,79.0,11,6.4
330,Ben Crane,2018,65,281.1,30921,110,65,1.762,1214,689,...,65,1.491,67.108,45,1.514,1.413,21,152.5,7,6.0
331,D.A. Points,2018,63,280.8,32015,114,63,1.778,1175,661,...,63,1.484,62.327,42,1.518,1.437,25,149.2,4,5.2
