## Project 1
### PGA Tour Golf Stats

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
import os

In [2]:
# Declare input file paths and file names
pga_data_hist_file = os.path.join(".", "input_data", "PGA_Data_Historical.csv")
player_exp_file = os.path.join(".", "input_data", "player_experience_data.csv")


In [3]:
# Load main player and stat data into dataframe
all_pga_data_df = pd.read_csv(pga_data_hist_file)

# Load player experience lookup table
player_exp_df = pd.read_csv(player_exp_file)


FileNotFoundError: [Errno 2] File ./input_data/PGA_Data_Historical.csv does not exist: './input_data/PGA_Data_Historical.csv'

In [None]:
# Merge player experience into main dataframe
all_pga_data_df = all_pga_data_df.merge(player_exp_df, how='left', on="Player Name")
all_pga_data_df.head()


In [13]:
# Create a filtered dataframe with only the stats we are interested in

# Define list of stats to keep
keep_variable_stats =  ["Season",
                        "Driving Distance - (ROUNDS)",
                        "Driving Distance - (AVG.)",
                        "Driving Distance - (TOTAL DRIVES)",
                        "Putting Average - (ROUNDS)",
                        "Putting Average - (AVG)",
                        "Total Driving - (EVENTS)",
                        "Total Driving - (TOTAL)",
                        "Total Driving - (DISTANCE RANK)",
                        "Total Driving - (ACCURACY RANK)",
                        "Total Money (Official and Unofficial) - (EVENTS)",
                        "Total Money (Official and Unofficial) - (MONEY)" , 
                        "Proximity to Hole (ARG) - (ROUNDS)",
                        "Proximity to Hole (ARG) - (AVG DTP)",
                        "Proximity to Hole (ARG) - (# OF SHOTS)",
                        "Proximity to Hole (ARG) - (SCRAMBLING RANK)",
                        "Smash Factor - (ROUNDS)",
                        "Smash Factor - (AVG.)",
                        "Smash Factor - (TOTAL SMASH FACTOR)",
                        "Smash Factor - (TOTAL ATTEMPTS)",
                        "Total Putting - (EVENTS)",
                        "Total Putting - (TOTAL)",
                        "Short Game Rating - (EVENTS)",
                        "Short Game Rating - (RATING)"]

# Define count of stats we are keeping for each player
desired_var_stats_cnt = len(keep_variable_stats)

# Create filtered dataframe
temp_filtered_stats = all_pga_data_df.copy()
temp_filtered_stats = temp_filtered_stats.loc[temp_filtered_stats["Variable"].isin(keep_variable_stats),:]

# Drop the Statistic column, and just keep the Variable column
temp_filtered_stats.drop(["Statistic"], axis=1, inplace=True)

# Rename Variable column
temp_filtered_stats.rename(columns={"Variable" : "Statistic"}, inplace=True)

temp_filtered_stats.head()

Unnamed: 0,Player Name,Season,Statistic,Value,Exp-Before-2010_x,Exp-Less_Eq-2012_x,Exp-Less_Eq-2015_x,Exp-Less_Eq-2018_x,Exp-Before-2010_y,Exp-Less_Eq-2012_y,Exp-Less_Eq-2015_y,Exp-Less_Eq-2018_y,Exp-Before-2010,Exp-Less_Eq-2012,Exp-Less_Eq-2015,Exp-Less_Eq-2018
0,Robert Garrigus,2010,Driving Distance - (ROUNDS),71,4.0,7.0,10.0,13.0,4.0,7.0,10.0,13.0,4.0,7.0,10.0,13.0
1,Bubba Watson,2010,Driving Distance - (ROUNDS),77,4.0,7.0,10.0,13.0,4.0,7.0,10.0,13.0,4.0,7.0,10.0,13.0
2,Dustin Johnson,2010,Driving Distance - (ROUNDS),83,2.0,5.0,8.0,11.0,2.0,5.0,8.0,11.0,2.0,5.0,8.0,11.0
3,Brett Wetterich,2010,Driving Distance - (ROUNDS),54,6.0,9.0,9.0,9.0,6.0,9.0,9.0,9.0,6.0,9.0,9.0,9.0
4,J.B. Holmes,2010,Driving Distance - (ROUNDS),100,4.0,7.0,10.0,13.0,4.0,7.0,10.0,13.0,4.0,7.0,10.0,13.0


In [17]:
# Create new dataframe to move our statistics from row values to columns

# Define columns of the new dataframe
desired_columns_lst = ["Player Name",
                       "Season",
                       "Driving Distance - (ROUNDS)",
                       "Driving Distance - (AVG.)", 
                       "Driving Distance - (TOTAL DRIVES)",
                       "Putting Average - (ROUNDS)",
                       "Putting Average - (AVG)",
                       "Total Driving - (EVENTS)",
                       "Total Driving - (TOTAL)",
                       "Total Driving - (DISTANCE RANK)",
                       "Total Driving - (ACCURACY RANK)",
                       "Total Money (Official and Unofficial) - (EVENTS)",
                       "Total Money (Official and Unofficial) - (MONEY)",
                       "Proximity to Hole (ARG) - (ROUNDS)",
                       "Proximity to Hole (ARG) - (AVG DTP)",
                       "Proximity to Hole (ARG) - (# OF SHOTS)",
                       "Proximity to Hole (ARG) - (SCRAMBLING RANK)",
                       "Smash Factor - (ROUNDS)",
                       "Smash Factor - (AVG.)",
                       "Smash Factor - (TOTAL SMASH FACTOR)",
                       "Smash Factor - (TOTAL ATTEMPTS)",
                       "Total Putting - (EVENTS)",
                       "Total Putting - (TOTAL)",
                       "Short Game Rating - (EVENTS)",
                       "Short Game Rating - (RATING)",
                       "Exp-Before-2010",
                       "Exp-Less_Eq-2012",
                       "Exp-Less_Eq-2015",
                       "Exp-Less_Eq-2018"]
                
cln_all_pga_data_df = pd.DataFrame(columns = desired_columns_lst)


for index, rows in temp_filtered_stats.iterrows():  
    # Search if name and season is in the dataframe, if not add
    search = cln_all_pga_data_df.loc[(cln_all_pga_data_df["Player Name"] == rows["Player Name"]) & (cln_all_pga_data_df["Season"] == rows["Season"])]
    if len(search) == 0:
        # Name and season not found.  Insert name, season, and statistic
        data = [{"Player Name" : rows["Player Name"],
                 "Season" : rows["Season"],
                 rows["Statistic"] : rows["Value"],
                 "Exp-Before-2010" : rows["Exp-Before-2010"],
                 "Exp-Less_Eq-2012" : rows["Exp-Less_Eq-2012"],
                 "Exp-Less_Eq-2015" : rows["Exp-Less_Eq-2015"],
                 "Exp-Less_Eq-2018" : rows["Exp-Less_Eq-2018"]}]            
        cln_all_pga_data_df = cln_all_pga_data_df.append(data, ignore_index=True, sort=False)
    else:
        # Name and season found.  Just add the statistic
        cln_all_pga_data_df.loc[(cln_all_pga_data_df["Player Name"] == rows["Player Name"]) & (cln_all_pga_data_df["Season"] == rows["Season"]), rows["Statistic"]] = rows["Value"]    



In [18]:
cln_all_pga_data_df

Unnamed: 0,Player Name,Season,Driving Distance - (ROUNDS),Driving Distance - (AVG.),Driving Distance - (TOTAL DRIVES),Putting Average - (ROUNDS),Putting Average - (AVG),Total Driving - (EVENTS),Total Driving - (TOTAL),Total Driving - (DISTANCE RANK),...,Smash Factor - (TOTAL SMASH FACTOR),Smash Factor - (TOTAL ATTEMPTS),Total Putting - (EVENTS),Total Putting - (TOTAL),Short Game Rating - (EVENTS),Short Game Rating - (RATING),Exp-Before-2010,Exp-Less_Eq-2012,Exp-Less_Eq-2015,Exp-Less_Eq-2018
0,Robert Garrigus,2010,71,315.5,130,71,1.786,22,176,1,...,64.900,44,22,245.2,12,5.5,4.0,7.0,10.0,13.0
1,Bubba Watson,2010,77,309.8,154,77,1.763,22,178,2,...,101.947,69,22,219.9,13,5.6,4.0,7.0,10.0,13.0
2,Dustin Johnson,2010,83,308.5,164,83,1.767,23,175,3,...,96.131,65,23,239.6,13,5.8,2.0,5.0,8.0,11.0
3,Brett Wetterich,2010,54,307.3,108,54,1.801,16,166,4,...,57.606,39,16,200.0,11,6.0,6.0,9.0,9.0,9.0
4,J.B. Holmes,2010,100,307.2,200,100,1.770,26,188,5,...,106.525,72,26,234.8,21,6.2,4.0,7.0,10.0,13.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3720,Collin Morikawa,2018,,,,,,,,,...,,,,,1,6.8,0.0,0.0,0.0,1.0
3721,Braden Thornberry,2018,,,,,,,,,...,,,,,1,5.9,0.0,0.0,0.0,2.0
3722,Matt Parziale,2018,,,,,,,,,...,,,,,1,5.3,0.0,0.0,0.0,1.0
3723,Will Grimmer,2018,,,,,,,,,...,,,,,1,4.8,0.0,0.0,0.0,1.0


In [27]:
# Drop rows with NaN values
cln_all_pga_data_df.dropna(axis=0, how='any', inplace=True)

# Convert stat columns to appropriate datatypes

# Define a conversion dictionary
conversion_dict = {"Season" : "int64",
                   "Driving Distance - (ROUNDS)" : "int64",
                   "Driving Distance - (AVG.)" : "float64",
                   "Driving Distance - (TOTAL DRIVES)" : "int64",
                   "Putting Average - (ROUNDS)" : "int64",
                   "Putting Average - (AVG)" : "float64",
                    "Total Driving - (EVENTS)" : "int64",
                    "Total Driving - (TOTAL)" : "int64",
                    "Total Driving - (DISTANCE RANK)" : "int64",
                    "Total Driving - (ACCURACY RANK)" : "int64",
# NEEDS CONVERSION  "Total Money (Official and Unofficial) - (EVENTS)"
# NEEDS CONVERSION  "Total Money (Official and Unofficial) - (MONEY)"  
# NEEDS CONVERSION  "Proximity to Hole (ARG) - (ROUNDS)"
# NEEDS CONVERSION  "Proximity to Hole (ARG) - (AVG DTP)"
                    "Proximity to Hole (ARG) - (# OF SHOTS)"  : "int64",
                    "Proximity to Hole (ARG) - (SCRAMBLING RANK)" : "int64",
                    "Smash Factor - (ROUNDS)" : "int64",
                    "Smash Factor - (AVG.)" : "float64",
                    "Smash Factor - (TOTAL SMASH FACTOR)" : "float64",
                    "Smash Factor - (TOTAL ATTEMPTS)" : "int64",
                    "Total Putting - (EVENTS)" : "int64",
                    "Total Putting - (TOTAL)" : "float64",
                    "Short Game Rating - (EVENTS)" : "int64",
                    "Short Game Rating - (RATING)" : "float64"
                  }

# Convert column datatypes, ignore errors and keep any NaNs if they exist
cln_all_pga_data_df = cln_all_pga_data_df.astype(conversion_dict, errors="ignore")

cln_all_pga_data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1661 entries, 0 to 3488
Data columns (total 29 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   Player Name                                       1661 non-null   object 
 1   Season                                            1661 non-null   int64  
 2   Driving Distance - (ROUNDS)                       1661 non-null   int64  
 3   Driving Distance - (AVG.)                         1661 non-null   float64
 4   Driving Distance - (TOTAL DRIVES)                 1661 non-null   int64  
 5   Putting Average - (ROUNDS)                        1661 non-null   int64  
 6   Putting Average - (AVG)                           1661 non-null   float64
 7   Total Driving - (EVENTS)                          1661 non-null   int64  
 8   Total Driving - (TOTAL)                           1661 non-null   int64  
 9   Total Driving - (DI

In [40]:
#cln_all_pga_data_df.to_csv("output_data/cln_all_pga_data_df.csv", index=False)

In [41]:
# If we need these, this is how to split our dataframe into groups

# Create dataframes for groups of 3 years
cln_2010_2012_df = cln_all_pga_data_df.loc[cln_all_pga_data_df["Season"].isin(["2010", "2011", "2012"]),:]
cln_2010_2012_df.reset_index(inplace=True)

cln_2013_2015_df = cln_all_pga_data_df.loc[cln_all_pga_data_df["Season"].isin(["2013", "2014", "2015"]),:]
cln_2013_2015_df.reset_index(inplace=True)

cln_2016_2018_df = cln_all_pga_data_df.loc[cln_all_pga_data_df["Season"].isin(["2016", "2017", "2018"]),:]
cln_2016_2018_df.reset_index(inplace=True)

# Create dataframes for groups of 4 years
cln_2011_2014_df = cln_all_pga_data_df.loc[cln_all_pga_data_df["Season"].isin(["2011", "2012", "2013", "2014"]),:]
cln_2011_2014_df.reset_index(inplace=True)

cln_2015_2018_df = cln_all_pga_data_df.loc[cln_all_pga_data_df["Season"].isin(["2015", "2016", "2017", "2018"]),:]
cln_2015_2018_df.reset_index(inplace=True)

    