## Project 1
### PGA Tour Golf Stats

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
import seaborn as sns
import os
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scipy.stats import linregress

ModuleNotFoundError: No module named 'statsmodels'

## Data Import and Cleaning

In [None]:
# Build the paths of both input CSVs inside ./input_data
pga_data_hist_file, player_exp_file = (
    os.path.join(".", "input_data", csv_name)
    for csv_name in ("PGA_Data_Historical.csv", "Scoring_Average_1980_2019.csv")
)


In [None]:
# Read both inputs in one step:
#   all_pga_data_df - main player/stat table
#   player_exp_df   - lookup table used to derive each player's experience
all_pga_data_df, player_exp_df = map(pd.read_csv, (pga_data_hist_file, player_exp_file))

In [None]:
# Data cleaning on the player experience input file CSV

# Calendar year of every record in the lookup table
player_exp_df["Year"] = pd.DatetimeIndex(player_exp_df["DATE"]).year

# One row per (player, year), counting only years prior to 2019.
# The two needed columns are selected explicitly rather than dropping the
# others in place on a slice: that avoids SettingWithCopyWarning and keeps
# any additional CSV columns from producing several rows per player.
player_years_df = (
    player_exp_df.loc[player_exp_df["Year"] < 2019, ["PLAYER_NAME", "Year"]]
    .drop_duplicates()
)

# Number of distinct years per player -> exactly one row per player.
# Counting the "Year" column explicitly (instead of transform("count") on the
# whole frame) does not depend on how many other columns are present.
player_exp_df = (
    player_years_df.groupby("PLAYER_NAME", as_index=False)["Year"]
    .count()
    .rename(columns={"Year": "Years_on_Tour"})
)

# Merge player experience into main dataframe; players without a match keep NaN
all_pga_data_df = (
    all_pga_data_df
    .merge(player_exp_df, how='left', left_on="Player Name", right_on="PLAYER_NAME")
    .drop("PLAYER_NAME", axis=1)
)

In [None]:
# Narrow the main table down to the statistics used in this analysis

# Values of the "Variable" column to retain
keep_variable_stats = [
    "Season",
    "Driving Distance - (AVG.)",
    "Smash Factor - (AVG.)",
    "Short Game Rating - (RATING)",
    "Proximity to Hole (ARG) - (AVG DTP)",
    "Putting Average - (AVG)",
    "Total Money (Official and Unofficial) - (MONEY)",
]

# Number of entries in the keep-list
desired_var_stats_cnt = len(keep_variable_stats)

# Work on a copy: keep matching rows, discard the original "Statistic" column
# and let the "Variable" column take over that name
temp_filtered_stats = (
    all_pga_data_df.copy()
    .loc[lambda stats_df: stats_df["Variable"].isin(keep_variable_stats), :]
    .drop(["Statistic"], axis=1)
    .rename(columns={"Variable" : "Statistic"})
)

temp_filtered_stats.head()

In [None]:
# Create new dataframe to move our statistics from row values to columns
#
# The reshape is done with a vectorised unstack instead of growing a frame
# row by row inside iterrows(): the loop was quadratic in the number of rows
# and relied on DataFrame.append, which no longer exists in pandas 2.0.

# Define columns of the new dataframe
desired_columns_lst = ["Player Name",
                       "Season",
                       "Driving Distance - (AVG.)",
                       "Smash Factor - (AVG.)",
                       "Short Game Rating - (RATING)",
                       "Proximity to Hole (ARG) - (AVG DTP)",
                       "Putting Average - (AVG)",
                       "Total Money (Official and Unofficial) - (MONEY)",
                       "Years_on_Tour"]

# Columns identifying one output row, and the statistic columns to fill
pivot_key_cols = ["Player Name", "Season"]
pivot_stat_cols = [col for col in desired_columns_lst
                   if col not in pivot_key_cols + ["Years_on_Tour"]]

# Only rows whose statistic maps to one of the output columns
long_stats_df = temp_filtered_stats.loc[temp_filtered_stats["Statistic"].isin(pivot_stat_cols)]

# One column per statistic. If a player/season/statistic occurs more than
# once, the last occurrence wins (same as repeated assignment in the old loop).
wide_stats_df = (
    long_stats_df
    .drop_duplicates(subset=pivot_key_cols + ["Statistic"], keep="last")
    .set_index(pivot_key_cols + ["Statistic"])["Value"]
    .unstack("Statistic")
    .reset_index()
)

# Years_on_Tour is taken from the first row seen for each player/season;
# starting the merge from these rows also keeps first-appearance row order.
first_rows_df = (
    long_stats_df
    .drop_duplicates(subset=pivot_key_cols, keep="first")
    [pivot_key_cols + ["Years_on_Tour"]]
)

# Statistics never reported for anyone still appear as all-NaN columns
cln_all_pga_data_df = (
    first_rows_df
    .merge(wide_stats_df, on=pivot_key_cols, how="left")
    .reindex(columns=desired_columns_lst)
)

In [None]:
# Data cleaning - keep only rows in which every column has a value
cln_all_pga_data_df = cln_all_pga_data_df.dropna(axis=0, how='any')


# Data cleaning - Convert the Proximity to Hole (ARG) - (AVG DTP) to inches
# Helper that performs the conversion for a single value
def conv_dtp_to_inches(input_dtp):
    """Convert a feet-and-inches string such as 7' 3" to total inches.

    The text is split on an apostrophe followed by a space; the first piece
    is read as feet and the second piece, with double quotes removed, as
    inches.

    Parameters
    ----------
    input_dtp : str
        Distance formatted as <feet>' <inches>".

    Returns
    -------
    float
        12 * feet + inches.
    """
    pieces = input_dtp.split("' ")
    total_inches = 12 * float(pieces[0])
    total_inches += float(pieces[1].replace("\"",""))
    return total_inches

# Add the proximity statistic expressed in inches
cln_all_pga_data_df["Proximity to Hole (ARG) - (AVG DTP) (IN)"] = cln_all_pga_data_df["Proximity to Hole (ARG) - (AVG DTP)"].apply(conv_dtp_to_inches)

# Drop old Proximity to Hole (ARG) - (AVG DTP) column
cln_all_pga_data_df = cln_all_pga_data_df.drop("Proximity to Hole (ARG) - (AVG DTP)", axis=1)


# Data cleaning - Remove dollar sign and commas from earnings column
# regex=False makes both replacements literal. "$" is an end-of-string anchor
# when interpreted as a regular expression, and the default value of `regex`
# differs between pandas versions, so the literal form is the only one that
# strips the dollar sign everywhere.
money_column = "Total Money (Official and Unofficial) - (MONEY)"
cln_all_pga_data_df[money_column] = (
    cln_all_pga_data_df[money_column]
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
)


In [None]:
# Final column layout: identifiers first, then the statistics, then experience
column_order = [
    "Player Name",
    "Season",
    "Driving Distance - (AVG.)",
    "Smash Factor - (AVG.)",
    "Short Game Rating - (RATING)",
    "Proximity to Hole (ARG) - (AVG DTP) (IN)",
    "Putting Average - (AVG)",
    "Total Money (Official and Unofficial) - (MONEY)",
    "Years_on_Tour",
]

# reindex conforms the frame to exactly this column list, in this order
reordered_df = cln_all_pga_data_df.reindex(columns=column_order)
cln_all_pga_data_df = reordered_df


In [None]:
# Preview the first rows of the cleaned, reordered dataframe
cln_all_pga_data_df.head()

In [None]:
# Cast every statistic column to a numeric dtype

# Columns stored as floating point
float_stat_columns = [
    "Driving Distance - (AVG.)",
    "Smash Factor - (AVG.)",
    "Short Game Rating - (RATING)",
    "Proximity to Hole (ARG) - (AVG DTP) (IN)",
    "Putting Average - (AVG)",
    "Total Money (Official and Unofficial) - (MONEY)",
]

# Column name -> target dtype; Season and Years_on_Tour are whole numbers
conversion_dict = {
    "Season" : "int64",
    **{float_column : "float64" for float_column in float_stat_columns},
    "Years_on_Tour" : "int64",
}

# errors="ignore": a failed conversion does not raise
cln_all_pga_data_df = cln_all_pga_data_df.astype(conversion_dict, errors="ignore")

cln_all_pga_data_df.info()

In [None]:
# Output to CSV or Read from CSV using this cell

#cln_all_pga_data_df.to_csv("output_data/cln_all_pga_data_df.csv", index=False)

## Total Column, Row, and Player Counts

In [None]:
# Dimensions of the cleaned dataset
row_cnt, column_cnt = cln_all_pga_data_df.shape
print(f"There are {row_cnt} rows and {column_cnt} columns in our dataset.")

In [None]:
# Count the distinct player names
nbr_total_players = cln_all_pga_data_df["Player Name"].unique().size
print(f"There are {nbr_total_players} unique players in our dataset.")

In [None]:
# Distinct players per season, shown as a one-column table indexed by Season
player_cnt_by_season = (
    cln_all_pga_data_df.copy()[["Player Name", "Season"]]
    .drop_duplicates()
    .groupby("Season")
    .count()
    .rename(columns={"Player Name" : "Player Count"})
)
player_cnt_by_season

## Create Statistical Summary Tables

In [None]:
# DEFINE FUNCTION TO CALCULATE SUMMARY STATS
def calculate_summ_stats(var_of_interest):
    """Append per-season summary statistics of one column to module-level lists.

    For every season in the module-level ``season_lst`` the values of
    ``var_of_interest`` in ``cln_all_pga_data_df`` are summarised, and a
    ``[season, value]`` pair is appended to each of ``mean_lst``,
    ``median_lst``, ``variance_lst``, ``std_dev_lst``, ``SEM_lst`` and
    ``normal_p_lst`` (the p-value returned by ``scipy.stats.normaltest``).
    All of these names must exist before the call; nothing is returned.

    Note: ``np.var``/``np.std`` and ``st.sem`` are each called with their
    library defaults.

    Parameters
    ----------
    var_of_interest : str
        Name of the column in ``cln_all_pga_data_df`` to summarise.
    """
    season_column = cln_all_pga_data_df["Season"]
    for season in season_lst:
        season_values = cln_all_pga_data_df.loc[season_column == season, var_of_interest]

        # Same order as the accumulators are filled: mean, median, variance,
        # standard deviation, standard error
        for target_lst, stat_func in ((mean_lst, np.mean),
                                      (median_lst, np.median),
                                      (variance_lst, np.var),
                                      (std_dev_lst, np.std),
                                      (SEM_lst, st.sem)):
            target_lst.append([season, stat_func(season_values)])

        # Only the p-value of the normality test is recorded
        normality_p = st.normaltest(season_values)[1]
        normal_p_lst.append([season, normality_p])
        

In [None]:
# CREATE STAT SUMMARY TABLE #1 - Driving Distance - (AVG.)

var_of_interest = "Driving Distance - (AVG.)"
season_lst = cln_all_pga_data_df["Season"].unique()

# Empty accumulators that calculate_summ_stats() fills with [season, value] pairs
mean_lst, median_lst, variance_lst = [], [], []
std_dev_lst, SEM_lst, normal_p_lst = [], [], []

calculate_summ_stats(var_of_interest)

# One two-column frame (Season + statistic) per accumulator
mean_df = pd.DataFrame(mean_lst).rename(columns={0:"Season", 1:"Mean Driving Distance (AVG)"})
median_df = pd.DataFrame(median_lst).rename(columns={0:"Season", 1:"Median Driving Distance (AVG)"})
variance_df = pd.DataFrame(variance_lst).rename(columns={0:"Season", 1:"Variance Driving Distance (AVG)"})
std_dev_df = pd.DataFrame(std_dev_lst).rename(columns={0:"Season", 1:"Std Dev Driving Distance (AVG)"})
SEM_df = pd.DataFrame(SEM_lst).rename(columns={0:"Season", 1:"SEM Driving Distance (AVG)"})
normal_p_df = pd.DataFrame(normal_p_lst).rename(columns={0:"Season", 1:"Normality (p) Driving Distance (AVG)"})

# Join the pieces on Season and use Season as the index
driv_stat_summ_df = (
    mean_df
    .merge(median_df, on="Season", how='left')
    .merge(variance_df, on="Season", how='left')
    .merge(std_dev_df, on="Season", how='left')
    .merge(SEM_df, on="Season", how='left')
    .merge(normal_p_df, on="Season", how='left')
    .set_index("Season")
)

driv_stat_summ_df

In [None]:
# CREATE STAT SUMMARY TABLE #2 - Smash Factor - (AVG.)

var_of_interest = "Smash Factor - (AVG.)"
season_lst = cln_all_pga_data_df["Season"].unique()

# Empty accumulators that calculate_summ_stats() fills with [season, value] pairs
mean_lst, median_lst, variance_lst = [], [], []
std_dev_lst, SEM_lst, normal_p_lst = [], [], []

calculate_summ_stats(var_of_interest)

# One two-column frame (Season + statistic) per accumulator
mean_df = pd.DataFrame(mean_lst).rename(columns={0:"Season", 1:"Mean Smash Factor (AVG)"})
median_df = pd.DataFrame(median_lst).rename(columns={0:"Season", 1:"Median Smash Factor (AVG)"})
variance_df = pd.DataFrame(variance_lst).rename(columns={0:"Season", 1:"Variance Smash Factor (AVG)"})
std_dev_df = pd.DataFrame(std_dev_lst).rename(columns={0:"Season", 1:"Std Dev Smash Factor (AVG)"})
SEM_df = pd.DataFrame(SEM_lst).rename(columns={0:"Season", 1:"SEM Smash Factor (AVG)"})
normal_p_df = pd.DataFrame(normal_p_lst).rename(columns={0:"Season", 1:"Normality (p) Smash Factor (AVG)"})

# Join the pieces on Season and use Season as the index
smash_stat_summ_df = (
    mean_df
    .merge(median_df, on="Season", how='left')
    .merge(variance_df, on="Season", how='left')
    .merge(std_dev_df, on="Season", how='left')
    .merge(SEM_df, on="Season", how='left')
    .merge(normal_p_df, on="Season", how='left')
    .set_index("Season")
)

smash_stat_summ_df

In [None]:
# CREATE STAT SUMMARY TABLE #3 - Short Game Rating - (RATING)

var_of_interest = "Short Game Rating - (RATING)"
season_lst = cln_all_pga_data_df["Season"].unique()

# Empty accumulators that calculate_summ_stats() fills with [season, value] pairs
mean_lst, median_lst, variance_lst = [], [], []
std_dev_lst, SEM_lst, normal_p_lst = [], [], []

calculate_summ_stats(var_of_interest)

# One two-column frame (Season + statistic) per accumulator
mean_df = pd.DataFrame(mean_lst).rename(columns={0:"Season", 1:"Mean Short Game Rating"})
median_df = pd.DataFrame(median_lst).rename(columns={0:"Season", 1:"Median Short Game Rating"})
variance_df = pd.DataFrame(variance_lst).rename(columns={0:"Season", 1:"Variance Short Game Rating"})
std_dev_df = pd.DataFrame(std_dev_lst).rename(columns={0:"Season", 1:"Std Dev Short Game Rating"})
SEM_df = pd.DataFrame(SEM_lst).rename(columns={0:"Season", 1:"SEM Short Game Rating"})
normal_p_df = pd.DataFrame(normal_p_lst).rename(columns={0:"Season", 1:"Normality (p) Short Game Rating"})

# Join the pieces on Season and use Season as the index
sht_gm_rat_summ_df = (
    mean_df
    .merge(median_df, on="Season", how='left')
    .merge(variance_df, on="Season", how='left')
    .merge(std_dev_df, on="Season", how='left')
    .merge(SEM_df, on="Season", how='left')
    .merge(normal_p_df, on="Season", how='left')
    .set_index("Season")
)

sht_gm_rat_summ_df

In [None]:
# CREATE STAT SUMMARY TABLE #4 - Proximity to Hole (ARG) - (AVG DTP) (IN)

var_of_interest = "Proximity to Hole (ARG) - (AVG DTP) (IN)"
season_lst = cln_all_pga_data_df["Season"].unique()

# Empty accumulators that calculate_summ_stats() fills with [season, value] pairs
mean_lst, median_lst, variance_lst = [], [], []
std_dev_lst, SEM_lst, normal_p_lst = [], [], []

calculate_summ_stats(var_of_interest)

# One two-column frame (Season + statistic) per accumulator
mean_df = pd.DataFrame(mean_lst).rename(columns={0:"Season", 1:"Mean Proximity ARG (IN)"})
median_df = pd.DataFrame(median_lst).rename(columns={0:"Season", 1:"Median Proximity ARG (IN)"})
variance_df = pd.DataFrame(variance_lst).rename(columns={0:"Season", 1:"Variance Proximity ARG (IN)"})
std_dev_df = pd.DataFrame(std_dev_lst).rename(columns={0:"Season", 1:"Std Dev Proximity ARG (IN)"})
SEM_df = pd.DataFrame(SEM_lst).rename(columns={0:"Season", 1:"SEM Proximity ARG (IN)"})
normal_p_df = pd.DataFrame(normal_p_lst).rename(columns={0:"Season", 1:"Normality (p) Proximity ARG (IN)"})

# Join the pieces on Season and use Season as the index
prox_arg_stat_summ_df = (
    mean_df
    .merge(median_df, on="Season", how='left')
    .merge(variance_df, on="Season", how='left')
    .merge(std_dev_df, on="Season", how='left')
    .merge(SEM_df, on="Season", how='left')
    .merge(normal_p_df, on="Season", how='left')
    .set_index("Season")
)

prox_arg_stat_summ_df

In [None]:
# CREATE STAT SUMMARY TABLE #5 - Putting Average - (AVG)

var_of_interest = "Putting Average - (AVG)"
season_lst = cln_all_pga_data_df["Season"].unique()

# Empty accumulators that calculate_summ_stats() fills with [season, value] pairs
mean_lst, median_lst, variance_lst = [], [], []
std_dev_lst, SEM_lst, normal_p_lst = [], [], []

calculate_summ_stats(var_of_interest)

# One two-column frame (Season + statistic) per accumulator
mean_df = pd.DataFrame(mean_lst).rename(columns={0:"Season", 1:"Mean Putting Average"})
median_df = pd.DataFrame(median_lst).rename(columns={0:"Season", 1:"Median Putting Average"})
variance_df = pd.DataFrame(variance_lst).rename(columns={0:"Season", 1:"Variance Putting Average"})
std_dev_df = pd.DataFrame(std_dev_lst).rename(columns={0:"Season", 1:"Std Dev Putting Average"})
SEM_df = pd.DataFrame(SEM_lst).rename(columns={0:"Season", 1:"SEM Putting Average"})
normal_p_df = pd.DataFrame(normal_p_lst).rename(columns={0:"Season", 1:"Normality (p) Putting Average"})

# Join the pieces on Season and use Season as the index
putt_avg_stat_summ_df = (
    mean_df
    .merge(median_df, on="Season", how='left')
    .merge(variance_df, on="Season", how='left')
    .merge(std_dev_df, on="Season", how='left')
    .merge(SEM_df, on="Season", how='left')
    .merge(normal_p_df, on="Season", how='left')
    .set_index("Season")
)

putt_avg_stat_summ_df

## Analyze Driving Distance Average

In [None]:
# BOXPLOT FOR AVERAGE DRIVING DISTANCE

var_of_interest = "Driving Distance - (AVG.)"

# One Series of the statistic per season, 2010 through 2018
driv_data_boxplot_to_plot = [
    cln_all_pga_data_df.loc[cln_all_pga_data_df["Season"] == season_year, var_of_interest]
    for season_year in range(2010, 2019)
]

# Per-season names are kept because the ANOVA cell refers to them individually
(driv_dist_2010, driv_dist_2011, driv_dist_2012,
 driv_dist_2013, driv_dist_2014, driv_dist_2015,
 driv_dist_2016, driv_dist_2017, driv_dist_2018) = driv_data_boxplot_to_plot

# Single-axes figure
fig, ax = plt.subplots(figsize=(9, 6))

# One box per season; outliers drawn as large red circles
boxplot = ax.boxplot(driv_data_boxplot_to_plot,
                     flierprops=dict(markerfacecolor='r', marker='o', markersize=8.0))
ax.set_xticklabels(list(range(2010, 2019)))
ax.set_title(var_of_interest)
ax.set_xlabel("Season")
ax.set_ylabel("Yards")
plt.show()

In [None]:
# DRIVING DISTANCE ONE-WAY ANOVA
# Our null hypothesis states that there are equal means in the populations from which the groups of data were sampled

driv_anova_result = st.f_oneway(driv_dist_2010, driv_dist_2011, driv_dist_2012,
                                driv_dist_2013, driv_dist_2014, driv_dist_2015,
                                driv_dist_2016, driv_dist_2017, driv_dist_2018)
print(driv_anova_result)

# State the conclusion the computed p-value supports rather than a fixed
# message, so the text cannot contradict the result when the data change
if driv_anova_result.pvalue < 0.05:
    print("P-value is < 0.05, therefore there are significant differences among the means")
elif driv_anova_result.pvalue >= 0.05:
    print("P-value is >= 0.05, therefore there are NO significant differences among the means")
else:
    # NaN compares False both ways (e.g. a group was empty)
    print("P-value is undefined (NaN); the ANOVA could not be evaluated")

In [None]:
# Tukey HSD: pairwise comparison of the driving-distance means between seasons
tukey_values = cln_all_pga_data_df["Driving Distance - (AVG.)"]
tukey_groups = cln_all_pga_data_df["Season"]
m_comp = pairwise_tukeyhsd(endog=tukey_values, groups=tukey_groups, alpha=0.05)
print(m_comp)

In [None]:
# DENSITY PLOT FOR AVERAGE DRIVING DISTANCE

var_of_interest = "Driving Distance - (AVG.)"
season_lst = cln_all_pga_data_df["Season"].unique()

fig, ax = plt.subplots(figsize=(10, 8))

# One kernel-density curve per season on the shared axes.
# sns.kdeplot is used directly: sns.distplot(hist=False, kde=True) is
# deprecated in seaborn and only forwarded to kdeplot anyway.
for season in season_lst:
    driv_dist_season = cln_all_pga_data_df.loc[cln_all_pga_data_df["Season"]==season, var_of_interest]
    sns.kdeplot(driv_dist_season, linewidth=3, label=season, ax=ax)

# Plot formatting
ax.legend(prop={'size': 10}, title='Season')
ax.set_title("Density Plot of Average Driving Distance")
ax.set_xlabel("Yards")
ax.set_ylabel("Density");

In [None]:
# If we need these, this is how to split our dataframe into groups

def _season_subset(seasons):
    """Return the rows of cln_all_pga_data_df whose Season is in `seasons`.

    `seasons` is a list of integer years. Season is compared as an integer:
    the rest of the notebook filters with `Season == 2010`, so matching
    against the strings "2010", ... selected no rows at all. The cast keeps
    the filter working even if Season were still stored as text.

    The previous index is kept as a leading "index" column (reset_index
    without drop), because the scatter-plot cells select columns by position.
    """
    in_seasons = cln_all_pga_data_df["Season"].astype(int).isin(seasons)
    return cln_all_pga_data_df.loc[in_seasons, :].reset_index()


# Create dataframes for groups of 3 years
cln_2010_2012_df = _season_subset([2010, 2011, 2012])
cln_2013_2015_df = _season_subset([2013, 2014, 2015])
cln_2016_2018_df = _season_subset([2016, 2017, 2018])

# Create dataframes for groups of 4 years
cln_2011_2014_df = _season_subset([2011, 2012, 2013, 2014])
cln_2015_2018_df = _season_subset([2015, 2016, 2017, 2018])

# Create one dataframe per single season
cln_2010_df = _season_subset([2010])
cln_2011_df = _season_subset([2011])
cln_2012_df = _season_subset([2012])
cln_2013_df = _season_subset([2013])
cln_2014_df = _season_subset([2014])
cln_2015_df = _season_subset([2015])
cln_2016_df = _season_subset([2016])
cln_2017_df = _season_subset([2017])
cln_2018_df = _season_subset([2018])

In [None]:

# 2 x 2 grid of scatter plots, one panel per season (2010-2013)
fig1, ((pl0t1, pl0t2), (pl0t3, pl0t4)) = plt.subplots(2, 2)

season_panels = [(pl0t1, cln_2010_df, "2010 stats"),
                 (pl0t2, cln_2011_df, "2011 stats"),
                 (pl0t3, cln_2012_df, "2012 stats"),
                 (pl0t4, cln_2013_df, "2013 stats")]

for panel_ax, season_df, panel_title in season_panels:
    # Columns are picked by position (5th and 6th column of the season frame)
    x_values = season_df.iloc[:, 4]
    y_values = season_df.iloc[:, 5]

    panel_ax.scatter(x_values, y_values, marker=".", edgecolors="red")
    panel_ax.set_title(panel_title)

    # Label every panel with the names of the columns actually plotted.
    # The former fixed labels ("Driving Distance - AVG" / "TOTAL DRIVES")
    # were set on the last panel only and did not name these columns.
    # NOTE(review): confirm positions 4 and 5 are the intended pair.
    panel_ax.set_xlabel(x_values.name)
    panel_ax.set_ylabel(y_values.name)

plt.tight_layout()

In [None]:
# 2 x 2 grid of scatter plots, one panel per season (2014-2017)
fig2, ((pl00t1, pl00t2), (pl00t3, pl00t4)) = plt.subplots(2, 2)

season_panels = [(pl00t1, cln_2014_df, "2014 stats"),
                 (pl00t2, cln_2015_df, "2015 stats"),
                 (pl00t3, cln_2016_df, "2016 stats"),
                 (pl00t4, cln_2017_df, "2017 stats")]

for panel_ax, season_df, panel_title in season_panels:
    # Columns are picked by position (5th and 6th column of the season frame)
    x_values = season_df.iloc[:, 4]
    y_values = season_df.iloc[:, 5]

    panel_ax.scatter(x_values, y_values, marker=".", edgecolors="red")
    panel_ax.set_title(panel_title)

    # Label every panel with the names of the columns actually plotted.
    # The former fixed labels ("Driving Distance - AVG" / "TOTAL DRIVES")
    # were set on the last panel only and did not name these columns.
    # NOTE(review): confirm positions 4 and 5 are the intended pair.
    panel_ax.set_xlabel(x_values.name)
    panel_ax.set_ylabel(y_values.name)

plt.tight_layout()

## Analyze Smash Factor

In [None]:
# BOXPLOT FOR SMASH FACTOR

var_of_interest = "Smash Factor - (AVG.)"

# One Series of the statistic per season, 2010 through 2018
smash_data_boxplot_to_plot = [
    cln_all_pga_data_df.loc[cln_all_pga_data_df["Season"] == season_year, var_of_interest]
    for season_year in range(2010, 2019)
]

# Per-season names are kept because the ANOVA cell refers to them individually
(smash_2010, smash_2011, smash_2012,
 smash_2013, smash_2014, smash_2015,
 smash_2016, smash_2017, smash_2018) = smash_data_boxplot_to_plot

# Single-axes figure
fig, ax = plt.subplots(figsize=(9, 6))

# One box per season; outliers drawn as large red circles
boxplot = ax.boxplot(smash_data_boxplot_to_plot,
                     flierprops=dict(markerfacecolor='r', marker='o', markersize=8.0))
ax.set_xticklabels(list(range(2010, 2019)))
ax.set_title(var_of_interest)
ax.set_xlabel("Season")
ax.set_ylabel("Smash Factor")
plt.show()

In [None]:
# SMASH FACTOR ONE-WAY ANOVA
# Our null hypothesis states that there are equal means in the populations from which the groups of data were sampled

smash_anova_result = st.f_oneway(smash_2010, smash_2011, smash_2012,
                                 smash_2013, smash_2014, smash_2015,
                                 smash_2016, smash_2017, smash_2018)
print(smash_anova_result)

# State the conclusion the computed p-value supports rather than a fixed
# message, so the text cannot contradict the result when the data change
if smash_anova_result.pvalue < 0.05:
    print("P-value is < 0.05, therefore there are significant differences among the means")
elif smash_anova_result.pvalue >= 0.05:
    print("P-value is >= 0.05, therefore there are NO significant differences among the means")
else:
    # NaN compares False both ways (e.g. a group was empty)
    print("P-value is undefined (NaN); the ANOVA could not be evaluated")

In [None]:
# Tukey HSD: pairwise comparison of the smash-factor means between seasons
tukey_values = cln_all_pga_data_df["Smash Factor - (AVG.)"]
tukey_groups = cln_all_pga_data_df["Season"]
m_comp = pairwise_tukeyhsd(endog=tukey_values, groups=tukey_groups, alpha=0.05)
print(m_comp)

In [None]:
# DENSITY PLOT FOR SMASH FACTOR

var_of_interest = "Smash Factor - (AVG.)"
season_lst = cln_all_pga_data_df["Season"].unique()

fig, ax = plt.subplots(figsize=(10, 8))

# One kernel-density curve per season on the shared axes.
# sns.kdeplot is used directly: sns.distplot(hist=False, kde=True) is
# deprecated in seaborn and only forwarded to kdeplot anyway.
for season in season_lst:
    smash_season = cln_all_pga_data_df.loc[cln_all_pga_data_df["Season"]==season, var_of_interest]
    sns.kdeplot(smash_season, linewidth=3, label=season, ax=ax)

# Plot formatting
ax.legend(prop={'size': 10}, title='Season')
ax.set_title("Density Plot of Smash Factor")
ax.set_xlabel("Smash Factor")
ax.set_ylabel("Density");

## Analyze Short Game Rating - (RATING)

In [None]:
# BOXPLOT FOR SHORT GAME RATING

var_of_interest = "Short Game Rating - (RATING)"

# One Series of the statistic per season, 2010 through 2018
short_gam_rat_data_boxplot_to_plot = [
    cln_all_pga_data_df.loc[cln_all_pga_data_df["Season"] == season_year, var_of_interest]
    for season_year in range(2010, 2019)
]

# Per-season names are kept because the ANOVA cell refers to them individually
(short_gam_rat_2010, short_gam_rat_2011, short_gam_rat_2012,
 short_gam_rat_2013, short_gam_rat_2014, short_gam_rat_2015,
 short_gam_rat_2016, short_gam_rat_2017, short_gam_rat_2018) = short_gam_rat_data_boxplot_to_plot

# Single-axes figure
fig, ax = plt.subplots(figsize=(9, 6))

# One box per season; outliers drawn as large red circles
boxplot = ax.boxplot(short_gam_rat_data_boxplot_to_plot,
                     flierprops=dict(markerfacecolor='r', marker='o', markersize=8.0))
ax.set_xticklabels(list(range(2010, 2019)))
ax.set_title(var_of_interest)
ax.set_xlabel("Season")
ax.set_ylabel("Short Game Rating")
plt.show()

In [None]:
# SHORT GAME RATING ONE-WAY ANOVA
# Our null hypothesis states that there are equal means in the populations from which the groups of data were sampled

short_gam_anova_result = st.f_oneway(short_gam_rat_2010, short_gam_rat_2011, short_gam_rat_2012,
                                     short_gam_rat_2013, short_gam_rat_2014, short_gam_rat_2015,
                                     short_gam_rat_2016, short_gam_rat_2017, short_gam_rat_2018)
print(short_gam_anova_result)

# State the conclusion the computed p-value supports rather than a fixed
# message, so the text cannot contradict the result when the data change
if short_gam_anova_result.pvalue < 0.05:
    print("P-value is < 0.05, therefore there are significant differences among the means")
elif short_gam_anova_result.pvalue >= 0.05:
    print("P-value is >= 0.05, therefore there are NO significant differences among the means")
else:
    # NaN compares False both ways (e.g. a group was empty)
    print("P-value is undefined (NaN); the ANOVA could not be evaluated")

In [None]:
# Tukey HSD: pairwise comparison of the short-game-rating means between seasons
tukey_values = cln_all_pga_data_df["Short Game Rating - (RATING)"]
tukey_groups = cln_all_pga_data_df["Season"]
m_comp = pairwise_tukeyhsd(endog=tukey_values, groups=tukey_groups, alpha=0.05)
print(m_comp)

In [None]:
# DENSITY PLOT FOR SHORT GAME RATING

var_of_interest = "Short Game Rating - (RATING)"
season_lst = cln_all_pga_data_df["Season"].unique()

fig, ax = plt.subplots(figsize=(10, 8))

# One kernel-density curve per season on the shared axes.
# sns.kdeplot is used directly: sns.distplot(hist=False, kde=True) is
# deprecated in seaborn and only forwarded to kdeplot anyway.
for season in season_lst:
    short_gam_season = cln_all_pga_data_df.loc[cln_all_pga_data_df["Season"]==season, var_of_interest]
    sns.kdeplot(short_gam_season, linewidth=3, label=season, ax=ax)

# Plot formatting
ax.legend(prop={'size': 10}, title='Season')
ax.set_title("Density Plot of Short Game Rating")
ax.set_xlabel("Short Game Rating")
ax.set_ylabel("Density");

## Analyze Proximity to Hole (ARG) - (AVG DTP) (IN)

In [None]:
# BOXPLOT FOR PROXIMITY TO HOLE (ARG)

var_of_interest = "Proximity to Hole (ARG) - (AVG DTP) (IN)"

# One Series of the statistic per season, 2010 through 2018
prox_hole_data_boxplot_to_plot = [
    cln_all_pga_data_df.loc[cln_all_pga_data_df["Season"] == season_year, var_of_interest]
    for season_year in range(2010, 2019)
]

# Per-season names are kept because the ANOVA cell refers to them individually
(prox_hole_2010, prox_hole_2011, prox_hole_2012,
 prox_hole_2013, prox_hole_2014, prox_hole_2015,
 prox_hole_2016, prox_hole_2017, prox_hole_2018) = prox_hole_data_boxplot_to_plot

# Single-axes figure
fig, ax = plt.subplots(figsize=(9, 6))

# One box per season; outliers drawn as large red circles
boxplot = ax.boxplot(prox_hole_data_boxplot_to_plot,
                     flierprops=dict(markerfacecolor='r', marker='o', markersize=8.0))
ax.set_xticklabels(list(range(2010, 2019)))
ax.set_title(var_of_interest)
ax.set_xlabel("Season")
ax.set_ylabel("Proximity to Hole (Inches)")
plt.show()

In [None]:
# PROXIMITY TO HOLE ONE-WAY ANOVA
# Our null hypothesis states that there are equal means in the populations from which the groups of data were sampled

prox_hole_anova_result = st.f_oneway(prox_hole_2010, prox_hole_2011, prox_hole_2012,
                                     prox_hole_2013, prox_hole_2014, prox_hole_2015,
                                     prox_hole_2016, prox_hole_2017, prox_hole_2018)
print(prox_hole_anova_result)

# State the conclusion the computed p-value supports rather than a fixed
# message, so the text cannot contradict the result when the data change
if prox_hole_anova_result.pvalue < 0.05:
    print("P-value is < 0.05, therefore there are significant differences among the means")
elif prox_hole_anova_result.pvalue >= 0.05:
    print("P-value is >= 0.05, therefore there are NO significant differences among the means")
else:
    # NaN compares False both ways (e.g. a group was empty)
    print("P-value is undefined (NaN); the ANOVA could not be evaluated")

In [None]:
# Tukey HSD: pairwise comparison of the proximity-to-hole means between seasons
tukey_values = cln_all_pga_data_df["Proximity to Hole (ARG) - (AVG DTP) (IN)"]
tukey_groups = cln_all_pga_data_df["Season"]
m_comp = pairwise_tukeyhsd(endog=tukey_values, groups=tukey_groups, alpha=0.05)
print(m_comp)

In [None]:
# DENSITY PLOT FOR PROXIMITY TO HOLE (ARG)

var_of_interest = "Proximity to Hole (ARG) - (AVG DTP) (IN)"
season_lst = cln_all_pga_data_df["Season"].unique()

fig, ax = plt.subplots(figsize=(10, 8))

# One kernel-density curve per season on the shared axes.
# sns.kdeplot is used directly: sns.distplot(hist=False, kde=True) is
# deprecated in seaborn and only forwarded to kdeplot anyway.
for season in season_lst:
    prox_hole_season = cln_all_pga_data_df.loc[cln_all_pga_data_df["Season"]==season, var_of_interest]
    sns.kdeplot(prox_hole_season, linewidth=3, label=season, ax=ax)

# Plot formatting
ax.legend(prop={'size': 10}, title='Season')
ax.set_title("Density Plot of Proximity to Hole")
ax.set_xlabel("Proximity to Hole (Inches)")
ax.set_ylabel("Density");

## Analyze Putting Average

In [None]:
# BOXPLOT FOR PUTTING AVERAGE

var_of_interest = "Putting Average - (AVG)"

# One Series of the statistic per season, 2010 through 2018
putt_avg_data_boxplot_to_plot = [
    cln_all_pga_data_df.loc[cln_all_pga_data_df["Season"] == season_year, var_of_interest]
    for season_year in range(2010, 2019)
]

# Per-season names are kept because the ANOVA cell refers to them individually
(putt_avg_2010, putt_avg_2011, putt_avg_2012,
 putt_avg_2013, putt_avg_2014, putt_avg_2015,
 putt_avg_2016, putt_avg_2017, putt_avg_2018) = putt_avg_data_boxplot_to_plot

# Single-axes figure
fig, ax = plt.subplots(figsize=(9, 6))

# One box per season; outliers drawn as large red circles
boxplot = ax.boxplot(putt_avg_data_boxplot_to_plot,
                     flierprops=dict(markerfacecolor='r', marker='o', markersize=8.0))
ax.set_xticklabels(list(range(2010, 2019)))
ax.set_title(var_of_interest)
ax.set_xlabel("Season")
ax.set_ylabel("Putting Average")
plt.show()

In [None]:
# PUTTING AVERAGE ONE-WAY ANOVA
# Our null hypothesis states that there are equal means in the populations from which the groups of data were sampled

putt_avg_anova_result = st.f_oneway(putt_avg_2010, putt_avg_2011, putt_avg_2012,
                                    putt_avg_2013, putt_avg_2014, putt_avg_2015,
                                    putt_avg_2016, putt_avg_2017, putt_avg_2018)
print(putt_avg_anova_result)

# State the conclusion the computed p-value supports rather than a fixed
# message, so the text cannot contradict the result when the data change
if putt_avg_anova_result.pvalue < 0.05:
    print("P-value is < 0.05, therefore there are significant differences among the means")
elif putt_avg_anova_result.pvalue >= 0.05:
    print("P-value is >= 0.05, therefore there are NO significant differences among the means")
else:
    # NaN compares False both ways (e.g. a group was empty)
    print("P-value is undefined (NaN); the ANOVA could not be evaluated")

In [None]:
# Tukey HSD: pairwise comparison of the putting-average means between seasons
tukey_values = cln_all_pga_data_df["Putting Average - (AVG)"]
tukey_groups = cln_all_pga_data_df["Season"]
m_comp = pairwise_tukeyhsd(endog=tukey_values, groups=tukey_groups, alpha=0.05)
print(m_comp)

In [None]:
# DENSITY PLOT FOR PUTTING AVERAGE

var_of_interest = "Putting Average - (AVG)"
season_lst = cln_all_pga_data_df["Season"].unique()

fig, ax = plt.subplots(figsize=(10, 8))

# One kernel-density curve per season on the shared axes.
# sns.kdeplot is used directly: sns.distplot(hist=False, kde=True) is
# deprecated in seaborn and only forwarded to kdeplot anyway.
for season in season_lst:
    putt_avg_season = cln_all_pga_data_df.loc[cln_all_pga_data_df["Season"]==season, var_of_interest]
    sns.kdeplot(putt_avg_season, linewidth=3, label=season, ax=ax)

# Plot formatting
ax.legend(prop={'size': 10}, title='Season')
ax.set_title("Density Plot of Putting Average")
ax.set_xlabel("Putting Average")
ax.set_ylabel("Density");

## Scatter Matrix - Look for any obvious correlations and trends

In [None]:
# Pairwise scatter plots of the five performance statistics
scatter_matrix_columns = [
    "Driving Distance - (AVG.)",
    "Smash Factor - (AVG.)",
    "Short Game Rating - (RATING)",
    "Proximity to Hole (ARG) - (AVG DTP) (IN)",
    "Putting Average - (AVG)",
]
scatter_matrix_data = cln_all_pga_data_df[scatter_matrix_columns]
pd.plotting.scatter_matrix(scatter_matrix_data, figsize = (20, 20));


In [None]:
# Correlation matrix of the five performance statistics, rendered as a
# colour-graded table
corr_data_df = cln_all_pga_data_df[["Driving Distance - (AVG.)",
                                    "Smash Factor - (AVG.)",
                                    "Short Game Rating - (RATING)",
                                    "Proximity to Hole (ARG) - (AVG DTP) (IN)",
                                    "Putting Average - (AVG)"]].copy()

# Pairwise correlations, rounded to three decimals
corr_matrix = corr_data_df.corr().round(3)

# Styler.format with a format string replaces Styler.set_precision, which was
# deprecated and then removed from pandas; cells are shown with two decimals.
corr_matrix.style.background_gradient(cmap= 'viridis' , axis=1)\
    .set_properties(**{'max-width': '80px', 'font-size': '12pt'})\
    .format("{:.2f}")


In [None]:
def statistic_ave_top_bottom_over_seasons(base_df, col_name, title='', ylabel='', format_prefix='', format_set='{:,.0f}', top_n=10):
    """Plot, per season, the mean of one statistic for all, top-N and bottom-N rows.

    For every distinct value of ``base_df["Season"]`` three means of
    ``col_name`` are computed and drawn as three lines with ``sns.lineplot``:

    * "All Players"      - mean over every row of that season,
    * "Top N Players"    - mean of the ``top_n`` largest values,
    * "Bottom N Players" - mean of the ``top_n`` smallest values.

    "Top" and "bottom" refer purely to the numeric value of ``col_name``;
    whether a large value is good or bad depends on the statistic.

    Parameters
    ----------
    base_df : pandas.DataFrame
        Must contain a "Season" column and the column named by ``col_name``.
    col_name : str
        Name of the numeric column to summarise.
    title, ylabel : str
        Title and y-axis label of the chart.
    format_prefix : str
        Text placed in front of every y tick label (e.g. "$").
    format_set : str
        ``str.format`` template applied to every y tick value.
    top_n : int, default 10
        Number of rows in the top and bottom groups.

    Returns
    -------
    None
        The function only draws the chart.

    Raises
    ------
    ValueError
        If ``top_n`` is smaller than 1.
    """
    # matplotlib is already a dependency of this notebook (pyplot is imported
    # at the top); the formatter class is only needed inside this function.
    from matplotlib.ticker import FuncFormatter

    if top_n < 1:
        raise ValueError("top_n must be at least 1")

    top_label = f"Top {top_n} Players"
    bottom_label = f"Bottom {top_n} Players"

    # One summary row per season. Keying each row by its own season keeps the
    # three series aligned even when a season is absent from the data.
    season_rows = []
    for season, season_values in base_df.groupby("Season")[col_name]:
        ranked = season_values.dropna().sort_values()  # ascending order
        season_rows.append({
            "Season": season,
            # All three means use the same population: every row of the season.
            "All Players": season_values.mean(),
            # head/tail take exactly top_n rows (a slice of 0:9 would take only 9).
            top_label: ranked.tail(top_n).mean(),
            bottom_label: ranked.head(top_n).mean(),
        })

    ave_line_graph = pd.DataFrame(
        season_rows,
        columns=["Season", "All Players", top_label, bottom_label],
    ).set_index("Season")

    ax = sns.lineplot(data=ave_line_graph, markers=True)
    ax.set(xlabel='Season', ylabel=ylabel, title=title)

    # Format y tick labels through a formatter so the labels always match the
    # ticks that are finally drawn, instead of freezing labels for early ticks.
    ax.yaxis.set_major_formatter(
        FuncFormatter(lambda tick_value, _pos: format_prefix + format_set.format(tick_value))
    )
    ax.legend(loc='center right', bbox_to_anchor=(1.45, 0.5), ncol=1)


In [None]:
# Earnings per season: all players vs. highest and lowest earners
statistic_ave_top_bottom_over_seasons(
    cln_all_pga_data_df,
    "Total Money (Official and Unofficial) - (MONEY)",
    title='Average Money Per Season for PGA Players',
    ylabel='Average Earnings',
    format_prefix="$",
)

In [None]:
# Proximity to hole (around the green) per season
statistic_ave_top_bottom_over_seasons(
    cln_all_pga_data_df,
    "Proximity to Hole (ARG) - (AVG DTP) (IN)",
    title='Average Proximity to Hole (ARG) Per Season for PGA Players',
    ylabel="Average of Each Player's Proximity to Hole (ARG) in Inches",
)

In [None]:
# Driving distance per season
statistic_ave_top_bottom_over_seasons(
    cln_all_pga_data_df,
    "Driving Distance - (AVG.)",
    title='Average Drive Per Season for PGA Players',
    ylabel="Average of Each Player's Average Drive",
)

In [None]:
# Smash factor per season (two-decimal tick labels)
statistic_ave_top_bottom_over_seasons(
    cln_all_pga_data_df,
    "Smash Factor - (AVG.)",
    title='Average Smash Factor Per Season for PGA Players',
    ylabel='Smash Factor',
    format_set='{:,.2f}',
)

In [None]:
# Putting average per season (two-decimal tick labels)
statistic_ave_top_bottom_over_seasons(
    cln_all_pga_data_df,
    "Putting Average - (AVG)",
    title='Average Putting Score Per Season for PGA Players',
    ylabel='Putting Average',
    format_set='{:,.2f}',
)

In [None]:
# Draws a scatter plot of two columns with a fitted regression line and
# prints the r-squared of the fit. The chart is shown, not saved to disk.
def linear_regression(base_df, col_x, col_y, title):
    """Scatter ``col_y`` against ``col_x`` with a least-squares line and print r-squared.

    Parameters
    ----------
    base_df : pandas.DataFrame
        Must contain "Player Name", ``col_x`` and ``col_y``.
    col_x, col_y : str
        Names of the numeric columns used as x and y; also used as axis labels.
    title : str
        Figure title.

    Returns
    -------
    None
        The chart is shown with ``plt.show()`` and r-squared is printed.
    """
    # Keep only the first row found for each player, then discard rows that
    # lack either value: a single NaN would make linregress return NaN for
    # every statistic, leaving an empty line and a NaN r-squared.
    just_players_all = (
        base_df
        .drop_duplicates(subset="Player Name", keep="first")
        .dropna(subset=[col_x, col_y])
    )
    x_values = just_players_all[col_x]
    y_values = just_players_all[col_y]

    # Least-squares fit of y on x
    fit = linregress(x_values, y_values)
    fitted_y = fit.slope * x_values + fit.intercept

    fig, ax = plt.subplots()
    ax.scatter(x_values, y_values)
    ax.plot(x_values, fitted_y, "--", color="red")  # regression line
    fig.suptitle(title, fontsize=16, fontweight="bold")
    ax.set_xlabel(col_x)
    ax.set_ylabel(col_y)
    plt.show()

    # prints r-squared value
    print(f"The r-squared for the chart above is: {fit.rvalue * fit.rvalue}")

In [None]:
# Money earned vs. putting average
linear_regression(
    base_df=cln_all_pga_data_df,
    col_x="Total Money (Official and Unofficial) - (MONEY)",
    col_y="Putting Average - (AVG)",
    title="Correlation of Money Earned versus Putting Average",
)

In [None]:
# Money earned vs. proximity to hole (around the green)
linear_regression(
    base_df=cln_all_pga_data_df,
    col_x="Total Money (Official and Unofficial) - (MONEY)",
    col_y="Proximity to Hole (ARG) - (AVG DTP) (IN)",
    title="Correlation of Money Earned versus Proximity to Hole (ARG)",
)

In [None]:
# Money earned vs. smash factor
linear_regression(
    base_df=cln_all_pga_data_df,
    col_x="Total Money (Official and Unofficial) - (MONEY)",
    col_y="Smash Factor - (AVG.)",
    title="Correlation of Money Earned versus Smash Factor",
)

In [None]:
# Money earned vs. driving distance
linear_regression(
    base_df=cln_all_pga_data_df,
    col_x="Total Money (Official and Unofficial) - (MONEY)",
    col_y="Driving Distance - (AVG.)",
    title="Correlation of Money Earned versus Driving Distance",
)

## Examine 3 players who played all 9 seasons 2010-2018

In [None]:
# Compare three players across six statistics, one panel per statistic.

# Notebook-level name kept as in the original cell; the panels below plot
# each player's own "Season" values rather than this list.
seasons = list(cln_all_pga_data_df["Season"].unique())
phil_data_df = cln_all_pga_data_df.loc[cln_all_pga_data_df["Player Name"] == "Phil Mickelson"]
jj_data_df = cln_all_pga_data_df.loc[cln_all_pga_data_df["Player Name"] == "J.J. Henry"]
dustin_data_df = cln_all_pga_data_df.loc[cln_all_pga_data_df["Player Name"] == "Dustin Johnson"]

# (legend label, rows for that player), in legend order
player_frames = [
    ("Phil Mickelson", phil_data_df),
    ("J.J. Henry", jj_data_df),
    ("Dustin Johnson", dustin_data_df),
]

# (column to plot, y-axis label) for each panel, filled row by row on a 2x3 grid
stat_panels = [
    ("Driving Distance - (AVG.)", "Yards"),
    ("Smash Factor - (AVG.)", "Smash Factor"),
    ("Short Game Rating - (RATING)", "Short Game Rating"),
    ("Proximity to Hole (ARG) - (AVG DTP) (IN)", "Inches"),
    ("Putting Average - (AVG)", "Putts"),
    ("Total Money (Official and Unofficial) - (MONEY)", "Total Money"),
]

fig, axes = plt.subplots(2, 3, figsize=(30, 15))

for ax, (stat_col, y_label) in zip(axes.flat, stat_panels):
    for player_name, player_df in player_frames:
        # Use the player's own seasons as x so each value is drawn at the
        # season it belongs to, regardless of row order or a missing season.
        player_by_season = player_df.sort_values("Season")
        ax.plot(player_by_season["Season"], player_by_season[stat_col], label=player_name)
    ax.set(title=stat_col, xlabel="Season", ylabel=y_label)
    ax.legend(loc="best")
    ax.grid()

plt.show()