In [1]:
#import packages
import pandas as pd
import numpy as np
import matplotlib as mpt
import matplotlib.pyplot as plt
import seaborn as sns 
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.graphics.gofplots import ProbPlot
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn import metrics
from sklearn.linear_model import LinearRegression


In [2]:
# Making a dataframe of current Pros that compete in the open male division (MPO)
# and of the tournament we want to predict.

mpo = pd.read_csv('MPO Stats.csv')
mh21 = pd.read_csv('Maple Hill 21.csv')

In [3]:
mpo

Unnamed: 0,Name,Division,Events,Wins,Win %,Podiums,Podium %,Top 10,Top 10 %,Rounds,Cash Prizes,Cash Prize %,Throws,Cash/Throw,Total Earnings
0,Ezra Aderhold,MPO,69,7,10.14%,17,24.64%,31,44.93%,202,60,86.96%,11795,$3.14,"$37,016"
1,Josh Anthon,MPO,221,50,22.62%,100,45.25%,178,80.54%,851,210,95.02%,45887,$3.38,"$155,170"
2,Niklas Anttila,MPO,54,16,29.63%,29,53.70%,48,88.89%,162,31,57.41%,9440,$1.51,"$14,288"
3,Dion Arlyn,MPO,205,37,18.05%,86,41.95%,156,76.10%,618,165,80.49%,34528,$1.16,"$39,987"
4,Anthony Barela,MPO,157,29,18.47%,54,34.39%,101,64.33%,496,90,57.32%,28802,$1.96,"$56,530"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,Knut Valen Håland,MPO,67,22,32.84%,37,55.22%,55,82.09%,228,37,55.22%,13025,$1.38,"$17,987"
85,Dana Vicich,MPO,252,23,9.13%,69,27.38%,150,59.52%,705,173,68.65%,41830,$1.13,"$47,345"
86,Casey White,MPO,124,17,13.71%,40,32.26%,68,54.84%,343,81,65.32%,20129,$1.84,"$36,982"
87,Scott Withers,MPO,259,119,45.95%,173,66.80%,219,84.56%,721,249,96.14%,38687,$2.50,"$96,666"


In [4]:
mh21.head(10)

Unnamed: 0,Place,Points,Name,PDGA#,Rating,Par,Rd1,Rd2,Rd3,Total,Prize
0,1,1440,Adam Hammes,57365,1038,-22,54,53,51,158,"$7,500"
1,2,1430,Eagle Wynne McMahon,37817,1055,-19,51,52,58,161,"$3,350"
2,2,1430,Richard Wysocki,38008,1055,-19,54,54,53,161,"$3,350"
3,4,1410,Calvin Heimburg,45971,1049,-16,57,52,55,164,"$2,097"
4,4,1410,Matt Bell,48950,1022,-16,58,52,54,164,"$2,097"
5,4,1410,Matthew Orum,18330,1039,-16,55,53,56,164,"$2,097"
6,7,1380,Andrew Marwede,75590,1025,-14,55,55,56,166,"$1,750"
7,7,1380,Joel Freeman,69509,1030,-14,54,53,59,166,"$1,750"
8,9,1360,Benjamin Callaway,39015,1022,-13,55,54,58,167,"$1,550"
9,9,1360,Thomas Gilbert,85850,1022,-13,57,60,50,167,"$1,550"


In [5]:
# We want to try to predict who will be in the top 4.

mh21.loc[mh21.Place < 5]

Unnamed: 0,Place,Points,Name,PDGA#,Rating,Par,Rd1,Rd2,Rd3,Total,Prize
0,1,1440,Adam Hammes,57365,1038,-22,54,53,51,158,"$7,500"
1,2,1430,Eagle Wynne McMahon,37817,1055,-19,51,52,58,161,"$3,350"
2,2,1430,Richard Wysocki,38008,1055,-19,54,54,53,161,"$3,350"
3,4,1410,Calvin Heimburg,45971,1049,-16,57,52,55,164,"$2,097"
4,4,1410,Matt Bell,48950,1022,-16,58,52,54,164,"$2,097"
5,4,1410,Matthew Orum,18330,1039,-16,55,53,56,164,"$2,097"


In [6]:
# The names in our dataframes weren't all the same, so I made a new column with the 
# first initial and last name in each dataframe.

mposplitnames = mpo['Name'].str.split(' ')
mpo['last_name'] = mposplitnames.str[-1]
mpo['first_name'] = mposplitnames.str[0]
mposplitfirst = mpo['first_name'].str.split('')
mpo['first_inital'] = mposplitfirst.str[1]
mpo['Name2']=mpo['first_inital'] + '.' + ' ' + mpo['last_name']
mpo = mpo.drop(['last_name', 'first_name', 'first_inital'], axis = 1)

mhsplitnames = mh21['Name'].str.split(' ')
mh21['last_name'] = mhsplitnames.str[-1]
mh21['first_name'] = mhsplitnames.str[0]
mh21splitfirst = mh21['first_name'].str.split('')
mh21['first_inital'] = mh21splitfirst.str[1]
mh21['Name2']=mh21['first_inital'] + '.' + ' ' + mh21['last_name']
mh21 = mh21.drop(['last_name', 'first_name', 'first_inital'], axis = 1)

In [7]:
# Making a new dataframe with just the competetors of the tournament.

mh21merged0 = mpo.merge(mh21.Name2, how = 'inner', on = 'Name2')

In [8]:
# Adding 'Place' to the previous datframe in order to get correlations.
mh21merged1 = mh21merged0.merge(mh21, how = 'inner', on = 'Name2')
mh21merged1.columns

Index(['Name_x', 'Division', 'Events', 'Wins', 'Win %', 'Podiums', 'Podium %',
       'Top 10', 'Top 10 %', 'Rounds', 'Cash Prizes', 'Cash Prize %', 'Throws',
       'Cash/Throw', 'Total Earnings', 'Name2', 'Place', 'Points', 'Name_y',
       'PDGA#', 'Rating', 'Par', 'Rd1', 'Rd2', 'Rd3', 'Total', 'Prize'],
      dtype='object')

In [9]:
mh21merged1.head(10)

Unnamed: 0,Name_x,Division,Events,Wins,Win %,Podiums,Podium %,Top 10,Top 10 %,Rounds,...,Points,Name_y,PDGA#,Rating,Par,Rd1,Rd2,Rd3,Total,Prize
0,Ezra Aderhold,MPO,69,7,10.14%,17,24.64%,31,44.93%,202,...,1200,Ezra Aderhold,121715,1024,-7,61,60,52,173,$638
1,Gregg Barsby,MPO,431,74,17.17%,172,39.91%,286,66.36%,1495,...,1330,Gregg Barsby,15857,1024,-11,57,54,58,169,"$1,110"
2,Philo Brathwaite,MPO,337,38,11.28%,114,33.83%,196,58.16%,1135,...,990,Philo Brathwaite,26416,1021,-2,62,59,57,178,$355
3,Steve Brinster,MPO,408,107,26.23%,199,48.77%,313,76.72%,1360,...,1050,Steve Brinster,10628,1019,-3,61,59,57,177,$405
4,Lance Brown,MPO,281,18,6.41%,67,23.84%,174,61.92%,859,...,1280,Lance Brown,58644,995,-10,59,57,54,170,$844
5,Benjamin Callaway,MPO,176,27,15.34%,53,30.11%,113,64.20%,528,...,1360,Benjamin Callaway,39015,1022,-13,55,54,58,167,"$1,550"
6,AJ Carey,MPO,228,20,8.77%,60,26.32%,125,54.82%,662,...,880,AJ Carey,61770,1009,E,60,63,57,180,$300
7,Chris Clemons,MPO,172,30,17.44%,63,36.63%,99,57.56%,496,...,1200,Chris Clemons,50401,1028,-7,58,59,56,173,$638
8,James Conrad,MPO,251,31,12.35%,85,33.86%,182,72.51%,780,...,1240,James Conrad,17295,1037,-9,60,53,58,171,$750
9,Chris Dickerson,MPO,289,149,51.56%,211,73.01%,262,90.66%,821,...,1280,Chris Dickerson,62467,1045,-10,58,55,57,170,$844


In [10]:
# Removing unneeded columns.
mh21merged = mh21merged1.drop(['Points', 'Name_y', 'PDGA#', 'Rating', 'Par', 'Rd1', 'Rd2', 'Rd3', 'Total', 'Prize'], axis = 1)

In [11]:
correlations = mh21merged.corr().abs() # Absolute value to sort.
sort = correlations.unstack().sort_values(kind="quicksort", ascending = False).to_frame(name="corr").reset_index()
#unstack changes from matrix to pairs.
sort[sort.level_0 == 'Place']

  correlations = mh21merged.corr().abs() # Absolute value to sort.


Unnamed: 0,level_0,level_1,corr
5,Place,Place,1.0
27,Place,Wins,0.29343
29,Place,Podiums,0.290783
31,Place,Top 10,0.258999
33,Place,Cash Prizes,0.20588
35,Place,Events,0.133489


In [12]:
# Since Place is correlated the highest with these 4 variables, I'll sort by them.
# The ones at the top should be most likely to be in the top 4 of the tournament.
mh21merged0.sort_values(['Wins', 'Podiums', 'Top 10', 'Cash Prizes'], ascending=False).head(8)

Unnamed: 0,Name,Division,Events,Wins,Win %,Podiums,Podium %,Top 10,Top 10 %,Rounds,Cash Prizes,Cash Prize %,Throws,Cash/Throw,Total Earnings,Name2
9,Chris Dickerson,MPO,289,149,51.56%,211,73.01%,262,90.66%,821,266,92.04%,43666,$4.84,"$211,171",C. Dickerson
25,Cale Leiviska,MPO,406,142,34.98%,257,63.30%,347,85.47%,1246,399,98.28%,67423,$4.19,"$282,683",C. Leiviska
28,Paul McBeth,MPO,357,133,37.25%,244,68.35%,323,90.48%,1262,328,91.88%,68064,$8.16,"$555,450",P. McBeth
26,Nikko Locastro,MPO,446,122,27.35%,238,53.36%,355,79.60%,1522,408,91.48%,82732,$4.17,"$345,334",N. Locastro
39,Richard Wysocki,MPO,354,117,33.05%,235,66.38%,321,90.68%,1184,311,87.85%,64803,$7.19,"$465,868",R. Wysocki
37,Paul Ulibarri,MPO,515,107,20.78%,234,45.44%,394,76.50%,1649,488,94.76%,92465,$3.28,"$302,922",P. Ulibarri
3,Steve Brinster,MPO,408,107,26.23%,199,48.77%,313,76.72%,1360,381,93.38%,75767,$2.84,"$215,086",S. Brinster
22,Emerson Keith,MPO,299,82,27.42%,153,51.17%,230,76.92%,723,228,76.25%,39768,$2.73,"$108,542",E. Keith


We can see that only 1 of the competitors who got top 4 in the tournament is on our sorted list.

In [13]:
# Attempting with another tournament.

worlds21 = pd.read_csv('Worlds 21.csv')

In [14]:
# We want to try to predict who will be in the top 4.

worlds21.loc[worlds21.Place < 5]

Unnamed: 0,Place,Points,Name,PDGA#,Rating,Par,Rd1,Rd2,Rd3,Rd4,Finals,Total,Prize
0,1,3135,James Conrad,17295,1034,-39,50,56,52,54,54,266,"$16,500"
1,2,3120,Paul McBeth,27523,1051,-39,51,55,52,53,55,266,"$10,000"
2,3,3105,Nathan Sexton,18824,1033,-35,48,58,48,59,57,270,"$8,500"
3,4,3090,Kevin Jones,41760,1033,-33,49,57,51,55,60,272,"$6,500"
4,4,3090,Chris Dickerson,62467,1045,-33,53,54,53,54,58,272,"$6,500"


In [15]:
worlds21splitnames = worlds21['Name'].str.split(' ')
worlds21['last_name'] = worlds21splitnames.str[-1]
worlds21['first_name'] = worlds21splitnames.str[0]
worlds21splitfirst = worlds21['first_name'].str.split('')
worlds21['first_inital'] = worlds21splitfirst.str[1]
worlds21['Name2']=worlds21['first_inital'] + '.' + ' ' + worlds21['last_name']
worlds21 = worlds21.drop(['last_name', 'first_name', 'first_inital'], axis = 1)

In [16]:
# Making a new dataframe with just the competetors of the tournament.

worlds21merged0 = mpo.merge(worlds21.Name2, how = 'inner', on = 'Name2')

In [17]:
# Adding 'Place' to the previous datframe in order to get correlations.
worlds21merged1 = worlds21merged0.merge(worlds21, how = 'inner', on = 'Name2')
worlds21merged1.columns

Index(['Name_x', 'Division', 'Events', 'Wins', 'Win %', 'Podiums', 'Podium %',
       'Top 10', 'Top 10 %', 'Rounds', 'Cash Prizes', 'Cash Prize %', 'Throws',
       'Cash/Throw', 'Total Earnings', 'Name2', 'Place', 'Points', 'Name_y',
       'PDGA#', 'Rating', 'Par', 'Rd1', 'Rd2', 'Rd3', 'Rd4', 'Finals', 'Total',
       'Prize'],
      dtype='object')

In [18]:
# Removing unneeded columns.
worlds21merged = worlds21merged1.drop(['Points', 'Name_y', 'PDGA#', 'Rating', 'Par', 'Rd1', 'Rd2', 'Rd3', 'Rd4', 'Finals', 'Total', 'Prize'], axis = 1)

In [19]:
correlations = worlds21merged.corr().abs() # Absolute value to sort.
sort = correlations.unstack().sort_values(kind="quicksort", ascending = False).to_frame(name="corr").reset_index()
#unstack changes from matrix to pairs.
sort[sort.level_0 == 'Place']

  correlations = worlds21merged.corr().abs() # Absolute value to sort.


Unnamed: 0,level_0,level_1,corr
5,Place,Place,1.0
27,Place,Wins,0.41083
29,Place,Podiums,0.374304
31,Place,Top 10,0.314207
33,Place,Cash Prizes,0.234412
35,Place,Events,0.168963


In [20]:
# We see the same 4 variables.
worlds21merged0.sort_values(['Wins', 'Podiums', 'Top 10', 'Cash Prizes'], ascending=False).head(8)

Unnamed: 0,Name,Division,Events,Wins,Win %,Podiums,Podium %,Top 10,Top 10 %,Rounds,Cash Prizes,Cash Prize %,Throws,Cash/Throw,Total Earnings,Name2
10,Chris Dickerson,MPO,289,149,51.56%,211,73.01%,262,90.66%,821,266,92.04%,43666,$4.84,"$211,171",C. Dickerson
29,Cale Leiviska,MPO,406,142,34.98%,257,63.30%,347,85.47%,1246,399,98.28%,67423,$4.19,"$282,683",C. Leiviska
32,Paul McBeth,MPO,357,133,37.25%,244,68.35%,323,90.48%,1262,328,91.88%,68064,$8.16,"$555,450",P. McBeth
31,Nikko Locastro,MPO,446,122,27.35%,238,53.36%,355,79.60%,1522,408,91.48%,82732,$4.17,"$345,334",N. Locastro
50,Scott Withers,MPO,259,119,45.95%,173,66.80%,219,84.56%,721,249,96.14%,38687,$2.50,"$96,666",S. Withers
51,Richard Wysocki,MPO,354,117,33.05%,235,66.38%,321,90.68%,1184,311,87.85%,64803,$7.19,"$465,868",R. Wysocki
48,Paul Ulibarri,MPO,515,107,20.78%,234,45.44%,394,76.50%,1649,488,94.76%,92465,$3.28,"$302,922",P. Ulibarri
44,Nate Sexton,MPO,313,89,28.43%,174,55.59%,255,81.47%,1075,294,93.93%,58382,$3.52,"$205,465",N. Sexton


Here we got 3 of the 5 finalists.

In [21]:
# Let's try one more tournament.
gmc = pd.read_csv('GMC 21.csv')

In [22]:
# We want to try to predict who will be in the top 4.

gmc.loc[gmc.Place < 5]

Unnamed: 0,Place,Points,Name,PDGA#,Rating,Par,Rd1,Rd2,Rd3,Finals,Total,Prize
0,1,1340,Chris Dickerson,62467,1045,-41,52,51,48,56,207,"$6,000"
1,2,1330,Richard Wysocki,38008,1055,-38,53,52,51,54,210,"$3,500"
2,3,1320,Andrew Presnell,63765,1027,-30,52,54,51,61,218,"$2,165"
3,3,1320,Paul McBeth,27523,1053,-30,53,58,53,54,218,"$2,165"


In [23]:
gmcsplitnames = gmc['Name'].str.split(' ')
gmc['last_name'] = gmcsplitnames.str[-1]
gmc['first_name'] = gmcsplitnames.str[0]
gmcsplitfirst = gmc['first_name'].str.split('')
gmc['first_inital'] = gmcsplitfirst.str[1]
gmc['Name2']=gmc['first_inital'] + '.' + ' ' + gmc['last_name']
gmc = gmc.drop(['last_name', 'first_name', 'first_inital'], axis = 1)

In [24]:
# Making a new dataframe with just the competetors of the tournament.

gmcmerged0 = mpo.merge(gmc.Name2, how = 'inner', on = 'Name2')

In [25]:
# Adding 'Place' to the previous datframe in order to get correlations.
gmcmerged1 = gmcmerged0.merge(gmc, how = 'inner', on = 'Name2')
gmcmerged1.columns

Index(['Name_x', 'Division', 'Events', 'Wins', 'Win %', 'Podiums', 'Podium %',
       'Top 10', 'Top 10 %', 'Rounds', 'Cash Prizes', 'Cash Prize %', 'Throws',
       'Cash/Throw', 'Total Earnings', 'Name2', 'Place', 'Points', 'Name_y',
       'PDGA#', 'Rating', 'Par', 'Rd1', 'Rd2', 'Rd3', 'Finals', 'Total',
       'Prize'],
      dtype='object')

In [26]:
# Removing unneeded columns.
gmcmerged = gmcmerged1.drop(['Points', 'Name_y', 'PDGA#', 'Rating', 'Par', 'Rd1', 'Rd2', 'Rd3', 'Finals', 'Total', 'Prize'], axis = 1)

In [27]:
correlations = gmcmerged.corr().abs() # Absolute value to sort.
sort = correlations.unstack().sort_values(kind="quicksort", ascending = False).to_frame(name="corr").reset_index()
#unstack changes from matrix to pairs.
sort[sort.level_0 == 'Place']

  correlations = gmcmerged.corr().abs() # Absolute value to sort.


Unnamed: 0,level_0,level_1,corr
5,Place,Place,1.0
27,Place,Wins,0.383278
29,Place,Podiums,0.359664
31,Place,Top 10,0.318055
33,Place,Cash Prizes,0.248087
35,Place,Events,0.217677


In [28]:
# Again, we see the same 4 variables.
gmcmerged0.sort_values(['Wins', 'Podiums', 'Top 10', 'Cash Prizes'], ascending=False).head(8)

Unnamed: 0,Name,Division,Events,Wins,Win %,Podiums,Podium %,Top 10,Top 10 %,Rounds,Cash Prizes,Cash Prize %,Throws,Cash/Throw,Total Earnings,Name2
7,Chris Dickerson,MPO,289,149,51.56%,211,73.01%,262,90.66%,821,266,92.04%,43666,$4.84,"$211,171",C. Dickerson
22,Cale Leiviska,MPO,406,142,34.98%,257,63.30%,347,85.47%,1246,399,98.28%,67423,$4.19,"$282,683",C. Leiviska
26,Paul McBeth,MPO,357,133,37.25%,244,68.35%,323,90.48%,1262,328,91.88%,68064,$8.16,"$555,450",P. McBeth
24,Nikko Locastro,MPO,446,122,27.35%,238,53.36%,355,79.60%,1522,408,91.48%,82732,$4.17,"$345,334",N. Locastro
35,Richard Wysocki,MPO,354,117,33.05%,235,66.38%,321,90.68%,1184,311,87.85%,64803,$7.19,"$465,868",R. Wysocki
33,Paul Ulibarri,MPO,515,107,20.78%,234,45.44%,394,76.50%,1649,488,94.76%,92465,$3.28,"$302,922",P. Ulibarri
21,Jeremy Koling,MPO,388,73,18.81%,157,40.46%,281,72.42%,1314,345,88.92%,73429,$2.86,"$209,777",J. Koling
30,Matt Orum,MPO,290,71,24.48%,136,46.90%,218,75.17%,1015,266,91.72%,55254,$3.68,"$203,174",M. Orum


This time we got 3 of the top 4, but also first place.