In [1]:
# Import data manipulation modules
import pandas as pd
import os
import numpy as np
import seaborn as sns

# Import data visualization modules
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import rcParams
from bs4 import BeautifulSoup

In [2]:
# Read the draft CSV from the Resources folder and preview the first rows
draft_csv_path = './Resources/Draft_data.csv'
draft_df = pd.read_csv(draft_csv_path)
draft_df.head()

Unnamed: 0,Year,Rnd,Pick,Player,Pos,DrAge,Tm,From,To,AP1,PB,St,CarAV,G,GS,College/Univ
0,2021,1,1,Trevor Lawrence\LawrTr00,QB,21.0,JAX,2021,2021,0,0,0,0,9,9,Clemson
1,2021,1,2,Zach Wilson\WilsZa00,QB,22.0,NYJ,2021,2021,0,0,0,0,6,6,BYU
2,2021,1,3,Trey Lance\LancTr00,QB,21.0,SFO,2021,2021,0,0,0,0,4,1,North Dakota St.
3,2021,1,4,Kyle Pitts\PittKy00,TE,20.0,ATL,2021,2021,0,0,0,0,10,9,Florida
4,2021,1,5,Ja'Marr Chase\ChasJa00,WR,21.0,CIN,2021,2021,0,0,0,0,9,9,LSU


In [3]:
# Non-null entries per column; any column whose total is below the row count has missing values
draft_df.notna().sum()

Year            5512
Rnd             5512
Pick            5512
Player          5512
Pos             5512
DrAge           5511
Tm              5512
From            5512
To              5512
AP1             5512
PB              5512
St              5512
CarAV           5512
G               5512
GS              5512
College/Univ    5503
dtype: int64

In [4]:
# Show the dtype pandas inferred for each column of the draft DataFrame
draft_df.dtypes

Year              int64
Rnd               int64
Pick              int64
Player           object
Pos              object
DrAge           float64
Tm               object
From              int64
To                int64
AP1               int64
PB                int64
St                int64
CarAV             int64
G                 int64
GS                int64
College/Univ     object
dtype: object

In [5]:
# Preview the first ten rows
draft_df.iloc[:10]

Unnamed: 0,Year,Rnd,Pick,Player,Pos,DrAge,Tm,From,To,AP1,PB,St,CarAV,G,GS,College/Univ
0,2021,1,1,Trevor Lawrence\LawrTr00,QB,21.0,JAX,2021,2021,0,0,0,0,9,9,Clemson
1,2021,1,2,Zach Wilson\WilsZa00,QB,22.0,NYJ,2021,2021,0,0,0,0,6,6,BYU
2,2021,1,3,Trey Lance\LancTr00,QB,21.0,SFO,2021,2021,0,0,0,0,4,1,North Dakota St.
3,2021,1,4,Kyle Pitts\PittKy00,TE,20.0,ATL,2021,2021,0,0,0,0,10,9,Florida
4,2021,1,5,Ja'Marr Chase\ChasJa00,WR,21.0,CIN,2021,2021,0,0,0,0,9,9,LSU
5,2021,1,6,Jaylen Waddle\WaddJa00,WR,22.0,MIA,2021,2021,0,0,0,0,10,10,Alabama
6,2021,1,7,Penei Sewell\SewePe00,OL,20.0,DET,2021,2021,0,0,0,0,9,9,Oregon
7,2021,1,8,Jaycee Horn\HornJa00,DB,21.0,CAR,2021,2021,0,0,0,0,3,3,South Carolina
8,2021,1,9,Patrick Surtain II\SurtPa01,DB,21.0,DEN,2021,2021,0,0,0,0,10,9,Alabama
9,2021,1,10,DeVonta Smith\SmitDe07,WR,22.0,PHI,2021,2021,0,0,0,0,10,10,Alabama


In [6]:
# Give the abbreviated source columns descriptive labels.
# rename() returns a new frame that is bound back to `draft_df`, rather than mutating the
# existing object with inplace=True, so the cell reads as an explicit reassignment.
# NOTE: 'Career Approxmiate Value' is misspelled, but later cells select the column by
# this exact label, so the spelling is intentionally left unchanged here.
column_labels = {
    'Year': 'Draft Year',
    'Rnd': 'Round',
    'Pos': 'Position',
    'DrAge': 'Draft Age',
    'Tm': 'Team',
    'From': 'Start Year',
    'To': 'End Year',
    'AP1': "First Team All-Pro",
    'PB': 'Pro-Bowl Selections',
    'St': "Starter Years",
    'CarAV': 'Career Approxmiate Value',
    'G': 'Games Played',
    'GS': "Games Started",
}
draft_df = draft_df.rename(columns=column_labels)
draft_df.head()

Unnamed: 0,Draft Year,Round,Pick,Player,Position,Draft Age,Team,Start Year,End Year,First Team All-Pro,Pro-Bowl Selections,Starter Years,Career Approxmiate Value,Games Played,Games Started,College/Univ
0,2021,1,1,Trevor Lawrence\LawrTr00,QB,21.0,JAX,2021,2021,0,0,0,0,9,9,Clemson
1,2021,1,2,Zach Wilson\WilsZa00,QB,22.0,NYJ,2021,2021,0,0,0,0,6,6,BYU
2,2021,1,3,Trey Lance\LancTr00,QB,21.0,SFO,2021,2021,0,0,0,0,4,1,North Dakota St.
3,2021,1,4,Kyle Pitts\PittKy00,TE,20.0,ATL,2021,2021,0,0,0,0,10,9,Florida
4,2021,1,5,Ja'Marr Chase\ChasJa00,WR,21.0,CIN,2021,2021,0,0,0,0,9,9,LSU


In [7]:
# Career length column: End Year minus Start Year
# NOTE(review): this is a difference, so any row whose Start Year equals End Year gets 0
# whatever its Games Played value is — confirm that is the intended definition.
career_span = draft_df["End Year"].sub(draft_df["Start Year"])
draft_df["Career Length (Years)"] = career_span
draft_df.tail(10)

Unnamed: 0,Draft Year,Round,Pick,Player,Position,Draft Age,Team,Start Year,End Year,First Team All-Pro,Pro-Bowl Selections,Starter Years,Career Approxmiate Value,Games Played,Games Started,College/Univ,Career Length (Years)
5502,2000,7,245,Danny Clark\ClarDa21,LB,23.0,JAX,2000,2010,0,0,8,39,168,102,Illinois,10
5503,2000,7,246,Leroy Fields,WR,0.0,DEN,2000,2000,0,0,0,0,0,0,Jackson St.,0
5504,2000,7,247,Mark Baniewicz,T,0.0,JAX,2000,2000,0,0,0,0,0,0,Syracuse,0
5505,2000,7,248,Lewis Kelly\KellLe21,T,23.0,MIN,2001,2005,0,0,0,4,18,5,South Carolina St.,4
5506,2000,7,249,Eugene McCaslin\McCaEu20,LB,23.0,GNB,2000,2000,0,0,0,0,1,0,Florida,0
5507,2000,7,250,Ethan Howell,WR,0.0,WAS,2000,2000,0,0,0,0,0,0,Oklahoma St.,0
5508,2000,7,251,DaShon Polk\PolkDa20,LB,23.0,BUF,2000,2006,0,0,1,10,95,16,Arizona,6
5509,2000,7,252,Rondell Mealey\MealRo00,RB,23.0,GNB,2001,2002,0,0,0,2,14,1,LSU,1
5510,2000,7,253,Alfonso Boone\BoonAl20,DT,24.0,DET,2001,2010,0,0,1,23,129,41,Mount San Antonio JC,9
5511,2000,7,254,Michael Green\GreeMi20,DB,23.0,CHI,2000,2008,0,0,2,22,104,48,Northwestern St. (LA),8


In [8]:
# Number of drafted players for each distinct Position label
draft_df["Position"].value_counts()

DB     798
WR     717
LB     594
DE     493
RB     454
DT     428
T      418
TE     322
G      289
QB     270
CB     189
C      138
S      128
OL      74
OLB     73
FB      48
ILB     34
DL      31
NT      14
Name: Position, dtype: int64

In [9]:
# Collapse the NT, DE and DT labels into the single DL group
is_dl_label = draft_df["Position"].isin(["NT", "DE", "DT"])
draft_df.loc[is_dl_label, "Position"] = "DL"

In [10]:
# Collapse the OLB and ILB labels into the single LB group
is_lb_label = draft_df["Position"].isin(["OLB", "ILB"])
draft_df.loc[is_lb_label, "Position"] = "LB"


In [11]:
# Collapse the T, C and G labels into the single OL group
is_ol_label = draft_df["Position"].isin(["T", "C", "G"])
draft_df.loc[is_ol_label, "Position"] = "OL"

In [22]:
# Re-count players per Position after the label merges above
draft_df["Position"].value_counts()

DL    966
OL    919
DB    798
WR    717
LB    701
RB    454
TE    322
QB    270
CB    189
S     128
FB     48
Name: Position, dtype: int64

In [None]:
# Disabled: subset of the drafts from 2000 to 2010.
# NOTE(review): the line below filters on `Draft_Yr`, but the rename cell labels that
# column 'Draft Year' — fix the column reference before re-enabling this.
#draft_df_2010 = draft_df.loc[draft_df.Draft_Yr <= 2010, :]

# Positional Dataframes

In [None]:
# Rows whose Position label is QB
QB_df = draft_df.loc[draft_df["Position"].eq("QB")]
QB_df.head()

In [None]:
# Rows whose Position label is RB
RB_df = draft_df.loc[draft_df["Position"].eq("RB")]
RB_df.head()

In [None]:
# Rows whose Position label is FB
FB_df = draft_df.loc[draft_df["Position"].eq("FB")]
FB_df.head()

In [None]:
# Rows whose Position label is WR
WR_df = draft_df.loc[draft_df["Position"].eq("WR")]
WR_df.head()

In [None]:
# Rows whose Position label is TE
TE_df = draft_df.loc[draft_df["Position"].eq("TE")]
TE_df.head()

In [None]:
# Rows whose (merged) Position label is OL
OL_df = draft_df.loc[draft_df["Position"].eq("OL")]
OL_df.head()

In [None]:
# Rows whose (merged) Position label is DL
DL_df = draft_df.loc[draft_df["Position"].eq("DL")]
DL_df.head()

In [None]:
# Rows whose Position label is DB
DB_df = draft_df.loc[draft_df["Position"].eq("DB")]
DB_df.head()

In [None]:
# Rows whose Position label is CB
CB_df = draft_df.loc[draft_df["Position"].eq("CB")]
CB_df.head()

In [None]:
# Rows whose (merged) Position label is LB
LB_df = draft_df.loc[draft_df["Position"].eq("LB")]
LB_df.head()

In [None]:
# Rows whose Position label is S
S_df = draft_df.loc[draft_df["Position"].eq("S")]
S_df.head()

# Graphs and Plots

In [None]:
# Plot styling applied to the figures drawn after this cell
from matplotlib import rcParams

# Scale seaborn's fonts first, then set the default figure size
# (order matters: sns.set() rewrites rcParams, so figsize is assigned afterwards)
sns.set(font_scale=1.65)
rcParams.update({"figure.figsize": (12, 9)})

In [None]:
# Disabled: distribution plot of Career Approximate Value.
# NOTE(review): the call below passes the whole DataFrame rather than a single column —
# presumably the 'Career Approxmiate Value' column was intended; confirm before re-enabling.
# `sns.distplot` is also deprecated in recent seaborn releases (see `histplot` / `displot`).
#sns.distplot(draft_df)
#plt.title("Distribution of Career Approximate Value")
#plt.xlim(-5,150)
#plt.show()

In [None]:
# Box plot of Career Approximate Value for each position group.
# The year range in the title is read from the data so it always matches the rows plotted;
# the frame includes the 2021 draft class, which a fixed "2000-2020" label left out.
draft_years = draft_df["Draft Year"]
sns.boxplot(x="Position", y="Career Approxmiate Value", data=draft_df)
plt.title("Distribution of Career Approximate Value by Position "
          f"({draft_years.min()}-{draft_years.max()})")
plt.show()

In [None]:
# Box plot of career length for each position group.
# The year range in the title is read from the data so it always matches the rows plotted;
# the frame includes the 2021 draft class, which a fixed "2000-2020" label left out.
draft_years = draft_df["Draft Year"]
sns.boxplot(x="Position", y="Career Length (Years)", data=draft_df)
plt.title("Distribution of Career Length by Position "
          f"({draft_years.min()}-{draft_years.max()})")
plt.show()

In [None]:
# Position-by-round pick counts, including the 'All' margin totals
round_info = pd.crosstab(index=draft_df["Position"], columns=draft_df["Round"], margins=True)

# Divide every row by the 'All' row so each cell becomes a share of its column total
round_totals = round_info.loc["All"]
round_info.div(round_totals, axis="columns")

In [None]:
# QB-only box plot of career length for each draft round.
# The title now names what is plotted (QBs grouped by Round, not by Position), and the year
# range is read from the QB rows instead of being hard-coded.
qb_draft_years = QB_df["Draft Year"]
sns.boxplot(x="Round", y="Career Length (Years)", data=QB_df)
plt.title("Distribution of QB Career Length by Round "
          f"({qb_draft_years.min()}-{qb_draft_years.max()})")
plt.show()

In [None]:
# Summary statistics for every QB_df column; include='all' covers non-numeric columns too
QB_df.describe(include='all')

In [None]:
# Same summary statistics, computed separately for each draft round
qb_by_round = QB_df.groupby("Round")
qb_by_round.describe(include="all")

In [None]:
# LOWESS-smoothed trend of career length against overall draft pick.
# The trend line is black; scatter points take the fifth colour of the active seaborn
# palette at 50% opacity.
sns.regplot(x="Pick", y="Career Length (Years)", data=draft_df, lowess=True,
            line_kws={"color": "black"},
            scatter_kws={"color": sns.color_palette()[4], "alpha": 0.5})
# Title matches the plotted y variable (career length, not Career Approximate Value)
plt.title("Career Length by Pick")
plt.xlim(-5, 300)
plt.ylim(0, 25)
plt.show()

In [None]:
# One LOWESS panel per position: career length against draft round.
# `height` is used instead of `size`: seaborn renamed the keyword in 0.9 and newer
# releases no longer accept `size`.
lm = sns.lmplot(x="Round", y="Career Length (Years)", data=draft_df, lowess=True, col="Position",
                col_wrap=5, height=4, line_kws={"color": "black"},
                scatter_kws={"color": sns.color_palette()[5], "alpha": 0.7})

# Set the axis limits through the grid so they apply to every facet explicitly,
# rather than relying on the pyplot "current axes"
lm.set(xlim=(0, 8), ylim=(0, 25))

# add title to the plot (which is a FacetGrid); leave headroom so it clears the panels
# https://stackoverflow.com/questions/29813694/how-to-add-a-title-to-seaborn-facet-plot
lm.fig.subplots_adjust(top=0.9)
# Title matches the plotted variables (career length by round, not approximate value by pick)
lm.fig.suptitle("Career Length by Round and Position",
                fontsize=30)

plt.show()

# QB Summary