In [None]:
import glob
import math
import os
import warnings

import pandas as pd
import requests
from bs4 import BeautifulSoup
from fancyimpute import KNN
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
warnings.filterwarnings("ignore")
from zenrows import ZenRowsClient
# The ZenRows API key is read from the environment so the credential is never
# stored in (or committed with) the notebook.
# NOTE(review): the key that used to be hard-coded here should be treated as
# leaked and rotated in the ZenRows dashboard.
_zenrows_api_key = os.environ.get("ZENROWS_API_KEY")
if not _zenrows_api_key:
    raise RuntimeError(
        "Set the ZENROWS_API_KEY environment variable before running this notebook."
    )
client = ZenRowsClient(_zenrows_api_key)

# Data Scraping

In [None]:
def get_total_stats(webpage, stat):
    """
    Look up one statistic on a player's college stats page.

    Selects every ``td`` cell whose ``data-stat`` attribute equals ``stat`` and
    returns the text of the last match, or ``None`` when the page has no such
    cell.
    """
    matching_cells = webpage.select(f'td[data-stat="{stat}"]')
    if not matching_cells:
        return None
    # The last matching cell on the page is taken as the player's total.
    return matching_cells[-1].get_text()

In [None]:
# Scraping configuration.
# When `current` is True the draft round/pick are not scraped and stay at 0
# (meant for a class that is still the current year).
current = False
START_YEAR = 2012  # first draft class to scrape (inclusive)
END_YEAR = 2023    # last draft class to scrape (inclusive)

# `data-stat` attribute names looked up on each player's college stats page.
# Every entry must be followed by a comma: two adjacent string literals are
# silently concatenated into a single (non-existent) stat name.
STATS_LIST = [
    # Defense and Fumbles
    'tackles_solo',
    'tackles_assists',
    'tackles_total',
    'tackles_loss',
    'sacks',
    'def_int',
    'def_int_yds',
    'def_int_td',
    'pass_defended',
    'fumbles_rec',
    'fumbles_rec_yds',
    'fumbles_rec_td',
    'fumbles_forced',

    # Passing
    'pass_cmp',
    'pass_att',
    'pass_cmp_pct',
    'pass_yds',
    'pass_td',
    'pass_int',
    'pass_rating',

    # Receiving & Rushing
    'rec',
    'rec_yds',
    'rec_yds_per_rec',
    'rec_td',
    'rush_att',
    'rush_yds',
    'rush_yds_per_att',
    'rush_td',
    'scrim_att',
    'scrim_yds',
    'scrim_yds_per_att',
    'scrim_td',

    # Punt & Kick Returns
    'punt_ret',
    'punt_ret_yds',
    'punt_ret_yds_per_ret',
    'punt_ret_td',
    'kick_ret',
    'kick_ret_yds',
    'kick_ret_yds_per_ret',
    'kick_ret_td',

    # Punting & Kicking
    'xpm',
    'xpa',
    'xp_pct',
    'fgm',
    'fga',
    'fg_pct',
    'kick_points',
    'punt',
    'punt_yds',
    'punt_yds_per_punt',
]


# Combine measurements that the page reports as plain numbers, listed as
# (output column, data-stat attribute, parser) in output-column order.
# Height is parsed separately because it is split on "-" into feet and inches.
NUMERIC_MEASUREMENTS = [
    ("Weight", "weight", int),
    ("40 Yard Dash", "forty_yd", float),
    ("Bench Press", "bench_reps", int),
    ("Vertical Jump", "vertical", float),
    ("Broad Jump", "broad_jump", int),
    ("3 Cone Drill", "cone", float),
    ("Shuttle", "shuttle", float),
]

# Create the output directory up front so a missing folder cannot discard a
# finished year of scraping when the CSV is written.
os.makedirs("data", exist_ok=True)

# Scrape one draft class per iteration and write it to data/<year>.csv.
for year in range(START_YEAR, END_YEAR + 1):
    df = pd.DataFrame()
    print(year)

    # Fetch the combine results page for this draft class.
    combine_url = f"https://www.pro-football-reference.com/draft/{year}-combine.htm"
    combine_response = requests.get(combine_url, timeout=30)
    # Stop on an HTTP error (e.g. rate limiting) instead of scraping the error
    # page as if it were the combine table.
    combine_response.raise_for_status()
    combine_page = BeautifulSoup(combine_response.text, 'html.parser')

    # Player names; cells whose text is the literal "Player" (presumably
    # repeated header rows) are skipped.
    name_cells = combine_page.select("tbody .left:nth-child(1)")
    names = [text for text in (cell.get_text() for cell in name_cells) if text != "Player"]
    num_players = len(names)

    # Player positions.
    positions = [cell.get_text() for cell in combine_page.select("th+ td")]

    # Round and pick default to 0 and are only scraped when `current` is False.
    pick = [0] * num_players
    round_ = [0] * num_players
    if not current:
        draft_cells = combine_page.select(".right+ .left")
        draft_info = [cell.get_text() for cell in draft_cells]
        # An empty cell is replaced by a placeholder that parses to round 0, pick 0.
        draft_info = ["Undrafted / 0th / 0th / 0" if info == "" else info for info in draft_info]
        draft_spots = [info.split(" / ") for info in draft_info]
        # Round is the first character of the second field; pick is every
        # digit found in the third field.
        round_ = [int(spot[1][0]) for spot in draft_spots]
        pick = [int(''.join(filter(str.isdigit, spot[2]))) for spot in draft_spots]

    # School names.
    college = [cell.get_text() for cell in combine_page.select('td.left + .left')]

    df["Name"] = names
    df["Position"] = positions
    df["College"] = college
    df["Round"] = round_
    df["Pick"] = pick

    # Link to each player's college stats page (None when the cell has no link).
    stat_urls = []
    for college_cell in combine_page.select('td[data-stat="college"]'):
        anchor = college_cell.find('a')
        stat_urls.append(anchor.get('href') if anchor is not None else None)
    df["Stat URL"] = stat_urls

    # Height: "<feet>-<inches>" converted to total inches; anything else is NaN.
    height_parts = [
        cell.get_text().split("-")
        for cell in combine_page.select("td[data-stat='height']")
    ]
    df["Height"] = [
        int(parts[0]) * 12 + int(parts[1]) if len(parts) == 2 else math.nan
        for parts in height_parts
    ]

    # Remaining measurements: empty cells become NaN, everything else is parsed.
    for column, data_stat, parse in NUMERIC_MEASUREMENTS:
        cell_texts = [
            cell.get_text()
            for cell in combine_page.select(f"td[data-stat='{data_stat}']")
        ]
        df[column] = [parse(text) if text != "" else math.nan for text in cell_texts]

    # Only players with a college stats link can be enriched below.
    df = df.dropna(subset=["Stat URL"]).reset_index(drop=True)

    # Scrape every linked college stats page, keyed by its URL.
    all_stats = {}
    for stat_url in tqdm(df["Stat URL"]):
        stats = {}
        # Distinct names are used here so the combine page, its URL and its
        # response are not overwritten by the per-player requests.
        player_response = client.get(stat_url)
        player_page = BeautifulSoup(player_response.text, 'html.parser')

        # Conference: text of the first conference cell on the page, if any.
        conf_cells = player_page.select('td[data-stat="conf_abbr"]')
        stats['conf_abbr'] = conf_cells[0].get_text() if conf_cells else None

        # Games and seasons: sum and count of the non-empty "g" cells.
        # NOTE(review): this counts every "g" cell on the page; if the page has
        # several tables or a career-total row the figures would be inflated --
        # confirm against a real stats page.
        game_cells = player_page.select('td[data-stat="g"]')
        if game_cells:
            games_per_row = [
                int(cell.get_text()) for cell in game_cells if cell.get_text() != ""
            ]
            stats['games'] = sum(games_per_row)
            stats['seasons'] = len(games_per_row)
        else:
            stats['games'] = None
            stats['seasons'] = None

        # Total for every tracked statistic (None when the page lacks it).
        for stat in STATS_LIST:
            stats[stat] = get_total_stats(player_page, stat)

        all_stats[stat_url] = stats

    # One row per stats URL, joined back onto the combine data.
    stat_df = pd.DataFrame(all_stats).T
    stat_df.index.name = "Stat URL"
    new_df = pd.merge(df, stat_df, on="Stat URL")
    new_df["Year"] = year
    new_df.to_csv(f"data/{year}.csv", index=False)


# Data Imputation

##### Combine the data for all years into one CSV file called "combined_data.csv"

In [None]:
# File stems of the per-year CSVs, '2012' through '2023'.
csv_files = [str(season) for season in range(2012, 2024)]

# Load data/<stem>.csv for every stem, keeping chronological order.
dfs = [pd.read_csv('data/' + stem + '.csv') for stem in csv_files]

# Stack the yearly frames into one table with a fresh 0..n-1 index and save it.
df = pd.concat(dfs, ignore_index=True)
df.to_csv('data/combined_data.csv', index=False)

##### Identifying and Dropping Columns with Less Than 10% Data Availability

In [None]:
# Remember the full column set so the dropped columns can be reported below.
original_columns = df.columns

# A column survives only if it has at least this many non-missing values
# (10% of the number of rows).
min_non_missing = 0.1 * len(df)
df = df.dropna(axis=1, thresh=min_non_missing)

# Columns that are still present after the filter.
remaining_columns = df.columns

# Report which columns were removed.
dropped_columns = original_columns.difference(remaining_columns)
print(dropped_columns)

##### Imputing Missing Values Using K-Nearest Neighbors (KNN) Algorithm and Label Encoding

In [None]:
# Combine measurements and college stats whose missing values are imputed.
# The list is defined once so the imputer input, the restored column names and
# the write-back into `df` can never drift apart.
feature_columns = [
    'Height', 'Weight', '40 Yard Dash', 'Bench Press',
    'Vertical Jump', 'Broad Jump', '3 Cone Drill', 'Shuttle',
    'tackles_solo', 'tackles_assists', 'tackles_loss', 'sacks',
    'def_int', 'def_int_yds', 'def_int_td', 'pass_defended',
    'fumbles_rec', 'fumbles_rec_yds', 'fumbles_rec_td', 'fumbles_forced',
    'rec', 'rec_yds', 'rec_yds_per_rec', 'rec_td', 'rush_att', 'rush_yds',
    'rush_yds_per_att', 'rush_td', 'scrim_att', 'scrim_yds',
    'scrim_yds_per_att', 'scrim_td',
]
# 'Position' is fed to the imputer as an extra feature but is not written back.
impute_columns = ['Position'] + feature_columns

# Label-encode the categorical 'Position' column. `.assign` returns a new
# frame, so the encoded values are never written into a slice of `df`.
label_encoder = LabelEncoder()
imp_df = df[impute_columns].assign(
    Position=label_encoder.fit_transform(df['Position'])
)

# Impute missing values using the KNN algorithm with k=5. The imputer output
# is wrapped back into a DataFrame with the original column names and row
# index so the write-back below aligns row for row.
imp_df = pd.DataFrame(
    KNN(k=5).fit_transform(imp_df),
    columns=impute_columns,
    index=df.index,
)

# Round the values in the DataFrame to 2 decimal places
imp_df = imp_df.round(2)

# Replace the selected columns in the original DataFrame with the imputed values
df[feature_columns] = imp_df[feature_columns]

##### Imputing Missing Games and Seasons Values Using KNN Algorithm

In [None]:
# Columns used to impute games/seasons; 'Position' is only an extra feature.
games_columns = ["Position", "games", "seasons"]

# Label-encode 'Position' on a new frame (`.assign` copies), so nothing is
# written into a slice of `df`.
imp_df = df[games_columns].assign(
    Position=label_encoder.fit_transform(df["Position"])
)

# Impute missing 'games' and 'seasons' values using KNN with k=10.
# `KNN` is the name imported from fancyimpute; the bare module name
# `fancyimpute` is not imported in this notebook and would raise NameError.
imp_df = pd.DataFrame(
    KNN(k=10).fit_transform(imp_df),
    columns=games_columns,
    index=df.index,
)

# Round the values in the DataFrame to the nearest integer
imp_df = imp_df.round(0)

# Store the imputed values in the 'Games' and 'Seasons' columns.
# NOTE(review): these capitalised names create new columns next to the
# original lowercase 'games'/'seasons' (which keep their missing values).
# Kept as-is because later code may read 'Games'/'Seasons' -- confirm intent.
df[["Games", "Seasons"]] = imp_df[["games", "seasons"]]

##### Calculating Total Tackles and Exporting Imputed Data to CSV

In [None]:
# Total tackles are rebuilt from the solo and assisted counts and rounded to a
# whole number.
tackles_sum = df['tackles_solo'] + df['tackles_assists']
df['tackles_total'] = tackles_sum.round(0)

# Write the fully imputed table to disk.
df.to_csv('data/imputed_data.csv', index=False)