In [2]:
import pandas as pd
import os, time, random, requests, warnings, gc
warnings.filterwarnings('ignore')

from sklearn.tree import DecisionTreeClassifier 
from sklearn.tree import DecisionTreeRegressor 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split 
from sklearn import metrics 
from bs4 import BeautifulSoup

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
# Define function to retrieve advanced metrics of any year of data for stats of any position and any season

def fetch_metrics_adv(pos, year):
    return f"https://www.fantasypros.com/nfl/advanced-stats-{pos}.php?range=full&year={year}"

In [5]:
def fetch_metrics_all_adv(pos, year):
    try:
        # Create a directory to store CSV files if it doesn't exist
        output_dir = "football-data"
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        
        # Set the CSV file path
        file_name = f"{output_dir}/{pos}_{year}_adv.csv"
        
        # Fetch the URL using the helper function
        pos_year = fetch_metrics_adv(pos, year)
        pos_year_req = requests.get(pos_year, timeout=15)

        # Parse the HTML content into DataFrame
        pos_year_data = pd.read_html(pos_year_req.text)

        # Check if data is retrieved successfully
        if not pos_year_data or len(pos_year_data) == 0:
            print(f"No data found for position {pos} in year {year}.")
            return pd.DataFrame()  # Return empty DataFrame if no data is found

        # Remove multi-level column index if it exists
        if isinstance(pos_year_data[0].columns, pd.MultiIndex):
            pos_year_data[0].columns = pos_year_data[0].columns.droplevel()

        # Add a 'Season' column to indicate the year automatically using the 'year' parameter
        pos_year_data[0]['Season'] = year

        # Split the 'Player' column into 'Player' and 'Team' columns
        pos_year_data[0]['Team'] = pos_year_data[0]['Player'].str.extract(r'\((\w+)\)')
        pos_year_data[0]['Player'] = pos_year_data[0]['Player'].str.replace(r'\s*\(\w+\)', '', regex=True)

        # Pause to avoid being flagged as a bot
        time.sleep(random.uniform(1, 3))

        # Remove percentage symbol and convert to float where applicable
        if pos == 'qb':
            if 'PCT' in pos_year_data[0].columns:
                pos_year_data[0]['PCT'] = pos_year_data[0]['PCT'].str.replace('%', '').astype(float)

        elif pos in ['wr', 'te']:
            if '% TM' in pos_year_data[0].columns:
                pos_year_data[0]['% TM'] = pos_year_data[0]['% TM'].str.replace('%', '').astype(float)

        # Reorder the columns to place 'Team' and 'Season' right next to 'Player'
        cols = pos_year_data[0].columns.tolist()
        player_index = cols.index('Player')
        cols.insert(player_index + 1, cols.pop(cols.index('Team')))
        cols.insert(player_index + 2, cols.pop(cols.index('Season')))
        pos_year_data[0] = pos_year_data[0][cols]

        # Save data to CSV to avoid fetching again in the future"
        pos_year_data[0].to_csv(file_name, index=False)
        print(f"Data for {pos.upper()} in {year} saved to {file_name}")

        # Clear memory
        del pos_year_data, pos_year_req
        gc.collect()

        # Return the saved data as DataFrame
        return pd.read_csv(file_name)

    except Exception as e:
        print(f"An error occurred for position {pos.upper()} in year {year}: {e}")
        return pd.DataFrame()  # Return an empty DataFrame on failure

In [6]:
# Let's fetch our data now!
# Fetch metrics using a loop

positions = ['qb', 'rb', 'wr', 'te']
years = range(2021, 2025)

for pos in positions:
    for year in years:
        print(f"Fetching advanced metrics for {pos.upper()} position in the {year} season...")
        globals()[f"{pos}_{year}_adv"] = fetch_metrics_all_adv(pos, year)

Fetching advanced metrics for QB position in the 2021 season...
Data for QB in 2021 saved to football-data/qb_2021_adv.csv
Fetching advanced metrics for QB position in the 2022 season...
Data for QB in 2022 saved to football-data/qb_2022_adv.csv
Fetching advanced metrics for QB position in the 2023 season...
Data for QB in 2023 saved to football-data/qb_2023_adv.csv
Fetching advanced metrics for QB position in the 2024 season...
Data for QB in 2024 saved to football-data/qb_2024_adv.csv
Fetching advanced metrics for RB position in the 2021 season...
Data for RB in 2021 saved to football-data/rb_2021_adv.csv
Fetching advanced metrics for RB position in the 2022 season...
Data for RB in 2022 saved to football-data/rb_2022_adv.csv
Fetching advanced metrics for RB position in the 2023 season...
Data for RB in 2023 saved to football-data/rb_2023_adv.csv
Fetching advanced metrics for RB position in the 2024 season...
Data for RB in 2024 saved to football-data/rb_2024_adv.csv
Fetching advance

In [7]:
qb_2021_adv

Unnamed: 0,Rank,Player,Team,Season,G,COMP,ATT,PCT,YDS,Y/A,AIR,AIR/A,10+ YDS,20+ YDS,30+ YDS,40+ YDS,50+ YDS,PKT TIME,SACK,KNCK,HRRY,BLITZ,POOR,DROP,RZ ATT,RTG
0,1,Josh Allen,BUF,2021,17,409,646,63.0,4407,6.8,2900,4.5,181,51,16,8,3,2.4,26,63,96,182,108,35,123,92
1,2,Justin Herbert,LAC,2021,17,443,672,66.0,5014,7.5,2867,4.3,208,52,22,15,3,2.4,31,62,50,154,95,38,104,99
2,3,Tom Brady,FA,2021,17,485,719,68.0,5316,7.4,3079,4.3,205,75,23,10,4,2.3,22,30,30,177,135,32,120,102
3,4,Patrick Mahomes II,KC,2021,17,436,658,66.0,4828,7.3,2480,3.8,192,58,21,11,4,2.4,28,58,76,112,116,31,100,100
4,5,Matthew Stafford,LAR,2021,17,404,601,67.0,4886,8.1,2987,5.0,174,65,28,18,10,2.4,30,42,32,138,111,31,112,105
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,80,Davis Webb,FA,2021,1,0,0,0.0,0,0.0,0,0.0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0
80,81,Feleipe Franks,CAR,2021,6,0,1,0.0,0,0.0,0,0.0,0,0,0,0,0,1.7,0,0,0,0,0,0,0,0
81,82,Logan Woodside,CIN,2021,4,0,0,0.0,0,0.0,0,0.0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0
82,83,John Wolford,JAC,2021,2,1,4,25.0,5,1.3,0,0.0,0,0,0,0,0,2.2,1,0,0,0,1,0,0,28


In [8]:
wr_data_adv = pd.concat([wr_2021_adv, wr_2022_adv, wr_2023_adv, wr_2024_adv], ignore_index=True)
wr_data_adv

Unnamed: 0,Rank,Player,Team,Season,G,REC,YDS,Y/R,YBC,YBC/R,AIR,AIR/R,YAC,YAC/R,YACON,YACON/R,BRKTKL,TGT,% TM,CATCHABLE,DROP,RZ TGT,10+ YDS,20+ YDS,30+ YDS,40+ YDS,50+ YDS,LNG
0,1,Cooper Kupp,LAR,2021,17,145,1947,13.4,1101,7.6,0,0.0,846,5.8,257,1.8,10,191,31.7,156,8,37,66,30,15,9,3,59
1,2,Deebo Samuel Sr.,SF,2021,16,77,1405,18.9,621,8.0,0,0.0,855,11.0,295,3.8,13,121,24.2,89,10,9,52,24,13,10,6,83
2,3,Ja'Marr Chase,CIN,2021,17,81,1455,18.0,804,9.9,0,0.0,651,8.0,251,3.1,8,128,23.7,95,11,12,51,22,13,8,6,82
3,4,Justin Jefferson,MIN,2021,17,108,1616,14.9,1141,10.5,0,0.0,482,4.4,149,1.4,5,167,28.9,120,7,20,67,27,11,5,2,56
4,5,Davante Adams,NYJ,2021,16,123,1553,12.6,961,7.8,0,0.0,592,4.8,146,1.2,5,169,29.6,127,4,27,66,19,12,4,2,59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
761,170,Allen Robinson II,DET,2024,8,2,9,4.5,7,3.5,25,12.5,2,1.0,0,0.0,0,5,1.5,3,1,0,0,0,0,0,0,7
762,171,Ainias Smith,PHI,2024,5,3,6,2.0,-1,-0.3,-1,-0.3,7,2.3,0,0.0,0,3,1.1,3,0,0,0,0,0,0,0,5
763,172,Robbie Chosen,FA,2024,2,1,5,5.0,4,4.0,47,47.0,1,1.0,0,0.0,0,4,1.0,1,0,2,0,0,0,0,0,5
764,173,Chris Conley,SF,2024,10,1,4,4.0,4,4.0,156,156.0,0,0.0,0,0.0,0,7,2.1,1,0,1,0,0,0,0,0,4


#### Advanced metrics retrieved for 2024 season - let's try to fetch general RZ stats now

In [10]:
# Define function to retrieve any year of data for RZ stats of any position and any season
# The mode argument is not required in the fetch_metrics_rz function as standard scoring formats results will appear by default
# To access Half-PPR scoring, the argument should be "HALF" and to access PPR scoring, the argument should be "PPR"
# Yardline argument takes input of 5, 10, 15, or 20 to show RZ stats in different distances from the end zone but will be defaulted to 15

def fetch_metrics_rz(pos, year, yardline='15', mode='STANDARD'):
    return f"https://www.fantasypros.com/nfl/red-zone-stats/{pos}.php?year={year}&yardline={yardline}&scoring={mode}"

In [11]:
## test fetch for url
fetch_metrics_rz('wr', 2024, 15, 'PPR')

'https://www.fantasypros.com/nfl/red-zone-stats/wr.php?year=2024&yardline=15&scoring=PPR'

In [12]:
# Define a function to convert percentage columns to floats for use in RZ table extraction
def convert_pct(df, column_name):
    df[column_name] = df[column_name].astype(str)
    df[column_name] = df[column_name].str.replace('%', '').astype(float)

In [13]:
def fetch_metrics_all_rz(pos, year, yardline, mode):
    try:
        # Create a directory to store CSV files if it doesn't exist
        output_dir = "football-data"
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # Set the CSV file path
        file_name = f"{output_dir}/{pos}_{year}_rz_{yardline}_{mode}.csv"
        
        # Fetch the URL using the fetch_metrics_rz function
        pos_year = fetch_metrics_rz(pos, year, yardline, mode)
        pos_year_req = requests.get(pos_year, timeout=10)

        # Parse the HTML content into DataFrame
        pos_year_data = pd.read_html(pos_year_req.text)

        # Check if data is retrieved successfully
        if not pos_year_data or len(pos_year_data) == 0:
            print(f"No data found for position {pos} in year {year} for scoring mode {mode}.")
            return pd.DataFrame()  # Return empty DataFrame if no data is found

        # Drop the multi-level column if applicable
        if isinstance(pos_year_data[0].columns, pd.MultiIndex):
            pos_year_data[0].columns = pos_year_data[0].columns.droplevel()

        # Add a 'Season' column to indicate the year automatically using the 'year' parameter
        pos_year_data[0]['Season'] = year

        # Split the 'Player' column into 'Player' and 'Team' columns
        pos_year_data[0]['Team'] = pos_year_data[0]['Player'].str.extract(r'\((\w+)\)')
        pos_year_data[0]['Player'] = pos_year_data[0]['Player'].str.replace(r'\s*\(\w+\)', '', regex=True)

        # Introduce a delay to avoid overwhelming the server
        time.sleep(random.uniform(1, 3))

        # Apply renaming of columns based on position
        if pos == 'qb':
            pos_year_data[0].columns.values[4] = 'RZ PASS %'
            pos_year_data[0].columns.values[13] = 'RZ RUSH %'

            # Apply convert_pct function to relevant columns
            for col in ['ROST %', 'RZ PASS %', 'RZ RUSH %']:
                convert_pct(pos_year_data[0], col)

            # Rename column titles to indicate RZ variation
            pos_year_data[0].columns.values[2] = 'RZ CMP'
            pos_year_data[0].columns.values[3] = 'RZ PASS ATT'
            pos_year_data[0].columns.values[5] = 'RZ PASS YDS'
            pos_year_data[0].columns.values[6] = 'RZ Y/A'
            pos_year_data[0].columns.values[7] = 'RZ PASS TD'
            pos_year_data[0].columns.values[8] = 'RZ INT'
            pos_year_data[0].columns.values[9] = 'RZ SACKS'
            pos_year_data[0].columns.values[10] = 'RZ RUSH ATT'
            pos_year_data[0].columns.values[11] = 'RZ RUSH YDS'
            pos_year_data[0].columns.values[12] = 'RZ RUSH TD'
            pos_year_data[0].columns.values[14] = 'RZ FL'
            pos_year_data[0].columns.values[16] = 'RZ FPTS'
            pos_year_data[0].columns.values[17] = 'RZ FPTS/G'

        elif pos == 'rb':
            # Apply convert_pct function to relevant columns
            for col in ['ROST %', 'TGT PCT', 'REC PCT', 'PCT']:
                convert_pct(pos_year_data[0], col)

            # Rename column titles to indicate RZ variation
            pos_year_data[0].columns.values[2] = 'RZ RUSH ATT'
            pos_year_data[0].columns.values[3] = 'RZ RUSH YDS'
            pos_year_data[0].columns.values[4] = 'RZ Y/A'
            pos_year_data[0].columns.values[5] = 'RZ RUSH TD'
            pos_year_data[0].columns.values[6] = 'RZ RUSH %'
            pos_year_data[0].columns.values[7] = 'RZ REC'
            pos_year_data[0].columns.values[8] = 'RZ TGT'
            pos_year_data[0].columns.values[9] = 'RZ REC %'
            pos_year_data[0].columns.values[10] = 'RZ REC YDS'
            pos_year_data[0].columns.values[11] = 'RZ Y/R'
            pos_year_data[0].columns.values[12] = 'RZ REC TD'
            pos_year_data[0].columns.values[13] = 'RZ TGT %'
            pos_year_data[0].columns.values[14] = 'RZ FL'
            pos_year_data[0].columns.values[16] = 'RZ FPTS'
            pos_year_data[0].columns.values[17] = 'RZ FPTS/G'
            pos_year_data[0].columns.values[18] = 'ROST'

        elif pos in ['wr', 'te']:
            # Apply convert_pct function to relevant columns
            for col in ['ROST %', 'TGT PCT', 'REC PCT', 'PCT']:
                convert_pct(pos_year_data[0], col)

            # Rename column titles to indicate RZ variation
            pos_year_data[0].columns.values[2] = 'RZ REC'
            pos_year_data[0].columns.values[3] = 'RZ TGT'
            pos_year_data[0].columns.values[4] = 'RZ REC %'
            pos_year_data[0].columns.values[5] = 'RZ REC YDS'
            pos_year_data[0].columns.values[6] = 'RZ Y/R'
            pos_year_data[0].columns.values[7] = 'RZ TD'
            pos_year_data[0].columns.values[8] = 'RZ TGT %'
            pos_year_data[0].columns.values[9] = 'RZ RUSH ATT'
            pos_year_data[0].columns.values[10] = 'RZ RUSH YDS'
            pos_year_data[0].columns.values[11] = 'RZ RUSH TD'
            pos_year_data[0].columns.values[12] = 'RZ RUSH %'
            pos_year_data[0].columns.values[13] = 'RZ RUSH FL'
            pos_year_data[0].columns.values[15] = 'RZ FPTS'
            pos_year_data[0].columns.values[16] = 'RZ FPTS/G'
            pos_year_data[0].columns.values[17] = 'ROST'

        # Reorder the columns to place 'Team' right next to 'Player' and place 'Season' right next to 'Team'
        cols = pos_year_data[0].columns.tolist()
        player_index = cols.index('Player')
        cols.insert(player_index + 1, cols.pop(cols.index('Team')))
        cols.insert(player_index + 2, cols.pop(cols.index('Season')))
        pos_year_data[0] = pos_year_data[0][cols]

        # Save to CSV for intermediate storage in the "football-data" directory
        pos_year_data[0].to_csv(file_name, index=False)
        print(f"Data for {pos.upper()} in year {year}, at yardline {yardline} saved to {file_name}.")

        # Clear memory
        del pos_year_data, pos_year_req
        gc.collect()

        # Return the saved data as DataFrame (optional)
        return pd.read_csv(file_name)

    except Exception as e:
        print(f"An error occurred for position {pos.upper()} in year {year}, yardline {yardline}: {e}")
        return pd.DataFrame()  # Return an empty DataFrame on failure


In [14]:
# Let's fetch our data now!
# Fetch metrics using a loop

# Explaining my choice of RZ metrics:
""" 
I want to retrieve all QB data within 20 yards of the RZ as quarterbacks are involved in every red zone play, and their influence extends throughout 
the entire red zone (20 yards from the end zone). Pulling stats from the 20-yard line will give me a complete view of all the situations in which 
a quarterback is actively contributing to offensive success, including passing plays, scrambles, and even quarterback sneaks. This data helps predict 
how effectively a quarterback can move the ball and create scoring opportunities, regardless of distance.

I want to retrieve all WR data within 15 yards of the RZ because by choosing the 15-yard line, I'm focusing on plays that are closer to the goal line, 
where wide receivers are often used in shorter, quicker routes or contested catches in tight spaces. These routes include slants, fades, curls, or 
crossing routes that happen in congested areas, often with defenders in close coverage. The 15-yard line will focus more on scenarios that involve 
potential scoring plays where wide receivers need to create separation quickly and often face double coverage. This can give us a clear understanding 
of how effective a receiver is in high-pressure situations, such as contested catches or red-zone efficiency.

I want to retrieve all TE data within 10 yards of the RZ because tight ends are often used in goal-line situations for their versatility. At 10 yards 
or closer, they are valuable assets because of their size, blocking ability, and proficiency in short-yardage passing routes. They are frequently 
targeted for quick, short passes in congested areas of the field—plays like seam routes, quick slants, or goal-line fades are typical. Tight ends excel 
in high-probability scoring areas because of their physical stature, which makes them great red zone targets. By choosing the 10-yard line, you focus on
scenarios where tight ends are most likely to convert plays into touchdowns, which provides a clearer picture of their value in scoring opportunities.

I want to retrieve all RB data within 10 yards of the RZ because running backs are most critical in goal-line situations where teams are in short-
yardage scenarios. Therefore, choosing the 10-yard line or closer will capture the stats that best reflect their effectiveness in carrying the ball 
into the end zone, making it most relevant for predicting their success. Running backs are often heavily used in short, powerful runs close to the goal 
line, and limiting the analysis to the 10-yard mark will emphasize situations where they are relied upon the most for scoring.
"""

yardlines = {'qb': 20, 'rb': 10, 'wr': 15, 'te': 10}
positions = ['qb', 'rb', 'wr', 'te']
years = range(2021, 2025)
scoring_modes = ['PPR']

for mode in scoring_modes:
    for pos in positions:
        for year in years:
            yardline = yardlines[pos]
            print(f"Fetching data for position {pos.upper()} in year {year} with scoring mode {mode} and yardline {yardline}...")
            fetched_data = fetch_metrics_all_rz(pos, year, yardline, mode)
            if not fetched_data.empty:
                globals()[f"{pos}_{year}_rz"] = fetched_data

Fetching data for position QB in year 2021 with scoring mode PPR and yardline 20...
Data for QB in year 2021, at yardline 20 saved to football-data/qb_2021_rz_20_PPR.csv.
Fetching data for position QB in year 2022 with scoring mode PPR and yardline 20...
Data for QB in year 2022, at yardline 20 saved to football-data/qb_2022_rz_20_PPR.csv.
Fetching data for position QB in year 2023 with scoring mode PPR and yardline 20...
Data for QB in year 2023, at yardline 20 saved to football-data/qb_2023_rz_20_PPR.csv.
Fetching data for position QB in year 2024 with scoring mode PPR and yardline 20...
Data for QB in year 2024, at yardline 20 saved to football-data/qb_2024_rz_20_PPR.csv.
Fetching data for position RB in year 2021 with scoring mode PPR and yardline 10...
Data for RB in year 2021, at yardline 10 saved to football-data/rb_2021_rz_10_PPR.csv.
Fetching data for position RB in year 2022 with scoring mode PPR and yardline 10...
Data for RB in year 2022, at yardline 10 saved to football-da

In [15]:
wr_2024_rz

Unnamed: 0,Rank,Player,Team,Season,RZ REC,RZ TGT,RZ REC %,RZ REC YDS,RZ Y/R,RZ TD,RZ TGT %,RZ RUSH ATT,RZ RUSH YDS,RZ RUSH TD,RZ RUSH %,RZ RUSH FL,G,RZ FPTS,RZ FPTS/G,ROST
0,1,Amon-Ra St. Brown,DET,2024,11,12,91.7,62,5.6,7,60.0,0,0,0,0.0,0,12,63.5,5.3,99.8
1,2,Drake London,ATL,2024,10,15,66.7,71,7.1,5,71.4,0,0,0,0.0,0,12,47.1,3.9,99.0
2,3,Jayden Reed,GB,2024,6,7,85.7,59,9.8,5,35.0,1,-5,0,100.0,0,12,41.4,3.5,92.5
3,4,Brian Thomas Jr.,JAC,2024,5,8,62.5,42,8.4,5,34.8,0,0,0,0.0,0,12,39.2,3.3,88.4
4,5,Terry McLaurin,WAS,2024,6,7,85.7,30,5.0,5,35.0,0,0,0,0.0,0,13,39.0,3.0,94.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157,158,DK Metcalf,SEA,2024,0,5,0.0,0,0.0,0,25.0,0,0,0,0.0,0,10,0.0,0.0,97.6
158,159,Braxton Berrios,MIA,2024,0,1,0.0,0,0.0,0,4.5,0,0,0,0.0,0,6,0.0,0.0,0.2
159,160,Dante Pettis,NO,2024,0,0,0.0,0,0.0,0,0.0,0,0,0,0.0,0,3,0.0,0.0,0.0
160,161,Irvin Charles,NYJ,2024,0,1,0.0,0,0.0,0,2.6,0,0,0,0.0,0,11,0.0,0.0,0.0


#### Advanced metrics and RZ stats retrieved for 2024 season - let's try to fetch general metrics now

In [17]:
# Define function to retrieve any year of data for stats of any position and any season
# The mode argument is not required in the fetch_metrics function as standard scoring formats results will appear by default
# To access Half-PPR scoring, the argument should be "HALF" and to access PPR scoring, the argument should be "PPR"

def fetch_metrics(pos, year, mode='STANDARD'):
    return f"https://www.fantasypros.com/nfl/stats/{pos}.php?year={year}&scoring={mode}"

In [18]:
def fetch_metrics_all(pos, year, mode):
    try:
        # Create a directory to store CSV files if it doesn't exist
        output_dir = "football-data"
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # Set the CSV file path
        file_name = f"{output_dir}/{pos}_{year}_{mode}.csv"
        
        # Fetch the URL using the fetch_metrics function
        pos_year = fetch_metrics(pos, year, mode)
        pos_year_req = requests.get(pos_year, timeout=10)
        
        # Parse the HTML content into DataFrame
        pos_year_data = pd.read_html(pos_year_req.text)

        # Check if data is retrieved successfully
        if not pos_year_data or len(pos_year_data) == 0:
            print(f"No data found for position {pos} in year {year} for scoring mode {mode}.")
            return pd.DataFrame()  # Return empty DataFrame if no data is found

        # Drop the multi-level column if applicable
        if isinstance(pos_year_data[0].columns, pd.MultiIndex):
            pos_year_data[0].columns = pos_year_data[0].columns.droplevel()

        # Reset index if needed
        if isinstance(pos_year_data[0].columns, pd.MultiIndex):
            pos_year_data[0].columns = pos_year_data[0].columns.droplevel()

        # Add a 'Season' column to indicate the year automatically using the 'year' parameter
        pos_year_data[0]['Season'] = year

        # Split the 'Player' column into 'Player' and 'Team' columns
        pos_year_data[0]['Team'] = pos_year_data[0]['Player'].str.extract(r'\((\w+)\)')
        pos_year_data[0]['Player'] = pos_year_data[0]['Player'].str.replace(r'\s*\(\w+\)', '', regex=True)

        # Remove % from ROST to convert obj to float
        if 'ROST' in pos_year_data[0].columns:
            pos_year_data[0]['ROST'] = pos_year_data[0]['ROST'].str.replace('%', '').astype(float)

        # Introduce a delay to avoid being flagged and denied access
        time.sleep(random.uniform(1, 3))
        
        # Rename duplicate column titles to avoid confusion based on the position
        if pos == 'qb':
            pos_year_data[0].columns.values[2] = 'PASS CMP'
            pos_year_data[0].columns.values[3] = 'PASS ATT'
            pos_year_data[0].columns.values[4] = 'PASS PCT'
            pos_year_data[0].columns.values[5] = 'PASS YDS'
            pos_year_data[0].columns.values[6] = 'PASS Y/A'
            pos_year_data[0].columns.values[7] = 'PASS TD'
            pos_year_data[0].columns.values[10] = 'RUSH ATT'
            pos_year_data[0].columns.values[11] = 'RUSH YDS'
            pos_year_data[0].columns.values[12] = 'RUSH TD'
            pos_year_data[0].columns.values[13] = 'FUML'
        
        elif pos == 'rb':
            pos_year_data[0].columns.values[2] = 'RUSH ATT'
            pos_year_data[0].columns.values[3] = 'RUSH YDS'
            pos_year_data[0].columns.values[7] = 'RUSH TD'
            pos_year_data[0].columns.values[10] = 'REC YDS'
            pos_year_data[0].columns.values[12] = 'REC TD'
            pos_year_data[0].columns.values[13] = 'FUML'
        
        elif pos in ['wr', 'te']:
            pos_year_data[0].columns.values[4] = 'REC YDS'
            pos_year_data[0].columns.values[8] = 'REC TD'
            pos_year_data[0].columns.values[9] = 'RUSH ATT'
            pos_year_data[0].columns.values[10] = 'RUSH YDS'
            pos_year_data[0].columns.values[11] = 'RUSH TD'
            pos_year_data[0].columns.values[12] = 'FUML'
        
        # Reorder the columns to place 'Team' right next to 'Player' and place 'Season' right next to 'Team'
        cols = pos_year_data[0].columns.tolist()
        player_index = cols.index('Player')
        cols.insert(player_index + 1, cols.pop(cols.index('Team')))
        cols.insert(player_index + 2, cols.pop(cols.index('Season')))
        pos_year_data[0] = pos_year_data[0][cols]
        
        # Save to CSV for intermediate storage in the "football-data" directory
        pos_year_data[0].to_csv(file_name, index=False)
        print(f"Data for {pos.upper()} in year {year} ({mode}) saved to {file_name}.")

        # Clear memory
        del pos_year_data, pos_year_req
        gc.collect()
        
        # Return the saved data as DataFrame (optional)
        return pd.read_csv(file_name)

    except Exception as e:
        print(f"An error occurred for position {pos} in year {year} ({mode}): {e}")
        return pd.DataFrame()  # Return an empty DataFrame on failure

In [19]:
# Let's fetch our data now!
# Fetch metrics using a loop

positions = ['qb', 'rb', 'wr', 'te']
years = range(2021, 2025)
scoring_mode = 'PPR'

for pos in positions:
    for year in years:
        print(f"Fetching data for position {pos.upper()} in year {year} with {scoring_mode} scoring...")
        fetched_data = fetch_metrics_all(pos, year, scoring_mode)
        if not fetched_data.empty:
            globals()[f"{pos}_{year}"] = fetched_data

Fetching data for position QB in year 2021 with PPR scoring...
Data for QB in year 2021 (PPR) saved to football-data/qb_2021_PPR.csv.
Fetching data for position QB in year 2022 with PPR scoring...
Data for QB in year 2022 (PPR) saved to football-data/qb_2022_PPR.csv.
Fetching data for position QB in year 2023 with PPR scoring...
Data for QB in year 2023 (PPR) saved to football-data/qb_2023_PPR.csv.
Fetching data for position QB in year 2024 with PPR scoring...
Data for QB in year 2024 (PPR) saved to football-data/qb_2024_PPR.csv.
Fetching data for position RB in year 2021 with PPR scoring...
Data for RB in year 2021 (PPR) saved to football-data/rb_2021_PPR.csv.
Fetching data for position RB in year 2022 with PPR scoring...
Data for RB in year 2022 (PPR) saved to football-data/rb_2022_PPR.csv.
Fetching data for position RB in year 2023 with PPR scoring...
Data for RB in year 2023 (PPR) saved to football-data/rb_2023_PPR.csv.
Fetching data for position RB in year 2024 with PPR scoring...

In [20]:
wr_2024

Unnamed: 0,Rank,Player,Team,Season,REC,TGT,REC YDS,Y/R,LG,20+,REC TD,RUSH ATT,RUSH YDS,RUSH TD,FUML,G,FPTS,FPTS/G,ROST
0,1,Ja'Marr Chase,CIN,2024,79,109,1142,14.5,70,14,13,1,11,0,0,12,272.3,22.7,99.8
1,2,Amon-Ra St. Brown,DET,2024,76,92,820,10.8,35,9,9,1,-4,0,0,12,215.9,18.0,99.8
2,3,Justin Jefferson,MIN,2024,68,100,1038,15.3,97,20,5,1,3,0,0,12,203.0,16.9,99.8
3,4,Terry McLaurin,WAS,2024,61,82,896,14.7,86,10,9,2,2,0,1,13,202.8,15.6,94.1
4,5,CeeDee Lamb,DAL,2024,79,124,880,11.1,65,11,4,13,69,0,1,12,197.9,16.5,99.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330,331,Jermaine Jackson,NO,2024,0,0,0,0.0,0,0,0,1,-1,0,0,4,-0.1,0.0,0.0
331,332,Kadarius Toney,CLE,2024,0,1,0,0.0,0,0,0,2,-4,0,0,2,-0.4,-0.2,0.6
332,333,Steven Sims Jr.,HOU,2024,0,0,0,0.0,0,0,0,1,0,0,1,7,-2.0,-0.3,0.0
333,334,Jha'Quan Jackson,TEN,2024,1,3,8,8.0,8,0,0,1,-5,0,2,12,-2.7,-0.2,0.1


In [21]:
wr_data = pd.concat([wr_2021, wr_2022, wr_2023, wr_2024], ignore_index=True)
wr_data

Unnamed: 0,Rank,Player,Team,Season,REC,TGT,REC YDS,Y/R,LG,20+,REC TD,RUSH ATT,RUSH YDS,RUSH TD,FUML,G,FPTS,FPTS/G,ROST
0,1,Cooper Kupp,LAR,2021,145,191,1947,13.4,59,30,16,4,18,0,0,17,439.5,25.9,98.6
1,2,Davante Adams,NYJ,2021,123,169,1553,12.6,59,19,11,0,0,0,0,16,344.3,21.5,98.8
2,3,Deebo Samuel Sr.,SF,2021,77,121,1405,18.2,83,24,6,59,365,8,2,16,339.0,21.2,97.6
3,4,Justin Jefferson,MIN,2021,108,167,1616,15.0,56,27,10,6,14,0,1,17,330.4,19.4,99.8
4,5,Ja'Marr Chase,CIN,2021,81,128,1455,18.0,82,22,13,7,21,0,1,17,304.6,17.9,99.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,331,Jermaine Jackson,NO,2024,0,0,0,0.0,0,0,0,1,-1,0,0,4,-0.1,0.0,0.0
1096,332,Kadarius Toney,CLE,2024,0,1,0,0.0,0,0,0,2,-4,0,0,2,-0.4,-0.2,0.6
1097,333,Steven Sims Jr.,HOU,2024,0,0,0,0.0,0,0,0,1,0,0,1,7,-2.0,-0.3,0.0
1098,334,Jha'Quan Jackson,TEN,2024,1,3,8,8.0,8,0,0,1,-5,0,2,12,-2.7,-0.2,0.1


In [22]:
test = wr_data.sort_values(by='FPTS', ascending=False)

In [23]:
test.reset_index(drop=True)

Unnamed: 0,Rank,Player,Team,Season,REC,TGT,REC YDS,Y/R,LG,20+,REC TD,RUSH ATT,RUSH YDS,RUSH TD,FUML,G,FPTS,FPTS/G,ROST
0,1,Cooper Kupp,LAR,2021,145,191,1947,13.4,59,30,16,4,18,0,0,17,439.5,25.9,98.6
1,1,CeeDee Lamb,DAL,2023,135,181,1749,13.0,92,29,12,14,113,2,2,17,403.2,23.7,99.8
2,2,Tyreek Hill,MIA,2023,119,171,1799,15.1,78,29,13,6,15,0,1,16,376.4,23.5,99.5
3,1,Justin Jefferson,MIN,2022,128,184,1809,14.1,64,28,8,4,24,1,0,17,368.6,21.7,99.8
4,2,Davante Adams,NYJ,2021,123,169,1553,12.6,59,19,11,0,0,0,0,16,344.3,21.5,98.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,333,Steven Sims Jr.,HOU,2024,0,0,0,0.0,0,0,0,1,0,0,1,7,-2.0,-0.3,0.0
1096,249,Trent Taylor,SF,2023,0,2,0,0.0,0,0,0,1,-2,0,1,16,-2.2,-0.1,0.0
1097,334,Jha'Quan Jackson,TEN,2024,1,3,8,8.0,8,0,0,1,-5,0,2,12,-2.7,-0.2,0.1
1098,266,Diontae Spencer,FA,2021,1,4,-3,-3.0,0,0,0,0,0,0,2,15,-3.3,-0.2,0.0


In [24]:
key_columns = ['Player', 'Team', 'Season']

common_columns = set(test.columns).intersection(set(wr_data_adv.columns)) - set(key_columns)

wr_data_adv_uniq = wr_data_adv.drop(columns=list(common_columns))

total = pd.merge(test, wr_data_adv_uniq, on=key_columns, how='inner')

In [25]:
features = ['REC','TGT','REC YDS','Y/R','LG','20+','REC TD','RUSH ATT','RUSH YDS','RUSH TD','FUML','G','ROST','YDS','YBC','YBC/R','AIR','AIR/R','YAC','YAC/R','YACON','YACON/R','BRKTKL','% TM','CATCHABLE','DROP','RZ TGT','10+ YDS','20+ YDS','30+ YDS','40+ YDS','50+ YDS','LNG']
target = 'FPTS/G'

X = total[features]
y = total[target]

In [26]:
from sklearn.model_selection import train_test_split

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [27]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Initialize and train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate performance
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 2.7044162532467504


In [28]:
X_week_9_and_onwards = total[features]

# Assuming `model` is already trained
# Use the model to predict fantasy points for Week 9 for each player
predicted_week_9_points = model.predict(X_week_9_and_onwards)

# Add the predictions to the DataFrame to see which player has which predicted points
week_9_data = total.copy()
week_9_data['Predicted_FantasyPoints_Week_9'] = predicted_week_9_points

# Display the first few rows to verify predictions
week_9_data.head(15)

Unnamed: 0,Rank,Player,Team,Season,REC,TGT,REC YDS,Y/R,LG,20+,REC TD,RUSH ATT,RUSH YDS,RUSH TD,FUML,G,FPTS,FPTS/G,ROST,YDS,YBC,YBC/R,AIR,AIR/R,YAC,YAC/R,YACON,YACON/R,BRKTKL,% TM,CATCHABLE,DROP,RZ TGT,10+ YDS,20+ YDS,30+ YDS,40+ YDS,50+ YDS,LNG,Predicted_FantasyPoints_Week_9
0,1,Cooper Kupp,LAR,2021,145,191,1947,13.4,59,30,16,4,18,0,0,17,439.5,25.9,98.6,1947,1101,7.6,0,0.0,846,5.8,257,1.8,10,31.7,156,8,37,66,30,15,9,3,59,24.205
1,1,CeeDee Lamb,DAL,2023,135,181,1749,13.0,92,29,12,14,113,2,2,17,403.2,23.7,99.8,1749,1073,7.9,1726,12.8,676,5.0,207,1.5,11,29.9,143,6,31,73,29,8,3,1,92,22.711
2,2,Tyreek Hill,MIA,2023,119,171,1799,15.1,78,29,13,6,15,0,1,16,376.4,23.5,99.5,1799,1146,9.6,1847,15.5,653,5.5,85,0.7,12,31.1,131,12,24,64,29,14,9,5,78,20.881
3,1,Justin Jefferson,MIN,2022,128,184,1809,14.1,64,28,8,4,24,1,0,17,368.6,21.7,99.8,1809,1185,9.3,0,0.0,624,4.9,132,1.0,6,28.7,134,5,28,74,28,14,6,1,64,21.547
4,2,Davante Adams,NYJ,2021,123,169,1553,12.6,59,19,11,0,0,0,0,16,344.3,21.5,98.8,1553,961,7.8,0,0.0,592,4.8,146,1.2,5,29.6,127,4,27,66,19,12,4,2,59,21.016
5,2,Tyreek Hill,MIA,2022,119,170,1710,14.4,64,25,7,7,32,1,0,17,341.2,20.1,99.5,1710,1226,10.3,0,0.0,484,4.1,58,0.5,5,30.2,127,8,9,69,25,9,7,4,64,19.954
6,3,Deebo Samuel Sr.,SF,2021,77,121,1405,18.2,83,24,6,59,365,8,2,16,339.0,21.2,97.6,1405,621,8.0,0,0.0,855,11.0,295,3.8,13,24.2,89,10,9,52,24,13,10,6,83,19.095
7,3,Davante Adams,NYJ,2022,100,180,1516,15.2,60,24,14,3,-1,0,0,17,335.5,19.7,98.8,1516,1023,10.2,0,0.0,493,4.9,96,1.0,9,32.3,109,8,22,61,24,14,7,2,60,18.203
8,3,Amon-Ra St. Brown,DET,2023,119,164,1515,12.7,70,24,10,4,24,0,1,16,330.9,20.7,99.8,1515,847,7.1,1297,10.9,668,5.6,159,1.3,10,28.6,127,8,23,60,24,6,3,1,70,20.729
9,4,Justin Jefferson,MIN,2021,108,167,1616,15.0,56,27,10,6,14,0,1,17,330.4,19.4,99.8,1616,1141,10.5,0,0.0,482,4.4,149,1.4,5,28.9,120,7,20,67,27,11,5,2,56,19.561


#### Ahead is retrieving individual player data

In [30]:
pd.reset_option('display.max_rows')

In [31]:
def clean_wkly_score_opp_cols(df):
    # Extract the result and final score from the 'Score' column
    df[['Result', 'Final Score']] = df['Score'].str.extract(r'([WL])(?:,\s*)(\d+-\d+)')

    # Drop the original 'Score' column
    df.drop(columns=['Score'], inplace=True)

    # Rename the 'Final Score' column to 'Score'
    df.rename(columns={'Final Score': 'Score'}, inplace=True)

    # Remove 'vs.' and '@' from the 'OPP' column
    df['OPP'] = df['OPP'].str.replace(r'vs\.\s*|@\s*', '', regex=True)

    return df

In [32]:
def clean_wkly_df_wr_te(df):

    # Clean scoring and opp columns
    clean_wkly_score_opp_cols(df)

    # Rename columns to discern weekly stats and not season stats
    df.columns.values[0] = 'Week'
    df.columns.values[2] = 'WKLY REC'
    df.columns.values[3] = 'WKLY TGT'
    df.columns.values[4] = 'WKLY REC YDS'
    df.columns.values[5] = 'WKLY Y/R'
    df.columns.values[6] = 'WKLY REC LG'
    df.columns.values[7] = 'WKLY REC TD'
    df.columns.values[8] = 'WKLY RUSH ATT'
    df.columns.values[9] = 'WKLY RUSH YDS'
    df.columns.values[10] = 'WKLY Y/A'
    df.columns.values[11] = 'WKLY RUSH LG'
    df.columns.values[12] = 'WKLY RUSH TD'
    df.columns.values[13] = 'WKLY FUM'
    df.columns.values[14] = 'WKLY FUML'
    df.columns.values[15] = 'WKLY PTS'

    # Explicitly drop columns that are not needed
    columns_to_drop = ['fum', 'fuml', 'Points']
    df = df.drop(columns=[col for col in columns_to_drop if col in df.columns], errors='ignore')
    
    return df

In [33]:
def clean_wkly_df_rb(df):

    # Clean scoring and opp columns
    clean_wkly_score_opp_cols(df)

    # Rename columns to discern weekly stats and not season stats
    df.columns.values[0] = 'Week'
    df.columns.values[2] = 'WKLY RUSH ATT'
    df.columns.values[3] = 'WKLY RUSH YDS'
    df.columns.values[4] = 'WKLY Y/A'
    df.columns.values[5] = 'WKLY RUSH LG'
    df.columns.values[6] = 'WKLY RUSH TD'
    df.columns.values[7] = 'WKLY REC'
    df.columns.values[8] = 'WKLY TGT'
    df.columns.values[9] = 'WKLY REC YDS'
    df.columns.values[10] = 'WKLY Y/R'
    df.columns.values[11] = 'WKLY REC LG'
    df.columns.values[12] = 'WKLY REC TD'
    df.columns.values[13] = 'WKLY FUM'
    df.columns.values[14] = 'WKLY FUML'
    df.columns.values[15] = 'WKLY PTS'

    return df

In [34]:
def clean_wkly_df_qb(df):

    # Clean scoring and opp columns
    clean_wkly_score_opp_cols(df)

    # Rename columns to discern weekly stats and not season stats
    df.columns.values[0] = 'Week'
    df.columns.values[2] = 'QB RAT'
    df.columns.values[3] = 'WKLY CMP'
    df.columns.values[4] = 'WKLY PASS ATT'
    df.columns.values[5] = 'WKLY PASS PCT'
    df.columns.values[6] = 'WKLY PASS YDS'
    df.columns.values[7] = 'WKLY PASS Y/A'
    df.columns.values[8] = 'WKLY PASS TD'
    df.columns.values[9] = 'WKLY INT'
    df.columns.values[10] = 'WKLY SACKS'
    df.columns.values[11] = 'WKLY RUSH ATT'
    df.columns.values[12] = 'WKLY RUSH YDS'
    df.columns.values[13] = 'WKLY RUSH Y/A'
    df.columns.values[14] = 'WKLY RUSH LG'
    df.columns.values[15] = 'WKLY RUSH TD'
    df.columns.values[16] = 'WKLY FUM'
    df.columns.values[17] = 'WKLY FUML'
    df.columns.values[18] = 'WKLY PTS'

    return df

In [35]:
def scrape_player_stats(position, season_year, mode):
    try:
        # Create a directory to store CSV files if it doesn't exist
        output_dir = "football-data"
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # Set the new CSV file path using the new naming convention
        file_name = f"{output_dir}/wkly_{position}_{season_year}_{mode}.csv"

        # Fetch the URL using the fetch_metrics function
        url = fetch_metrics(position, season_year, mode)
        data = requests.get(url, timeout=10)

        # Use BeautifulSoup to parse the HTML content
        soup = BeautifulSoup(data.text, 'html.parser')

        # Select the first table with the specified class
        try:
            test_table = soup.select('table.table')[0]
        except IndexError:
            print(f"No data found for position {position} in {season_year} for scoring mode {mode}.")
            return pd.DataFrame()  # Return an empty DataFrame if no table is found

        # Find all rows in the table
        rows = test_table.find_all('tr')

        # Create lists to store player names and URLs
        player_names = []
        player_urls = []

        # Loop through each row to extract player names and URLs
        for row in rows:
            # Find the <a> tag with the player name and URL
            player_link = row.find('a', class_='player-name')
            if player_link:
                # Extract the player's name from the 'fp-player-name' attribute
                player_name = player_link.get('fp-player-name')
                player_url = f"https://www.fantasypros.com{player_link.get('href')}"

                # Store the name and URL
                player_names.append(player_name)
                player_urls.append(player_url)

        # Check if there are player names and URLs
        if not player_urls:
            print(f"No player URLs found for position {position} in {season_year} for scoring mode {mode}.")
            return pd.DataFrame()  # Return an empty DataFrame if no player URLs are found

        # Prepare the URLs for game logs using the season_year directly from the loop
        players_stats_urls = [
            url.replace('stats', 'games') + f'?scoring={mode}&season={season_year}'
            for url in player_urls
        ]

        # Create an empty list to store all player DataFrames
        all_players_data = []

        # Loop through each player's URL and name to scrape weekly stats
        for player_url, player_name in zip(players_stats_urls, player_names):
            try:
                # Introduce a delay to avoid being flagged and denied access
                time.sleep(random.uniform(1, 3))

                # Extract the season year from the URL
                season = player_url.split('season=')[-1]

                # Get the player's game log data
                matches_req = requests.get(player_url, timeout=10)
                matches = pd.read_html(matches_req.text, match="Game Log")

                # Check if the game log table was found and is not empty
                if len(matches) > 0:
                    # Extract the DataFrame
                    player_df = matches[0]

                    # Remove any multi-level columns
                    if isinstance(player_df.columns, pd.MultiIndex):
                        player_df.columns = player_df.columns.droplevel()

                    # Add the player's name, season, position, and mode as new columns
                    player_df['Player'] = player_name
                    player_df['Season'] = season
                    player_df['Position'] = position.upper()
                    player_df['Scoring Mode'] = mode

                    # Clean weekly data based on position and reorder columns
                    if position in ['wr', 'te']:
                        clean_wkly_df_wr_te(player_df)
                    elif position == 'rb':
                        clean_wkly_df_rb(player_df)
                    elif position == 'qb':
                        clean_wkly_df_qb(player_df)

                    # Reorder the columns to have important columns at the front
                    reorder = ['Scoring Mode', 'Season', 'Position', 'Player', 'Week', 'OPP', 'Score', 'Result'] + \
                              [col for col in player_df.columns if col not in ['Scoring Mode', 'Season', 'Position', 'Player', 'Week', 'OPP', 'Score', 'Result']]
                    player_df = player_df[reorder]

                    # Remove the 'Totals' row if it exists
                    player_df = player_df[player_df['Week'] != 'Totals']

                    # Reset the index of the DataFrame to avoid index issues during concatenation
                    player_df.reset_index(drop=True, inplace=True)

                    # Drop duplicate columns if there are any
                    player_df = player_df.loc[:, ~player_df.columns.duplicated()]

                    # Append the cleaned DataFrame to the list
                    all_players_data.append(player_df)

                else:
                    print(f"No Game Log data found for player: {player_name} ({position}, {season_year}, {mode})")

            except Exception as e:
                print(f"Failed to retrieve data for {player_name} in season {season_year} ({position}, {mode}): {e}")

        # Combine all DataFrames into a single DataFrame for the current season and position
        if all_players_data:
            combined_season_position_data = pd.concat(all_players_data, ignore_index=True)
        else:
            combined_season_position_data = pd.DataFrame()  # Return an empty DataFrame if no data was collected

        # Save combined data to CSV in the "football-data" directory to manage memory
        combined_season_position_data.to_csv(file_name, index=False)
        print(f"Data for {position.upper()} in year {season_year} ({mode}) saved to {file_name}.")

        # Clear memory
        del combined_season_position_data, all_players_data, player_names, player_urls, players_stats_urls
        gc.collect()

        # Return the saved data as DataFrame (optional)
        return pd.read_csv(file_name)

    except Exception as e:
        print(f"An error occurred for position {position} in {season_year} ({mode}): {e}")
        return pd.DataFrame()  # Return an empty DataFrame on failure

In [88]:
#from bs4 import BeautifulSoup

# List of positions to scrape data for
#positions = ['wr', 'rb', 'qb', 'te']
positions = ['wr', 'rb', 'qb', 'te']

# List of seasons to scrape data for
#seasons = [2021, 2022, 2023, 2024]
seasons = [2021, 2022, 2023, 2024]

# List of scoring modes to scrape data for, we are interested in PPR stats
#scoring_modes = ['STANDARD', 'HALF', 'PPR']
scoring_modes = ['PPR']

# Create a dictionary to store each position's data
position_data = {position: [] for position in positions}

# Loop through each scoring mode, position, and season
for mode in scoring_modes:
    for position in positions:
        for season_year in seasons:
            print(f"Scraping data for the {position.upper()} position in the {season_year} season with {mode} scoring...")
            season_position_data = scrape_player_stats(position, season_year, mode)
            if not season_position_data.empty:
                # Store the new CSV filename with the correct path for later use
                csv_file_name = f"football-data/wkly_{position}_{season_year}_{mode}.csv"
                position_data[position].append(csv_file_name)

# Find the range of years in the seasons list
year_range = f"{min(seasons)}_{max(seasons)}"

# Create a list to store all DataFrames for each position
all_positions_data = []

# Read and combine each position's data from all seasons into separate DataFrames
for position, csv_files in position_data.items():
    all_data = [pd.read_csv(file) for file in csv_files if os.path.exists(file)]
    combined_position_data = pd.concat(all_data, ignore_index=True)
    # Save the combined data for each position into a CSV file
    combined_file_name = f"football-data/wkly_{position}_{year_range}.csv"
    combined_position_data.to_csv(combined_file_name, index=False)
    print(f"Combined data for {position.upper()} saved to {combined_file_name}.")
    
    # Add the combined position data to the list of all positions
    all_positions_data.append(combined_position_data)

# Combine all positions' data into a single DataFrame
combined_all_data = pd.concat(all_positions_data, ignore_index=True)

# Save the combined data for all positions into a CSV file
combined_all_positions_file = f"football-data/wkly_all_positions_{year_range}.csv"
combined_all_data.to_csv(combined_all_positions_file, index=False)
print(f"Combined data for all positions saved to {combined_all_positions_file}.")

# If you want to display the combined DataFrame for all positions
# print(combined_all_data)

Scraping data for the WR position in the 2021 season with PPR scoring...
Data for WR in year 2021 (PPR) saved to football-data/wkly_wr_2021_PPR.csv.
Scraping data for the WR position in the 2022 season with PPR scoring...
Data for WR in year 2022 (PPR) saved to football-data/wkly_wr_2022_PPR.csv.
Scraping data for the WR position in the 2023 season with PPR scoring...
Data for WR in year 2023 (PPR) saved to football-data/wkly_wr_2023_PPR.csv.
Scraping data for the WR position in the 2024 season with PPR scoring...
Data for WR in year 2024 (PPR) saved to football-data/wkly_wr_2024_PPR.csv.
Scraping data for the RB position in the 2021 season with PPR scoring...
Failed to retrieve data for Nick Bellore in season 2021 (rb, PPR): "['Week', 'WKLY RUSH ATT', 'WKLY RUSH YDS', 'WKLY Y/A', 'WKLY RUSH LG', 'WKLY RUSH TD', 'WKLY REC', 'WKLY TGT', 'WKLY REC YDS', 'WKLY Y/R', 'WKLY REC LG', 'WKLY REC TD', 'WKLY FUM', 'WKLY FUML', 'WKLY PTS'] not in index"
Failed to retrieve data for Sutton Smith in

In [86]:
# combined_all_data

#### Now let's compile some defensive data to pair with our weekly stats

In [90]:
# Define function to retrieve defensive metrics for any season
def fetch_metrics_def_stats(year):
    return f"https://www.fantasypros.com/nfl/stats/dst.php?year={year}"

def fetch_metrics_def_rankings(year):
    return f"https://www.fantasypros.com/nfl/points-allowed.php?year={year}"

In [92]:
def fetch_metrics_def_all(year):
    try:
        # Introduce a delay to avoid being flagged and denied access
        time.sleep(random.uniform(1, 3))

        # Fetch the defensive stats and rankings URLs
        def_stats = fetch_metrics_def_stats(year)
        def_ranks = fetch_metrics_def_rankings(year)

        # Get defensive stats data
        def_stats_req = requests.get(def_stats, timeout=10)
        def_stats_df = pd.read_html(def_stats_req.text)

        # Rename columns to discern Position Rank
        def_stats_data = def_stats_df[0]
        def_stats_data[['Team', 'Team Code']] = def_stats_data['Player'].str.extract(r'^(.*)\s\((\w+)\)$')

        # Add a 'Season' column to each dataset to indicate the year automatically using the 'year' parameter
        def_stats_data['Season'] = year

        # Reorder the columns
        reorder_def = ['Rank', 'Player', 'Team', 'Team Code', 'Season'] + [col for col in def_stats_data.columns if col not in ['Rank', 'Player', 'Team', 'Team Code', 'Season']]
        def_stats_data = def_stats_data[reorder_def]

        # Drop redundant columns
        def_stats_data.drop(columns=['Player', 'ROST'], inplace=True)

        # Get defensive ranks data
        def_ranks_req = requests.get(def_ranks, timeout=10)
        def_ranks_df = pd.read_html(def_ranks_req.text)

        # Drop rows where all values are NaN
        def_ranks_data = def_ranks_df[0].dropna(how='all')

        # Add a 'Season' column to each dataset to indicate the year automatically using the 'year' parameter
        def_ranks_data['Season'] = year
        # Reset the index after dropping rows
        def_ranks_data.reset_index(drop=True, inplace=True)

        # Convert all 'Rank' columns to integers
        rank_columns = [col for col in def_ranks_data.columns if 'Rank' in col]
        def_ranks_data[rank_columns] = def_ranks_data[rank_columns].apply(pd.to_numeric, errors='coerce').astype('Int64')

        # Merge the two DataFrames on the 'Team' and 'Season' columns
        def_full_data = pd.merge(def_stats_data, def_ranks_data, on=['Team', 'Season'], how='inner')

        # Rename columns to discern Position Rank
        def_full_data.columns.values[0] = 'DEF RANK'
        def_full_data.columns.values[14] = 'QB DEF RANK'
        def_full_data.columns.values[15] = 'QB AVG PTS GIVEN'
        def_full_data.columns.values[16] = 'RB DEF RANK'
        def_full_data.columns.values[17] = 'RB AVG PTS GIVEN'
        def_full_data.columns.values[18] = 'WR DEF RANK'
        def_full_data.columns.values[19] = 'WR AVG PTS GIVEN'
        def_full_data.columns.values[20] = 'TE DEF RANK'
        def_full_data.columns.values[21] = 'TE AVG PTS GIVEN'
        def_full_data.columns.values[22] = 'K DEF RANK'
        def_full_data.columns.values[23] = 'K AVG PTS GIVEN'
        def_full_data.columns.values[24] = 'D/ST DEF RANK'
        def_full_data.columns.values[25] = 'D/ST AVG PTS GIVEN'

        # Save to CSV in the "football-data" directory
        output_dir = "football-data"
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        file_name = f"{output_dir}/def_stats_{year}.csv"
        def_full_data.to_csv(file_name, index=False)
        print(f"Data for defensive metrics in year {year} saved to {file_name}.")

        # Clear memory
        del def_stats_data, def_ranks_data, def_full_data
        gc.collect()

        # Return the saved data as DataFrame (optional)
        return pd.read_csv(file_name)

    except Exception as e:
        print(f"An error occurred while fetching defensive metrics for year {year}: {e}")
        return pd.DataFrame()  # Return an empty DataFrame on failure

In [94]:
pd.set_option('display.max_rows', None)

In [96]:
# List of seasons to scrape data for
# import gc

def_seasons = [2021, 2022, 2023, 2024]

# Create an empty list to store each season's data for all positions and scoring modes
all_def_data = []

# Loop through each scoring mode, position, and season
for season_year in def_seasons:
    print(f"Scraping data for the D/ST position in the {season_year} season...")
    def_season_position_data = fetch_metrics_def_all(season_year)
    if not def_season_position_data.empty:
        all_def_data.append(def_season_position_data)
        # Save the intermediate result to a CSV to free up memory
        def_season_position_data.to_csv(f"def_data_{season_year}.csv", index=False)
        # Clean up memory
        del def_season_position_data
        gc.collect()

# Combine all the seasons into one final DataFrame by reading from saved CSVs
combined_def_data = pd.concat([pd.read_csv(f"def_data_{year}.csv") for year in def_seasons], ignore_index=True)


combined_def_data

Scraping data for the D/ST position in the 2021 season...
Data for defensive metrics in year 2021 saved to football-data/def_stats_2021.csv.
Scraping data for the D/ST position in the 2022 season...
Data for defensive metrics in year 2022 saved to football-data/def_stats_2022.csv.
Scraping data for the D/ST position in the 2023 season...
Data for defensive metrics in year 2023 saved to football-data/def_stats_2023.csv.
Scraping data for the D/ST position in the 2024 season...
Data for defensive metrics in year 2024 saved to football-data/def_stats_2024.csv.


Unnamed: 0,DEF RANK,Team,Team Code,Season,SACK,INT,FR,FF,DEF TD,SFTY,SPC TD,G,FPTS,FPTS/G,QB DEF RANK,QB AVG PTS GIVEN,RB DEF RANK,RB AVG PTS GIVEN,WR DEF RANK,WR AVG PTS GIVEN,TE DEF RANK,TE AVG PTS GIVEN,K DEF RANK,K AVG PTS GIVEN,D/ST DEF RANK,D/ST AVG PTS GIVEN
0,1,Dallas Cowboys,DAL,2021,41,26,8,13,6,0,3,17,185.0,10.9,23,17.1,23,16.8,13,22.8,18,7.1,13,8.4,27,4.1
1,2,New England Patriots,NE,2021,36,23,7,11,3,0,0,17,158.0,9.3,31,13.5,19,17.4,31,18.5,32,4.2,26,6.4,15,6.6
2,3,Miami Dolphins,MIA,2021,48,14,13,14,5,0,1,17,155.0,9.1,21,17.2,12,18.4,22,21.4,10,8.5,11,8.5,11,7.5
3,4,Buffalo Bills,BUF,2021,42,19,11,10,1,0,0,17,151.0,8.9,32,12.8,16,17.8,32,13.9,30,4.7,32,5.2,31,3.8
4,5,New Orleans Saints,NO,2021,46,18,7,12,2,0,0,17,145.0,8.5,16,17.9,32,13.9,12,23.1,23,6.4,14,8.4,13,6.8
5,6,Tampa Bay Buccaneers,TB,2021,47,17,12,15,2,0,0,17,141.0,8.3,9,18.4,27,15.7,27,19.7,12,7.8,23,7.0,29,3.9
6,7,Indianapolis Colts,IND,2021,33,19,14,17,1,1,2,17,138.0,8.1,10,18.3,28,15.6,14,22.7,7,9.2,30,6.1,30,3.8
7,8,Kansas City Chiefs,KC,2021,31,15,14,18,4,0,0,17,131.0,7.7,2,20.3,20,17.3,15,22.7,15,7.4,28,6.2,25,4.6
8,9,Arizona Cardinals,ARI,2021,41,13,14,20,2,0,0,17,126.0,7.4,25,16.9,29,15.5,5,24.7,29,4.7,19,7.7,24,4.9
9,10,Minnesota Vikings,MIN,2021,51,16,8,13,2,0,2,17,124.0,7.3,6,18.8,9,19.6,1,26.9,21,6.8,6,9.0,28,4.0
