# Fantasy Football - An ML analysis of the past decade

### Base Aggregation Code

In [81]:
import pandas as pd

urls = [
    "https://www.pro-football-reference.com/years/2024/rushing.htm",
    "https://www.pro-football-reference.com/years/2024/receiving.htm"
]

#-----------------------------RUSHING DATA------------------------------------
# read_html returns one DataFrame per <table> on the page; take the first.
rudf = pd.read_html(urls[0])[0]

# Name the columns positionally. Assigning a flat list replaces whatever header
# read_html produced (single- or multi-level), so no separate flattening step
# is needed. A change in the page's column count raises a length-mismatch error.
rudf.columns = [
    "Rk", "Player", "Age", "Team", "Pos", "G", "GS",
    "Rush_Att", "Rush_Yds", "Rush_TD", "Rush_1D", "Rush_Succ%",
    "Rush_Lng", "Rush_Y/A", "Rush_Y/G", "Rush_A/G", "Fmb", "Awards"
]

#-----------------------------RECEIVING DATA---------------------------------
redf = pd.read_html(urls[1])[0]

redf.columns = [
    "Rk", "Player", "Age", "Team", "Pos", "G", "GS",
    "Tgt", "Rec", "Yds", "Y/R", "TD",
    "1D", "Succ%", "Lng", "R/G", "Y/G", "Ctch%", "Y/Tgt", "Fmb", "Awards"
]

# Keep only the receiving-specific stats plus the merge keys; the shared
# bio/games columns are taken from the rushing table instead.
redf = redf.drop(columns=["Rk", "Awards","Age", "Pos", "G", "GS", "Fmb"])

#-------------------------------MERGE TABLES---------------------------------
# Left join: every row of the rushing table is kept, receiving stats are
# attached where the same Player/Team pair exists.
df = pd.merge(rudf, redf, on=['Player', 'Team'], how='left')
df = df.drop(columns=['Rk'], errors ='ignore')

# Fill missing numeric stats (e.g. no matching receiving row) with 0 and
# remove the "League Average" summary row.
numeric_cols = df.select_dtypes(include=['number']).columns
df[numeric_cols] = df[numeric_cols].fillna(0.0)
# .copy() so the column assignments below write to an independent frame
# instead of a slice (avoids SettingWithCopyWarning).
df = df[df["Player"] != "League Average"].copy()


#Add Fantasy Points columns
# Scoring used here: 0.1 per rushing/receiving yard, 6 per TD, 1 per reception.
df['Fpts'] = (
    df['Rush_Yds'] * 0.1 +
    df['Yds'] * 0.1 +
    df['Rush_TD'] * 6 +
    df['TD'] * 6 +
    df['Rec'] * 1
)
df["Fppg"] = df["Fpts"] / df['G']

# Keep skill positions only, then the 125 highest-scoring rows, best first.
df = df[df['Pos'].isin(['RB','WR','TE'])]
df = df.sort_values('Fpts', ascending=False).head(125)
df

Unnamed: 0,Player,Age,Team,Pos,G,GS,Rush_Att,Rush_Yds,Rush_TD,Rush_1D,...,TD,1D,Succ%,Lng,R/G,Y/G,Ctch%,Y/Tgt,Fpts,Fppg
235,Ja'Marr Chase,24.0,CIN,WR,17.0,16.0,3.0,32.0,0.0,2.0,...,17.0,75.0,62.3,70.0,7.5,100.5,72.6,9.8,403.0,23.705882
10,Jahmyr Gibbs,22.0,DET,RB,17.0,4.0,250.0,1412.0,16.0,70.0,...,4.0,25.0,55.6,54.0,3.1,30.4,82.5,8.2,364.9,21.464706
0,Saquon Barkley,27.0,PHI,RB,16.0,16.0,345.0,2005.0,13.0,82.0,...,2.0,12.0,44.2,43.0,2.1,17.4,76.7,6.5,351.3,21.956250
3,Bijan Robinson,22.0,ATL,RB,17.0,17.0,304.0,1456.0,14.0,82.0,...,1.0,20.0,41.7,29.0,3.6,25.4,84.7,6.0,339.7,19.982353
1,Derrick Henry,30.0,BAL,RB,17.0,17.0,325.0,1921.0,16.0,94.0,...,2.0,11.0,68.2,27.0,1.1,11.4,86.4,8.8,338.4,19.905882
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238,Stefon Diggs,31.0,HOU,WR,8.0,8.0,3.0,8.0,1.0,1.0,...,3.0,31.0,51.6,49.0,5.9,62.0,73.4,7.8,121.4,15.175000
495,Tyler Lockett,32.0,SEA,WR,17.0,14.0,0.0,0.0,0.0,0.0,...,2.0,36.0,56.8,37.0,2.9,35.3,66.2,8.1,121.0,7.117647
325,Elijah Moore,24.0,CLE,WR,17.0,13.0,1.0,1.0,0.0,0.0,...,1.0,25.0,35.3,44.0,3.6,31.6,59.8,5.3,120.9,7.111765
377,Kayshon Boutte,22.0,NWE,WR,15.0,13.0,0.0,0.0,0.0,0.0,...,3.0,25.0,55.9,48.0,2.9,39.3,63.2,8.7,119.9,7.993333


## Aggregate Data from 2014-2024
Scraping, Feature Engineering, and Formatting 

In [83]:
#Aggregate data from 2014-2024
import time

# Positional column names for the two stat tables. A change in the number of
# columns on the site raises a length-mismatch error when they are assigned.
RUSHING_COLUMNS = [
    "Rk", "Player", "Age", "Team", "Pos", "G", "GS",
    "Rush_Att", "Rush_Yds", "Rush_TD", "Rush_1D", "Rush_Succ%",
    "Rush_Lng", "Rush_Y/A", "Rush_Y/G", "Rush_A/G", "Fmb", "Awards"
]
RECEIVING_COLUMNS = [
    "Rk", "Player", "Age", "Team", "Pos", "G", "GS",
    "Tgt", "Rec", "Yds", "Y/R", "TD",
    "1D", "Succ%", "Lng", "R/G", "Y/G", "Ctch%", "Y/Tgt", "Fmb", "Awards"
]

# Pause after every page fetch. This loop issues 22 requests in a row and
# Sports Reference publishes a per-minute request cap for automated traffic.
# NOTE(review): 3.5 s assumes a limit of 20 requests/minute - confirm against
# the site's current bot-traffic policy.
REQUEST_DELAY_SECONDS = 3.5


def _read_stat_table(url, column_names):
    """Fetch the first HTML table at *url* and name its columns positionally.

    Assigning the flat name list replaces whatever header read_html produced
    (single- or multi-level), so no separate flattening step is needed.
    Sleeps for REQUEST_DELAY_SECONDS after the fetch.
    """
    table = pd.read_html(url)[0]
    table.columns = list(column_names)
    time.sleep(REQUEST_DELAY_SECONDS)
    return table


all_seasons = []

for year in range(2014,2025):
    urls = [
    f"https://www.pro-football-reference.com/years/{year}/rushing.htm",
    f"https://www.pro-football-reference.com/years/{year}/receiving.htm"]

    #-----------------------------RUSHING DATA------------------------------------
    rudf = _read_stat_table(urls[0], RUSHING_COLUMNS)

    #-----------------------------RECEIVING DATA---------------------------------
    redf = _read_stat_table(urls[1], RECEIVING_COLUMNS)

    # Keep only the receiving-specific stats plus the merge keys; the shared
    # bio/games columns are taken from the rushing table instead.
    redf = redf.drop(columns=["Rk", "Awards","Age", "Pos", "G", "GS", "Fmb"])

    #-------------------------------MERGE TABLES---------------------------------
    # Left join keeps every rushing row. 'Awards' is intentionally left in:
    # the regression cells further down drop it by name.
    df = pd.merge(rudf, redf, on=['Player', 'Team'], how='left')
    df = df.drop(columns=['Rk'], errors ='ignore')

    # Fill missing numeric stats with 0 and remove the "League Average" row.
    numeric_cols = df.select_dtypes(include=['number']).columns
    df[numeric_cols] = df[numeric_cols].fillna(0.0)
    # .copy() so the column assignments below write to an independent frame
    # instead of a slice (avoids SettingWithCopyWarning).
    df = df[df["Player"] != "League Average"].copy()


    #Add Fantasy Points  and Season columns
    # Scoring used here: 0.1 per rushing/receiving yard, 6 per TD, 1 per reception.
    df['Fpts'] = (
        df['Rush_Yds'] * 0.1 +
        df['Yds'] * 0.1 +
        df['Rush_TD'] * 6 +
        df['TD'] * 6 +
        df['Rec'] * 1
    )
    df["Fppg"] = df["Fpts"] / df['G']

    df['Season'] = year

    # Keep skill positions only, then the season's 125 highest-scoring rows.
    df = df[df['Pos'].isin(['RB','WR','TE'])]
    df = df.sort_values('Fpts', ascending=False).head(125)

    #Add the df to the big list
    all_seasons.append(df)

# ignore_index=True already yields a fresh 0..n-1 index, so no reset is needed.
df_all = pd.concat(all_seasons, ignore_index=True)
df_all = df_all.sort_values('Fpts', ascending=False)

df_all.head(25)


Unnamed: 0,Player,Age,Team,Pos,G,GS,Rush_Att,Rush_Yds,Rush_TD,Rush_1D,...,1D,Succ%,Lng,R/G,Y/G,Ctch%,Y/Tgt,Fpts,Fppg,Season
625,Christian McCaffrey,23.0,CAR,RB,16.0,16.0,287.0,1387.0,15.0,57.0,...,58.0,59.9,28.0,7.3,62.8,81.7,7.1,469.2,29.325,2019
875,Cooper Kupp,28.0,LAR,WR,17.0,17.0,4.0,18.0,0.0,1.0,...,89.0,63.4,59.0,8.5,114.5,75.9,10.2,437.5,25.735294,2021
250,David Johnson,25.0,ARI,RB,16.0,16.0,293.0,1239.0,16.0,73.0,...,42.0,48.3,58.0,5.0,54.9,66.7,7.3,411.8,25.7375,2016
1125,CeeDee Lamb,24.0,DAL,WR,17.0,17.0,14.0,113.0,2.0,6.0,...,80.0,63.5,92.0,7.9,102.9,74.6,9.7,405.2,23.835294,2023
1250,Ja'Marr Chase,24.0,CIN,WR,17.0,16.0,3.0,32.0,0.0,2.0,...,75.0,62.3,70.0,7.5,100.5,72.6,9.8,403.0,23.705882,2024
1126,Christian McCaffrey,27.0,SFO,RB,16.0,16.0,272.0,1459.0,14.0,83.0,...,31.0,55.4,41.0,4.2,35.3,80.7,6.8,395.3,24.70625,2023
375,Todd Gurley,23.0,LAR,RB,15.0,15.0,279.0,1305.0,13.0,69.0,...,32.0,51.7,80.0,4.3,52.5,73.6,9.1,387.3,25.82,2017
500,Saquon Barkley,21.0,NYG,RB,16.0,16.0,261.0,1307.0,11.0,50.0,...,30.0,38.0,57.0,5.7,45.1,75.2,6.0,383.8,23.9875,2018
125,Antonio Brown,27.0,PIT,WR,16.0,16.0,3.0,28.0,0.0,1.0,...,84.0,56.0,59.0,8.5,114.6,70.5,9.5,382.2,23.8875,2015
501,Christian McCaffrey,22.0,CAR,RB,16.0,16.0,219.0,1098.0,7.0,53.0,...,41.0,55.6,38.0,6.7,54.2,86.3,7.0,381.5,23.84375,2018


In [86]:
#One-hot encode Team and Position: each category becomes its own 0/1 integer
#column (Team_XXX / Pos_XX), all other columns are carried over unchanged
categorical_cols = ['Team', 'Pos']
df_encoded = pd.get_dummies(data=df_all, columns=categorical_cols, dtype=int)
df_encoded.head(5)

Unnamed: 0,Player,Age,G,GS,Rush_Att,Rush_Yds,Rush_TD,Rush_1D,Rush_Succ%,Rush_Lng,...,Team_SDG,Team_SEA,Team_SFO,Team_STL,Team_TAM,Team_TEN,Team_WAS,Pos_RB,Pos_TE,Pos_WR
625,Christian McCaffrey,23.0,16.0,16.0,287.0,1387.0,15.0,57.0,46.3,84.0,...,0,0,0,0,0,0,0,1,0,0
875,Cooper Kupp,28.0,17.0,17.0,4.0,18.0,0.0,1.0,50.0,18.0,...,0,0,0,0,0,0,0,0,0,1
250,David Johnson,25.0,16.0,16.0,293.0,1239.0,16.0,73.0,48.8,58.0,...,0,0,0,0,0,0,0,1,0,0
1125,CeeDee Lamb,24.0,17.0,17.0,14.0,113.0,2.0,6.0,85.7,24.0,...,0,0,0,0,0,0,0,0,0,1
1250,Ja'Marr Chase,24.0,17.0,16.0,3.0,32.0,0.0,2.0,100.0,14.0,...,0,0,0,0,0,0,0,0,0,1


## Clustering with KMeans
Enter a player and a season to find comparable seasons from other players. These comparisons can help project a player's career trajectory from a single season's performance.

In [116]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans


#Isolate features to cluster by
# The one-hot position flags are appended to the hand-picked stat columns.
position_flags = [name for name in df_encoded.columns if name.startswith('Pos_')]
features_for_clustering = [
    'Age','Rush_Att', 'Rush_Yds', 'Rush_TD', 'Rush_Y/A',
    'Rec', 'Yds', 'TD', 'Y/R', 'Y/Tgt',
    'Fppg', 'Fpts'
] + position_flags

X = df_encoded[features_for_clustering]

# Standardise every feature before clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

#Fit kmeans
k = 120
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(X_scaled)
# Labels are attached by position.
# NOTE(review): this relies on df_encoded having the same row order as df_all - confirm.
df_all['Cluster'] = cluster_labels

# Per-cluster view of df_all, keyed by cluster id 0..k-1.
clusters_dict = {cid: df_all.loc[df_all['Cluster'] == cid] for cid in range(k)}


#Function to access season comparisons:
def get_comps(name, year):
    """Return the other player-seasons in the same cluster as (name, year).

    Looks the player-season up in the module-level ``df_all`` by exact match
    on 'Player' and 'Season'. If it is absent, prints a message and returns
    None. Otherwise returns the 'Player', 'Season', 'Fppg' and 'Age' columns
    of every other row sharing its 'Cluster' value, sorted by 'Fppg' from
    highest to lowest. When several rows match, the first one's cluster is used.
    """
    is_target = (df_all['Player'] == name) & (df_all['Season'] == year)
    matches = df_all[is_target]
    if matches.empty:
        print('Player/Season not found.')
        return None

    same_cluster = df_all['Cluster'] == matches['Cluster'].iloc[0]
    # Everything in the cluster except the queried player-season itself.
    peers = df_all[same_cluster & ~is_target]

    report = peers[['Player', 'Season', 'Fppg', 'Age']]
    return report.sort_values('Fppg', ascending=False)


#Test Clustering Function
# Names are matched exactly, so trim stray whitespace from the typed input.
namein = input('Player Name: ').strip()
yearin = int(input('Year: '))
top_comps = get_comps(namein, yearin)
# get_comps prints its own not-found message and returns None in that case;
# skip the summary rather than printing "Closest comps ... None".
if top_comps is not None:
    print(f'Closest comps to {yearin} {namein} are:\n{top_comps}')


Closest comps to 2024 Bucky Irving are:
                  Player  Season       Fppg   Age
913           Nick Chubb    2021  15.521429  26.0
925     David Montgomery    2021  15.307692  24.0
945      Elijah Mitchell    2021  15.000000  23.0
540         Chris Carson    2018  14.671429  24.0
148         Lamar Miller    2015  14.618750  24.0
406      Devonta Freeman    2017  14.442857  25.0
655            Joe Mixon    2019  14.087500  23.0
37         C.J. Anderson    2014  14.086667  23.0
33           Jeremy Hill    2014  13.431250  22.0
1184  Kenneth Walker III    2023  13.293333  23.0
285       Isaiah Crowell    2016  13.068750  23.0
291          Todd Gurley    2016  12.512500  22.0
1293       D'Andre Swift    2024  12.500000  25.0
1040      Travis Etienne    2022  12.300000  23.0
918     Javonte Williams    2021  12.170588  21.0
317          Jeremy Hill    2016  11.753333  24.0


In [125]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import math as m

# Baseline: linear regression of same-season Fppg on the remaining columns.
# Player/Awards are dropped as non-feature columns; Fppg/Fpts and the counting
# stats that enter the Fpts formula are dropped so the target is not handed to
# the model directly.
X = df_encoded.drop(columns=['Awards','Player','Fppg', 'Fpts', 'Rush_Yds', 'Yds', 'Rush_TD', 'TD', 'Rec'])
y = df_encoded['Fppg']

# NOTE(review): per-game columns such as 'Rush_Y/G', 'Y/G' and 'R/G' remain in
# X. Since Fppg = Fpts / G, they reconstruct most of the target, so a high R2
# here reflects that rather than predictive skill - confirm this is intended.

# Fixed seed so the split, and therefore the printed scores, are reproducible
# (matches the random_state used by the random-forest cell).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

rmse = m.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f'Rmse: {rmse:.2f}')
print(f'R2: {r2:.2f}')

Rmse: 1.04
R2: 0.92


## Predicting Next Season's Fantasy Performance from Past Seasons with Regression
Works best for NFL veterans: the lag features look back up to four seasons, so players with fewer recorded seasons have incomplete inputs.

In [155]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import math as m

#Order rows by player and season, then build per-player lagged copies of the
#key stats plus the next-season target
lag_features = ['Fppg', 'Rush_Yds', 'Rec', 'TD', 'Tgt', 'Yds', 'Rush_TD', 'Rush_Att']
n_lags = 4
df_lag = df_encoded.sort_values(['Player','Season']).reset_index(drop=True)

#Feature engineering relevant columns for regression
df_lag['Age_squared'] = df_lag['Age'] ** 2
team_flag_cols = [name for name in df_lag.columns if name.startswith('Team_')]
df_lag = df_lag.drop(columns=team_flag_cols)

# NOTE(review): shift() moves by rows within each player, not by calendar
# season. If a player has no row for some season, "lag1" comes from an earlier
# year and Fppg_next from a later one. Players are also keyed by name only.
for lag in range(1, n_lags+1):
    for col in lag_features:
        lagged_name = f'{col}_lag{lag}'
        df_lag[lagged_name] = df_lag.groupby('Player')[col].shift(lag)

# Derived features, all built from the previous rows' values only.
last_fppg = df_lag['Fppg_lag1']
prior_fppg = df_lag['Fppg_lag2']

df_lag['Fppg_growth'] = (last_fppg - prior_fppg) / prior_fppg
df_lag['Total_Touches'] = df_lag['Rec_lag1'] + df_lag['Rush_Att_lag1'] + df_lag['Tgt_lag1']
# The 0.01 in the denominators guards against division by zero.
df_lag['Pts_per_Touch'] = last_fppg / (df_lag['Total_Touches'] + 0.01)
df_lag['Fppg_diff'] = last_fppg - prior_fppg
df_lag['Target_Share'] = df_lag['Tgt_lag1'] / (df_lag['Tgt_lag1'] + df_lag['Rec_lag1'] + df_lag['Rush_Att_lag1'] + 0.01)
# NOTE(review): same formula as Fppg_growth, so the two columns are identical.
df_lag['Fppg_pct_change'] = (last_fppg - prior_fppg) / prior_fppg
# Target: the player's Fppg in their next recorded row.
df_lag['Fppg_next'] = df_lag.groupby('Player')['Fppg'].shift(-1)

#make a copy of df_lag before drops for 2025 predictions
df_features = df_lag.copy()

# Training rows need every lag column and the target to be present.
required_cols = [f'{col}_lag{lag}' for lag in range(1, n_lags+1) for col in lag_features]
required_cols.append('Fppg_next')
df_lag = df_lag.dropna(subset=required_cols)


# Fitting and Training a Model
# Everything except the identifier columns, the same-row fantasy totals and
# the target itself is used as a predictor.
non_feature_cols = ['Player','Awards','Fppg','Fppg_next','Fpts']
X = df_lag.drop(columns=non_feature_cols)
y = df_lag['Fppg_next']

# 80/20 random split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = RandomForestRegressor(n_estimators=200, max_depth=8, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Hold-out error and explained variance.
mse = mean_squared_error(y_test, y_pred)
rmse = m.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'Rmse: {rmse:.2f}')
print(f'R2: {r2:.2f}')

def predict_player(player_name, season, feature_cols = X_train.columns, model = model, df = df_features):
    """Return a sentence with the model's predicted Fppg for a player in *season*.

    The prediction is made from the player's row for ``season - 1`` in *df*,
    restricted to *feature_cols*. If no such row exists, a "No Data found"
    message is returned instead. The defaults for *feature_cols*, *model* and
    *df* are bound once, when this function is defined.
    """
    prior_season = season - 1
    is_prior_row = (df['Player'] == player_name) & (df['Season'] == prior_season)
    player_row = df[is_prior_row]

    if player_row.empty:
        return f'No Data found for {player_name} in {season}'

    # If several rows match, the first one's prediction is reported.
    predictions = model.predict(player_row[feature_cols])
    prediction = predictions[0]

    return f'Predicted Fantasy points per game for {player_name} in {season} is {prediction:.2f}'

# Example: predict the 2025 season from the player's 2024 row.
predict_player(player_name='Saquon Barkley', season=2025)


Rmse: 2.62
R2: 0.21


'Predicted Fantasy points per game for Saquon Barkley in 2025 is 17.68'