In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Import Quarterback performance dataset

Load data

In [2]:
career_stats_passing = pd.read_csv("Career_Stats_Passing.csv")

Collect data and clean data

In [3]:
qb_performance = career_stats_passing.copy()

In [4]:
qb_performance

Unnamed: 0,Player Id,Name,Position,Year,Team,Games Played,Passes Attempted,Passes Completed,Completion Percentage,Pass Attempts Per Game,...,TD Passes,Percentage of TDs per Attempts,Ints,Int Rate,Longest Pass,Passes Longer than 20 Yards,Passes Longer than 40 Yards,Sacks,Sacked Yards Lost,Passer Rating
0,tomfarris/2513861,"Farris, Tom",,1948,Chicago Rockets,0,--,--,--,0.0,...,--,--,--,--,--,--,--,--,--,0.0
1,tomfarris/2513861,"Farris, Tom",,1947,Chicago Bears,9,2,0,0.0,0.2,...,0,0.0,0,0.0,--,0,0,0,0,39.6
2,tomfarris/2513861,"Farris, Tom",,1946,Chicago Bears,11,21,8,38.1,1.9,...,1,4.8,2,9.5,--,0,0,0,0,31.5
3,billdemory/2512778,"Demory, Bill",,1974,New York Jets,1,--,--,--,0.0,...,--,--,--,--,--,--,--,--,--,0.0
4,billdemory/2512778,"Demory, Bill",,1973,New York Jets,6,39,12,30.8,6.5,...,2,5.1,8,20.5,--,0,0,8,73,22.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8520,billanderson/2508534,"Anderson, Bill",,1962,Washington Redskins,12,--,--,--,0.0,...,--,--,--,--,--,--,--,--,--,0.0
8521,billanderson/2508534,"Anderson, Bill",,1961,Washington Redskins,14,--,--,--,0.0,...,--,--,--,--,--,--,--,--,--,0.0
8522,billanderson/2508534,"Anderson, Bill",,1960,Washington Redskins,12,--,--,--,0.0,...,--,--,--,--,--,--,--,--,--,0.0
8523,billanderson/2508534,"Anderson, Bill",,1959,Washington Redskins,11,--,--,--,0.0,...,--,--,--,--,--,--,--,--,--,0.0


Convert columns to numeric

In [7]:
numerical_columns = ['Passes Completed', 'Passes Attempted', 'Passing Yards', 'TD Passes', 'Ints']
for col in numerical_columns:
    qb_performance[col] = pd.to_numeric(qb_performance[col], errors='coerce')

## Feature Engineering

Create new features that could potentially improve the model's performance

In [8]:
qb_performance['Completion Percentage'] = qb_performance['Passes Completed'] / qb_performance['Passes Attempted']
qb_performance['Yards per Attempt'] = qb_performance['Passing Yards'] / qb_performance['Passes Attempted']
qb_performance['Touchdown per Attempt'] = qb_performance['TD Passes'] / qb_performance['Passes Attempted']
qb_performance['Interception per Attempt'] = qb_performance['Ints'] / qb_performance['Passes Attempted']
qb_performance['Passing Efficiency'] = qb_performance['Completion Percentage'] * qb_performance['Yards per Attempt']
qb_performance['TD to INT Ratio'] = qb_performance['TD Passes'] / qb_performance['Ints']
qb_performance['TD to Completion Ratio'] = qb_performance['TD Passes'] / qb_performance['Passes Completed']
qb_performance['INT to Completion Ratio'] = qb_performance['Ints'] / qb_performance['Passes Completed']

Replace infinity values with NaN and fill any NaN values with 0

In [9]:
qb_performance.replace([np.inf, -np.inf], np.nan, inplace=True)

In [10]:
qb_performance.fillna(0, inplace=True)

Group by player and year to calculate the average performance over time

In [11]:
qb_performance_grouped = qb_performance.groupby(['Player Id', 'Year']).mean().reset_index()

In [12]:
qb_performance_grouped

Unnamed: 0,Player Id,Year,Games Played,Passes Attempted,Passes Completed,Completion Percentage,Pass Attempts Per Game,Passing Yards,TD Passes,Ints,Passer Rating,Yards per Attempt,Touchdown per Attempt,Interception per Attempt,Passing Efficiency,TD to INT Ratio,TD to Completion Ratio,INT to Completion Ratio
0,a.j.feeley/2504566,2001,1.0,14.0,10.0,0.714286,14.0,143.0,2.0,1.0,114.0,10.214286,0.142857,0.071429,7.295918,2.000000,0.200000,0.100000
1,a.j.feeley/2504566,2002,6.0,154.0,86.0,0.558442,25.7,0.0,6.0,5.0,75.4,0.000000,0.038961,0.032468,0.000000,1.200000,0.069767,0.058140
2,a.j.feeley/2504566,2003,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,a.j.feeley/2504566,2004,11.0,356.0,191.0,0.536517,32.4,0.0,11.0,15.0,61.7,0.000000,0.030899,0.042135,0.000000,0.733333,0.057592,0.078534
4,a.j.feeley/2504566,2005,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8291,yalelary/2519113,1960,12.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8292,yalelary/2519113,1961,14.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8293,yalelary/2519113,1962,14.0,1.0,0.0,0.000000,0.1,0.0,0.0,0.0,39.6,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8294,yalelary/2519113,1963,10.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


### Select features and target columns

In [33]:
feature_columns = ['Completion Percentage', 'Yards per Attempt', 'Touchdown per Attempt', 'Interception per Attempt',
                   'Passing Efficiency', 'TD to INT Ratio', 'TD to Completion Ratio', 'INT to Completion Ratio']
target_column = 'Passer Rating'

# Train Gradient Boosting Regressor

In [34]:
X = qb_performance[['Player Id'] + feature_columns]
y = qb_performance[target_column]

Split data into train and test sets

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Remove 'Player Id' from the training data

In [36]:
X_train = X_train.drop(columns=['Player Id'])

Store and remove 'Player Id' from the test data

In [37]:
player_ids_test = X_test['Player Id'].values
X_test = X_test.drop(columns=['Player Id'])

Scale features

In [38]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [39]:
gb_reg = GradientBoostingRegressor(random_state=42)
gb_reg.fit(X_train_scaled, y_train.values.ravel())

Predict and evaluate model

In [40]:
y_pred = gb_reg.predict(X_test_scaled)
y_pred

array([118.76947261,  83.28498868,   4.88227712, ...,   4.88227712,
       100.5325896 ,  52.6176504 ])

### Evaluate the model's performance

In [41]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

In [43]:
print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R-squared: {r2:.2f}")

Mean Squared Error: 91.75
Root Mean Squared Error: 9.58
R-squared: 0.95


### Display the first 10 predictions and actual passer ratings

In [46]:
player_id_to_name = qb_performance[['Player Id', 'Name']].set_index('Player Id').to_dict()['Name']
results_df = pd.DataFrame({"Player Id": player_ids_test,
                           "Name": [player_id_to_name[player_id] for player_id in player_ids_test],
                           "Actual Passer Rating": y_test.values.ravel(),
                           "Predicted Passer Rating": y_pred.ravel()})

In [47]:
results_df

Unnamed: 0,Player Id,Name,Actual Passer Rating,Predicted Passer Rating
0,joewashington/2528291,"Washington, Joe",118.8,118.769473
1,richgannon/2500754,"Gannon, Rich",79.8,83.284989
2,donmccauley/2520537,"McCauley, Don",0.0,4.882277
3,yalelary/2519113,"Lary, Yale",0.0,4.882277
4,demaryiusthomas/497328,"Thomas, Demaryius",0.0,4.882277
...,...,...,...,...
1700,vinceevans/2500575,"Evans, Vince",77.7,77.399439
1701,henrymarshall/2520201,"Marshall, Henry",0.0,4.882277
1702,bradmaynard/2501931,"Maynard, Brad",0.0,4.882277
1703,aaronrodgers/2506363,"Rodgers, Aaron",104.9,100.532590
